001 /* XmlParser.java --
002 Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
003 Portions Copyright 2006 Henri Sivonen.
004
005 This file is part of GNU JAXP.
006
007 GNU JAXP is free software; you can redistribute it and/or modify
008 it under the terms of the GNU General Public License as published by
009 the Free Software Foundation; either version 2, or (at your option)
010 any later version.
011
012 GNU JAXP is distributed in the hope that it will be useful, but
013 WITHOUT ANY WARRANTY; without even the implied warranty of
014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 General Public License for more details.
016
017 You should have received a copy of the GNU General Public License
018 along with GNU JAXP; see the file COPYING. If not, write to the
019 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
020 02111-1307 USA.
021
022 Linking this library statically or dynamically with other modules is
023 making a combined work based on this library. Thus, the terms and
024 conditions of the GNU General Public License cover the whole
025 combination.
026
027 As a special exception, the copyright holders of this library give you
028 permission to link this library with independent modules to produce an
029 executable, regardless of the license terms of these independent
030 modules, and to copy and distribute the resulting executable under
031 terms of your choice, provided that you also meet, for each linked
032 independent module, the terms and conditions of the license of that
033 module. An independent module is a module which is not derived from
034 or based on this library. If you modify this library, you may extend
035 this exception to your version of the library, but you are not
036 obligated to do so. If you do not wish to do so, delete this
037 exception statement from your version.
038
039 Partly derived from code which carried the following notice:
040
041 Copyright (c) 1997, 1998 by Microstar Software Ltd.
042
043 AElfred is free for both commercial and non-commercial use and
044 redistribution, provided that Microstar's copyright and disclaimer are
045 retained intact. You are free to modify AElfred for your own use and
046 to redistribute AElfred with your modifications, provided that the
047 modifications are clearly documented.
048
049 This program is distributed in the hope that it will be useful, but
050 WITHOUT ANY WARRANTY; without even the implied warranty of
051 merchantability or fitness for a particular purpose. Please use it AT
052 YOUR OWN RISK.
053 */
054
055 package nu.validator.gnu.xml.aelfred2;
056
import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;
072
073 import nu.validator.htmlparser.impl.CharacterHandler;
074 import nu.validator.htmlparser.impl.NormalizationChecker;
075 import nu.validator.io.EncodingInfo;
076
077 import org.xml.sax.InputSource;
078 import org.xml.sax.SAXException;
079
080 // Organized imports -- 2005-08-20 hsivonen
081
082 /**
083 * Parse XML documents and return parse events through call-backs. Use the
084 * <code>SAXDriver</code> class as your entry point, as all internal parser
085 * interfaces are subject to change.
086 *
087 * @author Written by David Megginson <dmeggins@microstar.com> (version
088 * 1.2a with bugfixes)
089 * @author Updated by David Brownell <dbrownell@users.sourceforge.net>
090 * @author Modified by Henri Sivonen <hsivonen@iki.fi>
091 * @see SAXDriver
092 */
093 final class XmlParser {
094
095 // avoid slow per-character readCh()
096 private final static boolean USE_CHEATS = false;
097
098 // //////////////////////////////////////////////////////////////////////
099 // Constants.
100 // //////////////////////////////////////////////////////////////////////
101
102 private static final int SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;
103
104 //
105 // Constants for element content type.
106 //
107
108 /**
109 * Constant: an element has not been declared.
110 *
111 * @see #getElementContentType
112 */
113 public final static int CONTENT_UNDECLARED = 0;
114
115 /**
116 * Constant: the element has a content model of ANY.
117 *
118 * @see #getElementContentType
119 */
120 public final static int CONTENT_ANY = 1;
121
122 /**
123 * Constant: the element has declared content of EMPTY.
124 *
125 * @see #getElementContentType
126 */
127 public final static int CONTENT_EMPTY = 2;
128
129 /**
130 * Constant: the element has mixed content.
131 *
132 * @see #getElementContentType
133 */
134 public final static int CONTENT_MIXED = 3;
135
136 /**
137 * Constant: the element has element content.
138 *
139 * @see #getElementContentType
140 */
141 public final static int CONTENT_ELEMENTS = 4;
142
143 //
144 // Constants for the entity type.
145 //
146
147 /**
148 * Constant: the entity has not been declared.
149 *
150 * @see #getEntityType
151 */
152 public final static int ENTITY_UNDECLARED = 0;
153
154 /**
155 * Constant: the entity is internal.
156 *
157 * @see #getEntityType
158 */
159 public final static int ENTITY_INTERNAL = 1;
160
161 /**
162 * Constant: the entity is external, non-parsable data.
163 *
164 * @see #getEntityType
165 */
166 public final static int ENTITY_NDATA = 2;
167
168 /**
169 * Constant: the entity is external XML data.
170 *
171 * @see #getEntityType
172 */
173 public final static int ENTITY_TEXT = 3;
174
175 //
176 // Attribute type constants are interned literal strings.
177 //
178
179 //
180 // Constants for attribute default value.
181 //
182
183 /**
184 * Constant: the attribute is not declared.
185 *
186 * @see #getAttributeDefaultValueType
187 */
188 public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
189
190 /**
191 * Constant: the attribute has a literal default value specified.
192 *
193 * @see #getAttributeDefaultValueType
194 * @see #getAttributeDefaultValue
195 */
196 public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
197
198 /**
199 * Constant: the attribute was declared #IMPLIED.
200 *
201 * @see #getAttributeDefaultValueType
202 */
203 public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
204
205 /**
206 * Constant: the attribute was declared #REQUIRED.
207 *
208 * @see #getAttributeDefaultValueType
209 */
210 public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
211
212 /**
213 * Constant: the attribute was declared #FIXED.
214 *
215 * @see #getAttributeDefaultValueType
216 * @see #getAttributeDefaultValue
217 */
218 public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
219
220 //
221 // Constants for input.
222 //
223 private final static int INPUT_NONE = 0;
224
225 private final static int INPUT_INTERNAL = 1;
226
227 private final static int INPUT_READER = 5;
228
229 //
230 // Flags for reading literals.
231 //
232 // expand general entity refs (attribute values in dtd and content)
233 private final static int LIT_ENTITY_REF = 2;
234
235 // normalize this value (space chars) (attributes, public ids)
236 private final static int LIT_NORMALIZE = 4;
237
238 // literal is an attribute value
239 private final static int LIT_ATTRIBUTE = 8;
240
241 // don't expand parameter entities
242 private final static int LIT_DISABLE_PE = 16;
243
244 // don't expand [or parse] character refs
245 private final static int LIT_DISABLE_CREF = 32;
246
247 // don't parse general entity refs
248 private final static int LIT_DISABLE_EREF = 64;
249
250 // literal is a public ID value
251 private final static int LIT_PUBID = 256;
252
253 //
254 // Flags affecting PE handling in DTDs (if expandPE is true).
255 // PEs expand with space padding, except inside literals.
256 //
257 private final static int CONTEXT_NORMAL = 0;
258
259 private final static int CONTEXT_LITERAL = 1;
260
261 // Emit warnings for relative URIs with no base URI.
262 static boolean uriWarnings;
263 static {
264 String key = "gnu.xml.aelfred2.XmlParser.uriWarnings";
265 try {
266 uriWarnings = "true".equals(System.getProperty(key));
267 } catch (SecurityException e) {
268 uriWarnings = false;
269 }
270 }
271
272 //
273 // The current XML handler interface.
274 //
private SAXDriver handler; // receives all parse events and error reports

//
// I/O information.
//
private Reader reader; // current reader

private InputStream is; // current input stream

private int line; // current line number

private int linePrev; // the line of the previous character -- hsivonen
// 2007-09-28

private int column; // current column number

private int columnPrev; // the column of the previous character -- hsivonen
// 2007-09-28

private boolean nextCharOnNewLine; // indicates whether the next character
// is on the next line -- hsivonen
// 2007-09-28

private int sourceType; // type of input source (one of the INPUT_* constants)

private LinkedList<Input> inputStack; // stack of input sources

private String characterEncoding; // current character encoding

private int currentByteCount; // bytes read from current source

private InputSource scratch; // temporary
307
308 //
309 // Buffers for decoded but unparsed character input.
310 //
private char[] readBuffer;

private int readBufferPos;

private int readBufferLength;

private int readBufferOverflow; // overflow from last data chunk.

//
// Buffer for undecoded raw byte input.
//
private final static int READ_BUFFER_MAX = 16384;

private byte[] rawReadBuffer;

//
// Buffer for attribute values, char refs, DTD stuff.
//
// NOTE(review): named like a constant but not declared final; no
// assignment is visible in this part of the file -- confirm before
// tightening.
private static int DATA_BUFFER_INITIAL = 4096;

private char[] dataBuffer;

private int dataBufferPos;

//
// Buffer for parsed names.
//
// NOTE(review): same remark as DATA_BUFFER_INITIAL -- non-final static.
private static int NAME_BUFFER_INITIAL = 1024;

private char[] nameBuffer;

private int nameBufferPos;
343
344 //
345 // Save any standalone flag
346 //
private boolean docIsStandalone; // set when the XML declaration says standalone="yes"

//
// Hashtables for DTD information on elements, entities, and notations.
// Populated until we start ignoring decls (because of skipping a PE)
//
private HashMap<String, ElementDecl> elementInfo;

private HashMap<String, EntityInfo> entityInfo;

private HashMap<String, String> notationInfo;

private boolean skippedPE;

//
// Element type currently in force.
//
private String currentElement;

private int currentElementContent;

//
// Stack of entity names, to detect recursion.
//
private LinkedList<String> entityStack;

//
// PE expansion is enabled in most chunks of the DTD, not all.
// When it's enabled, literals are treated differently.
//
private boolean inLiteral;

private boolean expandPE;

private boolean peIsError;

//
// can't report entity expansion inside two constructs:
// - attribute expansions (internal entities only)
// - markup declarations (parameter entities only)
//
private boolean doReport;
389
390 //
391 // Symbol table, for caching interned names.
392 //
393 // These show up wherever XML names or nmtokens are used: naming elements,
394 // attributes, PIs, notations, entities, and enumerated attribute values.
395 //
396 // NOTE: This hashtable doesn't grow. The default size is intended to be
397 // rather large for most documents. Example: one snapshot of the DocBook
398 // XML 4.1 DTD used only about 350 such names. As a rule, only pathological
399 // documents (ones that don't reuse names) should ever see much collision.
400 //
401 // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing.
402 // "2039" keeps the hash table size at about two memory pages on typical
403 // 32 bit hardware.
404 //
private final static int SYMBOL_TABLE_LENGTH = 2039;

private Object[][] symbolTable;

//
// Hash table of attributes found in current start tag.
//
private String[] tagAttributes;

private int tagAttributePos;

//
// Utility flag: have we noticed a CR while reading the last
// data chunk? If so, we will have to go back and normalise
// CR or CR/LF line ends.
//
private boolean sawCR;

//
// Utility flag: are we in CDATA? If so, whitespace isn't ignorable.
//
private boolean inCDATA;

//
// Xml version.
//
private static final int XML_10 = 0;

private static final int XML_11 = 1;

// The declaration parsers in this part of the file only ever accept "1.0".
private int xmlVersion = XML_10;

//
// Normalization checking
//

// Both may be null; parseDocument() calls end() on each non-null one
// once the end of input is reached.
private NormalizationChecker normalizationChecker;

private CharacterHandler characterHandler;
444
445 // ////////////////////////////////////////////////////////////////////
446 // Constructors.
447 // //////////////////////////////////////////////////////////////////////
448
449 /**
450 * Construct a new parser with no associated handler.
451 *
452 * @see #setHandler
453 * @see #parse
454 */
455 // package private
XmlParser() {
    // Nothing to do here; doParse() calls initializeVariables() before parsing.
}
458
459 /**
460 * Set the handler that will receive parsing events.
461 *
462 * @param handler
463 * The handler to receive callback events.
464 * @see #parse
465 */
466 // package private
void setHandler(SAXDriver handler) {
    // Must be set before doParse(), which rejects a null handler.
    this.handler = handler;
}
470
471 /**
472 * Parse an XML document from the character stream, byte stream, or URI that
473 * you provide (in that order of preference). Any URI that you supply will
474 * become the base URI for resolving relative URI, and may be used to
475 * acquire a reader or byte stream.
476 *
477 * <p>
478 * Only one thread at a time may use this parser; since it is private to
479 * this package, post-parse cleanup is done by the caller, which MUST NOT
480 * REUSE the parser (just null it).
481 *
482 * @param systemId
483 * Absolute URI of the document; should never be null, but may be
484 * so iff a reader <em>or</em> a stream is provided.
485 * @param publicId
486 * The public identifier of the document, or null.
487 * @param reader
488 * A character stream; must be null if stream isn't.
489 * @param stream
490 * A byte input stream; must be null if reader isn't.
 * @param encoding
 *            The suggested encoding, or null if unknown.
493 * @exception java.lang.Exception
494 * Basically SAXException or IOException
495 */
496 // package private
497 void doParse(String systemId, String publicId, Reader reader,
498 InputStream stream, String encoding) throws Exception {
499 if (handler == null) {
500 throw new IllegalStateException("no callback handler");
501 }
502
503 alreadyWarnedAboutPrivateUseCharacters = false;
504 initializeVariables();
505
506 // predeclare the built-in entities here (replacement texts)
507 // we don't need to intern(), since we're guaranteed literals
508 // are always (globally) interned.
509 setInternalEntity("amp", "&");
510 setInternalEntity("lt", "<");
511 setInternalEntity("gt", ">");
512 setInternalEntity("apos", "'");
513 setInternalEntity("quot", """);
514
515 try {
516 // pushURL first to ensure locator is correct in startDocument
517 // ... it might report an IO or encoding exception.
518 handler.startDocument();
519 pushURL(false, "[document]",
520 // default baseURI: null
521 new ExternalIdentifiers(publicId, systemId, null), reader,
522 stream, encoding, false);
523
524 parseDocument();
525 } catch (EOFException e) {
526 // empty input
527 fatal("empty document, with no root element.");
528 } finally {
529 if (reader != null) {
530 try {
531 reader.close();
532 } catch (IOException e) {
533 /* ignore */
534 }
535 }
536 if (stream != null) {
537 try {
538 stream.close();
539 } catch (IOException e) {
540 /* ignore */
541 }
542 }
543 if (is != null) {
544 try {
545 is.close();
546 } catch (IOException e) {
547 /* ignore */
548 }
549 }
550 }
551 }
552
553 // ////////////////////////////////////////////////////////////////////
554 // Error reporting.
555 // ////////////////////////////////////////////////////////////////////
556
557 /**
558 * Report an error.
559 *
560 * @param message
561 * The error message.
562 * @param textFound
563 * The text that caused the error (or null).
564 * @see SAXDriver#error
565 * @see #line
566 */
567 private void fatal(String message, String textFound, String textExpected)
568 throws SAXException {
569 // smart quotes -- 2005-08-20 hsivonen
570 if (textFound != null) {
571 message = message + " (found \u201C" + textFound + "\u201D)";
572 }
573 if (textExpected != null) {
574 message = message + " (expected \u201C" + textExpected + "\u201D)";
575 }
576 handler.fatal(message);
577
578 // "can't happen"
579 throw new SAXException(message);
580 }
581
582 /**
583 * Report a serious error.
584 *
585 * @param message
586 * The error message.
587 * @param textFound
588 * The text that caused the error (or null).
589 */
590 private void fatal(String message, char textFound, String textExpected)
591 throws SAXException {
592 fatal(message, new Character(textFound).toString(), textExpected);
593 }
594
595 /**
596 * Report typical case fatal errors.
597 */
private void fatal(String message) throws SAXException {
    // Pure delegation: unlike the three-argument overload, nothing is
    // thrown here if handler.fatal() returns normally.
    handler.fatal(message);
}
601
602 /**
603 * Report non-fatal errors.
604 */
private void err(String message) throws SAXException {
    // Forwarded to the driver's verror() callback.
    handler.verror(message);
}
608
609 // ////////////////////////////////////////////////////////////////////
610 // Major syntactic productions.
611 // ////////////////////////////////////////////////////////////////////
612
613 /**
614 * Parse an XML document.
615 *
616 * <pre>
617 * [1] document ::= prolog element Misc*
618 * </pre>
619 *
620 * <p>
621 * This is the top-level parsing function for a single XML document. As a
622 * minimum, a well-formed document must have a document element, and a valid
623 * document must have a prolog (one with doctype) as well.
624 */
625 private void parseDocument() throws Exception {
626 try { // added by MHK
627 boolean sawDTD = parseProlog();
628 require('<');
629 parseElement(!sawDTD);
630 } catch (EOFException ee) { // added by MHK
631 fatal("premature end of file", "[EOF]", null);
632 }
633
634 try {
635 parseMisc(); // skip all white, PIs, and comments
636 char c = readCh(); // if this doesn't throw an exception...
637 fatal("unexpected characters after document end", c, null);
638 } catch (EOFException e) {
639 if (characterHandler != null) {
640 characterHandler.end();
641 }
642 if (normalizationChecker != null) {
643 normalizationChecker.end();
644 }
645 return;
646 }
647 }
648
// Opening delimiter of a comment: "<!--".
static final char[] startDelimComment = { '<', '!', '-', '-' };

// "--" only: parseComment() reads up to this and then requires the '>'.
static final char[] endDelimComment = { '-', '-' };
652
653 /**
654 * Skip a comment.
655 *
656 * <pre>
657 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
658 * </pre>
659 *
660 * <p>
661 * (The <code><!--</code> has already been read.)
662 */
663 private void parseComment() throws Exception {
664 boolean saved = expandPE;
665
666 expandPE = false;
667 parseUntil(endDelimComment);
668 require('>');
669 expandPE = saved;
670 handler.comment(dataBuffer, 0, dataBufferPos);
671 dataBufferPos = 0;
672 }
673
// Opening delimiter of a processing instruction: "<?".
static final char[] startDelimPI = { '<', '?' };

// Closing delimiter of a processing instruction: "?>".
static final char[] endDelimPI = { '?', '>' };
677
678 /**
679 * Parse a processing instruction and do a call-back.
680 *
681 * <pre>
682 * [16] PI ::= '<?' PITarget
683 * (S (Char* - (Char* '?>' Char*)))?
684 * '?>'
 * [17] PITarget ::= Name - ( ('X'|'x') ('M'|'m') ('L'|'l') )
686 * </pre>
687 *
688 * <p>
689 * (The <code><?</code> has already been read.)
690 */
691 private void parsePI() throws SAXException, IOException {
692 String name;
693 boolean saved = expandPE;
694
695 expandPE = false;
696 name = readNmtoken(true);
697 // NE08
698 if (name.indexOf(':') >= 0) {
699 fatal("Illegal character(':') in processing instruction name ",
700 name, null);
701 }
702 if ("xml".equalsIgnoreCase(name)) {
703 fatal("Illegal processing instruction target", name, null);
704 }
705 if (!tryRead(endDelimPI)) {
706 requireWhitespace();
707 parseUntil(endDelimPI);
708 }
709 expandPE = saved;
710 handler.processingInstruction(name, dataBufferToString());
711 }
712
// Closing delimiter of a CDATA section: "]]>".
static final char[] endDelimCDATA = { ']', ']', '>' };

private boolean isDirtyCurrentElement;

// Reset to false at the start of every doParse() call.
private boolean alreadyWarnedAboutPrivateUseCharacters;

private char prev;
720
721 /**
722 * Parse a CDATA section.
723 *
724 * <pre>
725 * [18] CDSect ::= CDStart CData CDEnd
726 * [19] CDStart ::= '<![CDATA['
727 * [20] CData ::= (Char* - (Char* ']]>' Char*))
728 * [21] CDEnd ::= ']]>'
729 * </pre>
730 *
731 * <p>
732 * (The '<![CDATA[' has already been read.)
733 */
private void parseCDSect() throws Exception {
    // Read everything up to the closing "]]>" delimiter ...
    parseUntil(endDelimCDATA);
    // ... and flush the data buffer.
    dataBufferFlush();
}
738
739 /**
740 * Parse the prolog of an XML document.
741 *
742 * <pre>
743 * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
744 * </pre>
745 *
746 * <p>
747 * We do not look for the XML declaration here, because it was handled by
748 * pushURL ().
749 *
750 * @see pushURL
751 * @return true if a DTD was read.
752 */
753 private boolean parseProlog() throws Exception {
754 parseMisc();
755
756 if (tryRead("<!DOCTYPE")) {
757 parseDoctypedecl();
758 parseMisc();
759 return true;
760 }
761 return false;
762 }
763
764 private void checkLegalVersion(String version) throws SAXException {
765 int len = version.length();
766 for (int i = 0; i < len; i++) {
767 char c = version.charAt(i);
768 if ('0' <= c && c <= '9') {
769 continue;
770 }
771 if (c == '_' || c == '.' || c == ':' || c == '-') {
772 continue;
773 }
774 if ('a' <= c && c <= 'z') {
775 continue;
776 }
777 if ('A' <= c && c <= 'Z') {
778 continue;
779 }
780 fatal("illegal character in version", version, "1.0");
781 }
782 }
783
784 /**
785 * Parse the XML declaration.
786 *
787 * <pre>
788 * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
789 * [24] VersionInfo ::= S 'version' Eq
790 * ("'" VersionNum "'" | '"' VersionNum '"' )
791 * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
 * [32] SDDecl ::= S 'standalone' Eq
 *                 ( "'" ('yes' | 'no') "'" | '"' ('yes' | 'no') '"' )
 * [80] EncodingDecl ::= S 'encoding' Eq
 *                 ( '"' EncName '"' | "'" EncName "'" )
796 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
797 * </pre>
798 *
799 * <p>
800 * (The <code><?xml</code> and whitespace have already been read.)
801 *
802 * @return the encoding in the declaration, uppercased; or null
803 * @see #parseTextDecl
804 * @see #setupDecoding
805 */
806 private String parseXMLDecl(String encoding) throws SAXException,
807 IOException {
808 String version;
809 String encodingName = null;
810 String standalone = null;
811 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
812
813 // Read the version.
814 require("version");
815 parseEq();
816 checkLegalVersion(version = readLiteral(flags));
817 if (!version.equals("1.0")) {
818 if (version.equals("1.1")) {
819 fatal("XML 1.1 not supported."); // 2006-04-24 hsivonen
820 } else {
821 fatal("illegal XML version", version, "1.0"); // removed 1.1
822 // -- 2006-04-24
823 // hsivonen
824 }
825 } else {
826 xmlVersion = XML_10;
827 }
828 // Try reading an encoding declaration.
829 boolean white = tryWhitespace();
830
831 if (tryRead("encoding")) {
832 if (!white) {
833 fatal("whitespace required before 'encoding='");
834 }
835 parseEq();
836 encodingName = readLiteral(flags);
837 checkEncodingLiteral(encodingName); // 2006-04-28 hsivonen
838 if (reader == null) {
839 draconianInputStreamReader(encodingName, is, true);
840 } else {
841 checkEncodingMatch(encoding, encodingName);
842 }
843 }
844
845 // Try reading a standalone declaration
846 if (encodingName != null) {
847 white = tryWhitespace();
848 } else {
849 if (encoding == null) {
850 draconianInputStreamReader("UTF-8", is, false); // 2006-04-24
851 // hsivonen
852 }
853 warnAboutLackOfEncodingDecl(encoding);
854 }
855 if (tryRead("standalone")) {
856 if (!white) {
857 fatal("whitespace required before 'standalone='");
858 }
859 parseEq();
860 standalone = readLiteral(flags);
861 if ("yes".equals(standalone)) {
862 docIsStandalone = true;
863 } else if (!"no".equals(standalone)) {
864 fatal("standalone flag must be 'yes' or 'no'");
865 }
866 }
867
868 skipWhitespace();
869 require("?>");
870
871 return encodingName;
872 }
873
874 // hsivonen 2006-04-28
875 private void checkEncodingLiteral(String encodingName) throws SAXException {
876 if (encodingName == null) {
877 return;
878 }
879 if (encodingName.length() == 0) {
880 fatal("The empty string does not a legal encoding name.");
881 }
882 char c = encodingName.charAt(0);
883 if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
884 fatal("The encoding name must start with an ASCII letter.");
885 }
886 for (int i = 1; i < encodingName.length(); i++) {
887 c = encodingName.charAt(i);
888 if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
889 || (c >= '0' && c <= '9') || (c == '.') || (c == '_') || (c == '-'))) {
890 fatal("Illegal character in encoding name: U+"
891 + Integer.toHexString(c) + ".");
892 }
893 }
894 }
895
896 /**
897 * Parse a text declaration.
898 *
899 * <pre>
900 * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
901 * [80] EncodingDecl ::= S 'encoding' Eq
902 * ( '"' EncName '"' | "'" EncName "'" )
903 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
904 * </pre>
905 *
906 * <p>
 * (The <code>&lt;?xml</code> and whitespace have already been read.)
908 *
909 * @return the encoding in the declaration, uppercased; or null
910 * @see #parseXMLDecl
911 * @see #setupDecoding
912 */
913 private String parseTextDecl(String encoding) throws SAXException,
914 IOException {
915 String encodingName = null;
916 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
917
918 // Read an optional version.
919 if (tryRead("version")) {
920 String version;
921 parseEq();
922 checkLegalVersion(version = readLiteral(flags));
923 if (!version.equals("1.0")) {
924 if (version.equals("1.1")) {
925 fatal("XML 1.1 not supported."); // 2006-04-24 hsivonen
926 } else {
927 fatal("illegal XML version", version, "1.0"); // removed
928 // 1.1 --
929 // 2006-04-24
930 // hsivonen
931 }
932 }
933 requireWhitespace();
934 }
935
936 // Read the encoding.
937 require("encoding");
938 parseEq();
939 encodingName = readLiteral(flags);
940 checkEncodingLiteral(encodingName); // 2006-04-28 hsivonen
941 if (reader == null) {
942 draconianInputStreamReader(encodingName, is, true);
943 } else {
944 checkEncodingMatch(encoding, encodingName);
945 }
946 skipWhitespace();
947 require("?>");
948
949 return encodingName;
950 }
951
952 private void checkEncodingMatch(String used, String detected)
953 throws SAXException {
954 // method added -- 2006-02-03 hsivonen
955 if (used == null) {
956 if (!characterEncoding.equalsIgnoreCase(detected)) {
957 fatal(
958 "Declared character encoding was not the one sniffed from the BOM.",
959 detected, characterEncoding);
960 }
961 } else {
962 if (!"".equals(used) && !used.equalsIgnoreCase(detected)) {
963 handler.warn("External encoding information specified "
964 + used
965 + ", but XML declaration specified "
966 + detected
967 + ". Allowing external to override per RFC 3023. The well-formedness status of this document may change when decoupled from the external character encoding information.");
968 }
969 }
970 }
971
private void draconianInputStreamReader(String encoding,
        InputStream stream, boolean requireAsciiSuperset)
        throws SAXException, IOException {
    // Convenience overload: the name looked up and the name recorded as
    // the current character encoding are the same.
    draconianInputStreamReader(encoding, stream, requireAsciiSuperset,
            encoding);
}
978
979 private void draconianInputStreamReader(String encoding,
980 InputStream stream, boolean requireAsciiSuperset, String actualName)
981 throws SAXException, IOException {
982 // method added -- 2005-08-21 hsivonen
983 sourceType = INPUT_READER;
984 characterEncoding = actualName.toUpperCase();
985 encoding = encoding.toUpperCase();
986 try {
987 Charset cs = Charset.forName(encoding);
988 String canonName = cs.name();
989 if (requireAsciiSuperset) {
990 if (!EncodingInfo.isAsciiSuperset(canonName)) {
991 fatal("The encoding \u201C"
992 + encoding
993 + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration.");
994 }
995 }
996 if (canonName.startsWith("X-") || canonName.startsWith("x-")
997 || canonName.startsWith("Mac")) {
998 if (encoding.startsWith("X-")) {
999 err(encoding
1000 + " is not an IANA-registered encoding. (Charmod C022)");
1001 } else {
1002 err(encoding
1003 + "is not an IANA-registered encoding and did\u2019t start with \u201CX-\u201D. (Charmod C023)");
1004 }
1005 } else if (!canonName.equalsIgnoreCase(encoding)) {
1006 err(encoding
1007 + " is not the preferred name of the character encoding in use. The preferred name is "
1008 + canonName + ". (Charmod C024)");
1009 }
1010 if (!("UTF-8".equals(encoding) || "UTF-16".equals(encoding)
1011 || "UTF-16BE".equals(encoding)
1012 || "UTF-16LE".equals(encoding)
1013 || "ISO-8859-1".equals(encoding) || "US-ASCII".equals(encoding))) {
1014 handler.warn("XML processors are required to support the UTF-8 and UTF-16 character encodings. The encoding was "
1015 + actualName
1016 + " instead, which is an incompatibility risk.");
1017 }
1018 CharsetDecoder decoder = cs.newDecoder();
1019 decoder.onMalformedInput(CodingErrorAction.REPORT);
1020 decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
1021 this.reader = new InputStreamReader(stream, decoder);
1022 } catch (IllegalCharsetNameException e) {
1023 fatal("Illegal character encoding name: " + encoding);
1024 } catch (UnsupportedCharsetException e) {
1025 handler.fatal("Unsupported character encoding: " + encoding);
1026 }
1027 }
1028
1029 /**
1030 * Parse miscellaneous markup outside the document element and DOCTYPE
1031 * declaration.
1032 *
1033 * <pre>
1034 * [27] Misc ::= Comment | PI | S
1035 * </pre>
1036 */
1037 private void parseMisc() throws Exception {
1038 while (true) {
1039 skipWhitespace();
1040 if (tryRead(startDelimPI)) {
1041 parsePI();
1042 } else if (tryRead(startDelimComment)) {
1043 parseComment();
1044 } else {
1045 return;
1046 }
1047 }
1048 }
1049
1050 /**
1051 * Parse a document type declaration.
1052 *
1053 * <pre>
1054 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
1055 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
1056 * </pre>
1057 *
1058 * <p>
1059 * (The <code><!DOCTYPE</code> has already been read.)
1060 */
private void parseDoctypedecl() throws Exception {
    String rootName;
    ExternalIdentifiers ids;

    // Read the document type name.
    requireWhitespace();
    rootName = readNmtoken(true);

    // Read the External subset's IDs
    skipWhitespace();
    ids = readExternalIds(false, true);

    // report (a) declaration of name, (b) lexical info (ids)
    handler.doctypeDecl(rootName, ids.publicId, ids.systemId);

    // Internal subset is parsed first, if present
    skipWhitespace();
    if (tryRead('[')) {

        // loop until the subset ends
        while (true) {
            // PE expansion and reporting are on only while skipping the
            // whitespace between declarations.
            doReport = expandPE = true;
            skipWhitespace();
            doReport = expandPE = false;
            if (tryRead(']')) {
                break; // end of subset
            } else {
                // WFC, PEs in internal subset (only between decls)
                peIsError = expandPE = true;
                parseMarkupdecl();
                peIsError = expandPE = false;
            }
        }
    }
    skipWhitespace();
    require('>');

    // Read the external subset, if any
    InputSource subset;

    // With no system ID in the DOCTYPE, the handler may supply an
    // external subset of its own.
    if (ids.systemId == null) {
        subset = handler.getExternalSubset(rootName, handler.getSystemId());
    } else {
        subset = null;
    }
    if (ids.systemId != null || subset != null) {
        // Sentinel: the loop below stops when this '>' is read back.
        pushString(null, ">");

        // NOTE: [dtd] is so we say what SAX2 expects,
        // though it's misleading (subset, not entire dtd)
        if (ids.systemId != null) {
            pushURL(true, "[dtd]", ids, null, null, null, true);
        } else {
            handler.warn("modifying document by adding external subset");
            pushURL(true, "[dtd]", new ExternalIdentifiers(
                    subset.getPublicId(), subset.getSystemId(), null),
                    subset.getCharacterStream(), subset.getByteStream(),
                    subset.getEncoding(), false);
        }

        // Loop until we end up back at '>'
        while (true) {
            doReport = expandPE = true;
            skipWhitespace();
            doReport = expandPE = false;
            if (tryRead('>')) {
                break;
            } else {
                // unlike the internal subset, peIsError is not set here
                expandPE = true;
                parseMarkupdecl();
                expandPE = false;
            }
        }

        // the ">" string isn't popped yet
        if (inputStack.size() != 1) {
            fatal("external subset has unmatched '>'");
        }
    }

    // done dtd
    handler.endDoctype();
    expandPE = false;
    doReport = true;
}
1146
1147 /**
1148 * Parse a markup declaration in the internal or external DTD subset.
1149 *
1150 * <pre>
1151 * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
1152 * | NotationDecl | PI | Comment
1153 * [30] extSubsetDecl ::= (markupdecl | conditionalSect
1154 * | PEReference | S) *
1155 * </pre>
1156 *
1157 * <p>
1158 * Reading toplevel PE references is handled as a lexical issue by the
1159 * caller, as is whitespace.
1160 */
1161 private void parseMarkupdecl() throws Exception {
1162 char[] saved = null;
1163 boolean savedPE = expandPE;
1164
1165 // prevent "<%foo;" and ensures saved entity is right
1166 require('<');
1167 unread('<');
1168 expandPE = false;
1169
1170 if (tryRead("<!ELEMENT")) {
1171 saved = readBuffer;
1172 expandPE = savedPE;
1173 parseElementDecl();
1174 } else if (tryRead("<!ATTLIST")) {
1175 saved = readBuffer;
1176 expandPE = savedPE;
1177 parseAttlistDecl();
1178 } else if (tryRead("<!ENTITY")) {
1179 saved = readBuffer;
1180 expandPE = savedPE;
1181 parseEntityDecl();
1182 } else if (tryRead("<!NOTATION")) {
1183 saved = readBuffer;
1184 expandPE = savedPE;
1185 parseNotationDecl();
1186 } else if (tryRead(startDelimPI)) {
1187 saved = readBuffer;
1188 expandPE = savedPE;
1189 parsePI();
1190 } else if (tryRead(startDelimComment)) {
1191 saved = readBuffer;
1192 expandPE = savedPE;
1193 parseComment();
1194 } else if (tryRead("<![")) {
1195 saved = readBuffer;
1196 expandPE = savedPE;
1197 if (inputStack.size() > 0) {
1198 parseConditionalSect(saved);
1199 } else {
1200 fatal("conditional sections illegal in internal subset");
1201 }
1202 } else {
1203 fatal("expected markup declaration");
1204 }
1205
1206 // VC: Proper Decl/PE Nesting
1207 if (readBuffer != saved) {
1208 handler.verror("Illegal Declaration/PE nesting");
1209 }
1210 }
1211
1212 /**
1213 * Parse an element, with its tags.
1214 *
1215 * <pre>
1216 * [39] element ::= EmptyElementTag | STag content ETag
1217 * [40] STag ::= '<' Name (S Attribute)* S? '>'
1218 * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>'
1219 * </pre>
1220 *
1221 * <p>
1222 * (The '<' has already been read.)
1223 * <p>
1224 * NOTE: this method actually chains onto parseContent (), if necessary, and
1225 * parseContent () will take care of calling parseETag ().
1226 */
1227 private void parseElement(boolean maybeGetSubset) throws Exception {
1228 String gi;
1229 char c;
1230 int oldElementContent = currentElementContent;
1231 String oldElement = currentElement;
1232 ElementDecl element;
1233
1234 // This is the (global) counter for the
1235 // array of specified attributes.
1236 tagAttributePos = 0;
1237
1238 // Read the element type name.
1239 gi = readNmtoken(true);
1240
1241 // If we saw no DTD, and this is the document root element,
1242 // let the application modify the input stream by providing one.
1243 if (maybeGetSubset) {
1244 InputSource subset = handler.getExternalSubset(gi,
1245 handler.getSystemId());
1246 if (subset != null) {
1247 String publicId = subset.getPublicId();
1248 String systemId = subset.getSystemId();
1249
1250 handler.warn("modifying document by adding DTD");
1251 handler.doctypeDecl(gi, publicId, systemId);
1252 pushString(null, ">");
1253
1254 // NOTE: [dtd] is so we say what SAX2 expects,
1255 // though it's misleading (subset, not entire dtd)
1256 pushURL(true, "[dtd]", new ExternalIdentifiers(publicId,
1257 systemId, null), subset.getCharacterStream(),
1258 subset.getByteStream(), subset.getEncoding(), false);
1259
1260 // Loop until we end up back at '>'
1261 while (true) {
1262 doReport = expandPE = true;
1263 skipWhitespace();
1264 doReport = expandPE = false;
1265 if (tryRead('>')) {
1266 break;
1267 } else {
1268 expandPE = true;
1269 parseMarkupdecl();
1270 expandPE = false;
1271 }
1272 }
1273
1274 // the ">" string isn't popped yet
1275 if (inputStack.size() != 1) {
1276 fatal("external subset has unmatched '>'");
1277 }
1278
1279 handler.endDoctype();
1280 }
1281 }
1282
1283 // Determine the current content type.
1284 currentElement = gi;
1285 element = elementInfo.get(gi);
1286 currentElementContent = getContentType(element, CONTENT_ANY);
1287
1288 // Read the attributes, if any.
1289 // After this loop, "c" is the closing delimiter.
1290 boolean white = tryWhitespace();
1291 c = readCh();
1292 while (c != '/' && c != '>') {
1293 unread(c);
1294 if (!white) {
1295 fatal("need whitespace between attributes");
1296 }
1297 parseAttribute(gi);
1298 white = tryWhitespace();
1299 c = readCh();
1300 }
1301
1302 // Supply any defaulted attributes.
1303 Iterator<String> atts = declaredAttributes(element);
1304 if (atts != null) {
1305 String aname;
1306 loop: while (atts.hasNext()) {
1307 aname = atts.next();
1308 // See if it was specified.
1309 for (int i = 0; i < tagAttributePos; i++) {
1310 if (tagAttributes[i] == aname) {
1311 continue loop;
1312 }
1313 }
1314 // ... or has a default
1315 String value = getAttributeDefaultValue(gi, aname);
1316
1317 if (value == null) {
1318 continue;
1319 }
1320 handler.attribute(aname, value, false);
1321 }
1322 }
1323
1324 // Figure out if this is a start tag
1325 // or an empty element, and dispatch an
1326 // event accordingly.
1327 switch (c) {
1328 case '>':
1329 handler.startElement(gi);
1330 parseContent();
1331 break;
1332 case '/':
1333 require('>');
1334 handler.startElement(gi);
1335 handler.endElement(gi);
1336 break;
1337 }
1338
1339 // Restore the previous state.
1340 currentElement = oldElement;
1341 currentElementContent = oldElementContent;
1342 }
1343
1344 /**
1345 * Parse an attribute assignment.
1346 *
1347 * <pre>
1348 * [41] Attribute ::= Name Eq AttValue
1349 * </pre>
1350 *
1351 * @param name
1352 * The name of the attribute's element.
1353 * @see SAXDriver#attribute
1354 */
1355 private void parseAttribute(String name) throws Exception {
1356 String aname;
1357 String type;
1358 String value;
1359 int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF;
1360
1361 // Read the attribute name.
1362 aname = readNmtoken(true);
1363 type = getAttributeType(name, aname);
1364
1365 // Parse '='
1366 parseEq();
1367
1368 // Read the value, normalizing whitespace
1369 // unless it is CDATA.
1370 if (handler.stringInterning) {
1371 if (type == "CDATA" || type == null) {
1372 value = readLiteral(flags);
1373 } else {
1374 value = readLiteral(flags | LIT_NORMALIZE);
1375 }
1376 } else {
1377 if (type.equals("CDATA") || type == null) {
1378 value = readLiteral(flags);
1379 } else {
1380 value = readLiteral(flags | LIT_NORMALIZE);
1381 }
1382 }
1383
1384 // WFC: no duplicate attributes
1385 for (int i = 0; i < tagAttributePos; i++) {
1386 if (aname.equals(tagAttributes[i])) {
1387 fatal("duplicate attribute", aname, null);
1388 }
1389 }
1390
1391 // Inform the handler about the
1392 // attribute.
1393 handler.attribute(aname, value, true);
1394 dataBufferPos = 0;
1395
1396 // Note that the attribute has been
1397 // specified.
1398 if (tagAttributePos == tagAttributes.length) {
1399 String newAttrib[] = new String[tagAttributes.length * 2];
1400 System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
1401 tagAttributes = newAttrib;
1402 }
1403 tagAttributes[tagAttributePos++] = aname;
1404 }
1405
1406 /**
1407 * Parse an equals sign surrounded by optional whitespace.
1408 *
1409 * <pre>
1410 * [25] Eq ::= S? '=' S?
1411 * </pre>
1412 */
1413 private void parseEq() throws SAXException, IOException {
1414 skipWhitespace();
1415 require('=');
1416 skipWhitespace();
1417 }
1418
1419 /**
1420 * Parse an end tag.
1421 *
1422 * <pre>
1423 * [42] ETag ::= '</' Name S? '>'
1424 * </pre>
1425 *
1426 * <p>
1427 * NOTE: parseContent () chains to here, we already read the "</".
1428 */
1429 private void parseETag() throws Exception {
1430 require(currentElement);
1431 skipWhitespace();
1432 require('>');
1433 handler.endElement(currentElement);
1434 // not re-reporting any SAXException re bogus end tags,
1435 // even though that diagnostic might be clearer ...
1436 }
1437
1438 /**
1439 * Parse the content of an element.
1440 *
1441 * <pre>
1442 * [43] content ::= (element | CharData | Reference
1443 * | CDSect | PI | Comment)*
1444 * [67] Reference ::= EntityRef | CharRef
1445 * </pre>
1446 *
1447 * <p>
1448 * NOTE: consumes ETtag.
1449 */
1450 private void parseContent() throws Exception {
1451 char c;
1452
1453 while (true) {
1454 // consume characters (or ignorable whitspace) until delimiter
1455 parseCharData();
1456
1457 // Handle delimiters
1458 c = readCh();
1459 switch (c) {
1460 case '&': // Found "&"
1461 c = readCh();
1462 if (c == '#') {
1463 parseCharRef();
1464 } else {
1465 unread(c);
1466 parseEntityRef(true);
1467 }
1468 isDirtyCurrentElement = true;
1469 break;
1470
1471 case '<': // Found "<"
1472 dataBufferFlush();
1473 c = readCh();
1474 switch (c) {
1475 case '!': // Found "<!"
1476 c = readCh();
1477 switch (c) {
1478 case '-': // Found "<!-"
1479 require('-');
1480 isDirtyCurrentElement = false;
1481 parseComment();
1482 break;
1483 case '[': // Found "<!["
1484 isDirtyCurrentElement = false;
1485 require("CDATA[");
1486 handler.startCDATA();
1487 inCDATA = true;
1488 parseCDSect();
1489 inCDATA = false;
1490 handler.endCDATA();
1491 break;
1492 default:
1493 fatal("expected comment or CDATA section",
1494 c, null);
1495 break;
1496 }
1497 break;
1498
1499 case '?': // Found "<?"
1500 isDirtyCurrentElement = false;
1501 parsePI();
1502 break;
1503
1504 case '/': // Found "</"
1505 isDirtyCurrentElement = false;
1506 parseETag();
1507 return;
1508
1509 default: // Found "<" followed by something else
1510 isDirtyCurrentElement = false;
1511 unread(c);
1512 parseElement(false);
1513 break;
1514 }
1515 }
1516 }
1517 }
1518
1519 /**
1520 * Parse an element type declaration.
1521 *
1522 * <pre>
1523 * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
1524 * </pre>
1525 *
1526 * <p>
1527 * NOTE: the '<!ELEMENT' has already been read.
1528 */
1529 private void parseElementDecl() throws Exception {
1530 String name;
1531
1532 requireWhitespace();
1533 // Read the element type name.
1534 name = readNmtoken(true);
1535
1536 requireWhitespace();
1537 // Read the content model.
1538 parseContentspec(name);
1539
1540 skipWhitespace();
1541 require('>');
1542 }
1543
1544 /**
1545 * Content specification.
1546 *
1547 * <pre>
1548 * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1549 * </pre>
1550 */
1551 private void parseContentspec(String name) throws Exception {
1552 // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ...
1553 if (tryRead("EMPTY")) {
1554 setElement(name, CONTENT_EMPTY, null, null);
1555 if (!skippedPE) {
1556 handler.getDeclHandler().elementDecl(name, "EMPTY");
1557 }
1558 return;
1559 } else if (tryRead("ANY")) {
1560 setElement(name, CONTENT_ANY, null, null);
1561 if (!skippedPE) {
1562 handler.getDeclHandler().elementDecl(name, "ANY");
1563 }
1564 return;
1565 } else {
1566 String model;
1567 char[] saved;
1568
1569 require('(');
1570 saved = readBuffer;
1571 dataBufferAppend('(');
1572 skipWhitespace();
1573 if (tryRead("#PCDATA")) {
1574 dataBufferAppend("#PCDATA");
1575 parseMixed(saved);
1576 model = dataBufferToString();
1577 setElement(name, CONTENT_MIXED, model, null);
1578 } else {
1579 parseElements(saved);
1580 model = dataBufferToString();
1581 setElement(name, CONTENT_ELEMENTS, model, null);
1582 }
1583 if (!skippedPE) {
1584 handler.getDeclHandler().elementDecl(name, model);
1585 }
1586 }
1587 }
1588
1589 /**
1590 * Parse an element-content model.
1591 *
1592 * <pre>
1593 * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1594 * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1595 * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1596 * </pre>
1597 *
1598 * <p>
1599 * NOTE: the opening '(' and S have already been read.
1600 *
1601 * @param saved
1602 * Buffer for entity that should have the terminal ')'
1603 */
1604 private void parseElements(char[] saved) throws Exception {
1605 char c;
1606 char sep;
1607
1608 // Parse the first content particle
1609 skipWhitespace();
1610 parseCp();
1611
1612 // Check for end or for a separator.
1613 skipWhitespace();
1614 c = readCh();
1615 switch (c) {
1616 case ')':
1617 // VC: Proper Group/PE Nesting
1618 if (readBuffer != saved) {
1619 handler.verror("Illegal Group/PE nesting");
1620 }
1621
1622 dataBufferAppend(')');
1623 c = readCh();
1624 switch (c) {
1625 case '*':
1626 case '+':
1627 case '?':
1628 dataBufferAppend(c);
1629 break;
1630 default:
1631 unread(c);
1632 }
1633 return;
1634 case ',': // Register the separator.
1635 case '|':
1636 sep = c;
1637 dataBufferAppend(c);
1638 break;
1639 default:
1640 fatal("bad separator in content model", c, null);
1641 return;
1642 }
1643
1644 // Parse the rest of the content model.
1645 while (true) {
1646 skipWhitespace();
1647 parseCp();
1648 skipWhitespace();
1649 c = readCh();
1650 if (c == ')') {
1651 // VC: Proper Group/PE Nesting
1652 if (readBuffer != saved) {
1653 handler.verror("Illegal Group/PE nesting");
1654 }
1655
1656 dataBufferAppend(')');
1657 break;
1658 } else if (c != sep) {
1659 fatal("bad separator in content model", c, null);
1660 return;
1661 } else {
1662 dataBufferAppend(c);
1663 }
1664 }
1665
1666 // Check for the occurrence indicator.
1667 c = readCh();
1668 switch (c) {
1669 case '?':
1670 case '*':
1671 case '+':
1672 dataBufferAppend(c);
1673 return;
1674 default:
1675 unread(c);
1676 return;
1677 }
1678 }
1679
1680 /**
1681 * Parse a content particle.
1682 *
1683 * <pre>
1684 * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1685 * </pre>
1686 */
1687 private void parseCp() throws Exception {
1688 if (tryRead('(')) {
1689 dataBufferAppend('(');
1690 parseElements(readBuffer);
1691 } else {
1692 dataBufferAppend(readNmtoken(true));
1693 char c = readCh();
1694 switch (c) {
1695 case '?':
1696 case '*':
1697 case '+':
1698 dataBufferAppend(c);
1699 break;
1700 default:
1701 unread(c);
1702 break;
1703 }
1704 }
1705 }
1706
1707 /**
1708 * Parse mixed content.
1709 *
1710 * <pre>
1711 * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1712 * | '(' S? ('#PCDATA') S? ')'
1713 * </pre>
1714 *
1715 * @param saved
1716 * Buffer for entity that should have the terminal ')'
1717 */
1718 private void parseMixed(char[] saved) throws Exception {
1719 // Check for PCDATA alone.
1720 skipWhitespace();
1721 if (tryRead(')')) {
1722 // VC: Proper Group/PE Nesting
1723 if (readBuffer != saved) {
1724 handler.verror("Illegal Group/PE nesting");
1725 }
1726
1727 dataBufferAppend(")*");
1728 tryRead('*');
1729 return;
1730 }
1731
1732 // Parse mixed content.
1733 skipWhitespace();
1734 while (!tryRead(")")) {
1735 require('|');
1736 dataBufferAppend('|');
1737 skipWhitespace();
1738 dataBufferAppend(readNmtoken(true));
1739 skipWhitespace();
1740 }
1741
1742 // VC: Proper Group/PE Nesting
1743 if (readBuffer != saved) {
1744 handler.verror("Illegal Group/PE nesting");
1745 }
1746
1747 require('*');
1748 dataBufferAppend(")*");
1749 }
1750
1751 /**
1752 * Parse an attribute list declaration.
1753 *
1754 * <pre>
1755 * [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
1756 * </pre>
1757 *
1758 * <p>
1759 * NOTE: the '<!ATTLIST' has already been read.
1760 */
1761 private void parseAttlistDecl() throws Exception {
1762 String elementName;
1763
1764 requireWhitespace();
1765 elementName = readNmtoken(true);
1766 boolean white = tryWhitespace();
1767 while (!tryRead('>')) {
1768 if (!white) {
1769 fatal("whitespace required before attribute definition");
1770 }
1771 parseAttDef(elementName);
1772 white = tryWhitespace();
1773 }
1774 }
1775
1776 /**
1777 * Parse a single attribute definition.
1778 *
1779 * <pre>
1780 * [53] AttDef ::= S Name S AttType S DefaultDecl
1781 * </pre>
1782 */
1783 private void parseAttDef(String elementName) throws Exception {
1784 String name;
1785 String type;
1786 String enumer = null;
1787
1788 // Read the attribute name.
1789 name = readNmtoken(true);
1790
1791 // Read the attribute type.
1792 requireWhitespace();
1793 type = readAttType();
1794
1795 // Get the string of enumerated values if necessary.
1796 if (handler.stringInterning) {
1797 if ("ENUMERATION" == type || "NOTATION" == type) {
1798 enumer = dataBufferToString();
1799 }
1800 } else {
1801 if ("ENUMERATION".equals(type) || "NOTATION".equals(type)) {
1802 enumer = dataBufferToString();
1803 }
1804 }
1805
1806 // Read the default value.
1807 requireWhitespace();
1808 parseDefault(elementName, name, type, enumer);
1809 }
1810
1811 /**
1812 * Parse the attribute type.
1813 *
1814 * <pre>
1815 * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1816 * [55] StringType ::= 'CDATA'
1817 * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1818 * | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1819 * [57] EnumeratedType ::= NotationType | Enumeration
1820 * </pre>
1821 */
1822 private String readAttType() throws Exception {
1823 if (tryRead('(')) {
1824 parseEnumeration(false);
1825 return "ENUMERATION";
1826 } else {
1827 String typeString = readNmtoken(true);
1828 if (handler.stringInterning) {
1829 if ("NOTATION" == typeString) {
1830 parseNotationType();
1831 return typeString;
1832 } else if ("CDATA" == typeString || "ID" == typeString
1833 || "IDREF" == typeString || "IDREFS" == typeString
1834 || "ENTITY" == typeString || "ENTITIES" == typeString
1835 || "NMTOKEN" == typeString || "NMTOKENS" == typeString) {
1836 return typeString;
1837 }
1838 } else {
1839 if ("NOTATION".equals(typeString)) {
1840 parseNotationType();
1841 return typeString;
1842 } else if ("CDATA".equals(typeString)
1843 || "ID".equals(typeString)
1844 || "IDREF".equals(typeString)
1845 || "IDREFS".equals(typeString)
1846 || "ENTITY".equals(typeString)
1847 || "ENTITIES".equals(typeString)
1848 || "NMTOKEN".equals(typeString)
1849 || "NMTOKENS".equals(typeString)) {
1850 return typeString;
1851 }
1852 }
1853 fatal("illegal attribute type", typeString, null);
1854 return null;
1855 }
1856 }
1857
1858 /**
1859 * Parse an enumeration.
1860 *
1861 * <pre>
1862 * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1863 * </pre>
1864 *
1865 * <p>
1866 * NOTE: the '(' has already been read.
1867 */
1868 private void parseEnumeration(boolean isNames) throws Exception {
1869 dataBufferAppend('(');
1870
1871 // Read the first token.
1872 skipWhitespace();
1873 dataBufferAppend(readNmtoken(isNames));
1874 // Read the remaining tokens.
1875 skipWhitespace();
1876 while (!tryRead(')')) {
1877 require('|');
1878 dataBufferAppend('|');
1879 skipWhitespace();
1880 dataBufferAppend(readNmtoken(isNames));
1881 skipWhitespace();
1882 }
1883 dataBufferAppend(')');
1884 }
1885
1886 /**
1887 * Parse a notation type for an attribute.
1888 *
1889 * <pre>
1890 * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
1891 * (S? '|' S? name)* S? ')'
1892 * </pre>
1893 *
1894 * <p>
1895 * NOTE: the 'NOTATION' has already been read
1896 */
1897 private void parseNotationType() throws Exception {
1898 requireWhitespace();
1899 require('(');
1900
1901 parseEnumeration(true);
1902 }
1903
1904 /**
1905 * Parse the default value for an attribute.
1906 *
1907 * <pre>
1908 * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
1909 * | (('#FIXED' S)? AttValue)
1910 * </pre>
1911 */
1912 private void parseDefault(String elementName, String name, String type,
1913 String enumer) throws Exception {
1914 int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1915 String value = null;
1916 int flags = LIT_ATTRIBUTE;
1917 boolean saved = expandPE;
1918 String defaultType = null;
1919
1920 // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace
1921 // chars to spaces (doesn't matter when that's done if it doesn't
1922 // interfere with char refs expanding to whitespace).
1923
1924 if (!skippedPE) {
1925 flags |= LIT_ENTITY_REF;
1926 if (handler.stringInterning) {
1927 if ("CDATA" != type) {
1928 flags |= LIT_NORMALIZE;
1929 }
1930 } else {
1931 if (!"CDATA".equals(type)) {
1932 flags |= LIT_NORMALIZE;
1933 }
1934 }
1935 }
1936
1937 expandPE = false;
1938 if (tryRead('#')) {
1939 if (tryRead("FIXED")) {
1940 defaultType = "#FIXED";
1941 valueType = ATTRIBUTE_DEFAULT_FIXED;
1942 requireWhitespace();
1943 value = readLiteral(flags);
1944 } else if (tryRead("REQUIRED")) {
1945 defaultType = "#REQUIRED";
1946 valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1947 } else if (tryRead("IMPLIED")) {
1948 defaultType = "#IMPLIED";
1949 valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1950 } else {
1951 fatal("illegal keyword for attribute default value");
1952 }
1953 } else {
1954 value = readLiteral(flags);
1955 }
1956 expandPE = saved;
1957 setAttribute(elementName, name, type, enumer, value, valueType);
1958 if (handler.stringInterning) {
1959 if ("ENUMERATION" == type) {
1960 type = enumer;
1961 } else if ("NOTATION" == type) {
1962 type = "NOTATION " + enumer;
1963 }
1964 } else {
1965 if ("ENUMERATION".equals(type)) {
1966 type = enumer;
1967 } else if ("NOTATION".equals(type)) {
1968 type = "NOTATION " + enumer;
1969 }
1970 }
1971 if (!skippedPE) {
1972 handler.getDeclHandler().attributeDecl(elementName, name, type,
1973 defaultType, value);
1974 }
1975 }
1976
1977 /**
1978 * Parse a conditional section.
1979 *
1980 * <pre>
1981 * [61] conditionalSect ::= includeSect || ignoreSect
1982 * [62] includeSect ::= '<![' S? 'INCLUDE' S? '['
1983 * extSubsetDecl ']]>'
1984 * [63] ignoreSect ::= '<![' S? 'IGNORE' S? '['
1985 * ignoreSectContents* ']]>'
1986 * [64] ignoreSectContents ::= Ignore
1987 * ('<![' ignoreSectContents* ']]>' Ignore )*
1988 * [65] Ignore ::= Char* - (Char* ( '<![' | ']]>') Char* )
1989 * </pre>
1990 *
1991 * <p>
1992 * NOTE: the '>![' has already been read.
1993 */
1994 private void parseConditionalSect(char[] saved) throws Exception {
1995 skipWhitespace();
1996 if (tryRead("INCLUDE")) {
1997 skipWhitespace();
1998 require('[');
1999 // VC: Proper Conditional Section/PE Nesting
2000 if (readBuffer != saved) {
2001 handler.verror("Illegal Conditional Section/PE nesting");
2002 }
2003 skipWhitespace();
2004 while (!tryRead("]]>")) {
2005 parseMarkupdecl();
2006 skipWhitespace();
2007 }
2008 } else if (tryRead("IGNORE")) {
2009 skipWhitespace();
2010 require('[');
2011 // VC: Proper Conditional Section/PE Nesting
2012 if (readBuffer != saved) {
2013 handler.verror("Illegal Conditional Section/PE nesting");
2014 }
2015 char c;
2016 expandPE = false;
2017 for (int nest = 1; nest > 0;) {
2018 c = readCh();
2019 switch (c) {
2020 case '<':
2021 if (tryRead("![")) {
2022 nest++;
2023 }
2024 case ']':
2025 if (tryRead("]>")) {
2026 nest--;
2027 }
2028 }
2029 }
2030 expandPE = true;
2031 } else {
2032 fatal("conditional section must begin with INCLUDE or IGNORE");
2033 }
2034 }
2035
2036 private void parseCharRef() throws SAXException, IOException {
2037 parseCharRef(true /* do flushDataBuffer by default */);
2038 }
2039
2040 /**
2041 * Try to read a character reference without consuming data from buffer.
2042 *
2043 * <pre>
2044 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2045 * </pre>
2046 *
2047 * <p>
2048 * NOTE: the '&#' has already been read.
2049 */
2050 private void tryReadCharRef() throws SAXException, IOException {
2051 int value = 0;
2052 char c;
2053
2054 if (tryRead('x')) {
2055 loop1: while (true) {
2056 c = readCh();
2057 if (c == ';') {
2058 break loop1;
2059 } else {
2060 int n = Character.digit(c, 16);
2061 if (n == -1) {
2062 fatal("illegal character in character reference", c,
2063 null);
2064 break loop1;
2065 }
2066 value *= 16;
2067 value += n;
2068 }
2069 }
2070 } else {
2071 loop2: while (true) {
2072 c = readCh();
2073 if (c == ';') {
2074 break loop2;
2075 } else {
2076 int n = Character.digit(c, 10);
2077 if (n == -1) {
2078 fatal("illegal character in character reference", c,
2079 null);
2080 break loop2;
2081 }
2082 value *= 10;
2083 value += n;
2084 }
2085 }
2086 }
2087
2088 // check for character refs being legal XML
2089 if ((value < 0x0020 && !(value == '\n' || value == '\t' || value == '\r'))
2090 || (value >= 0xD800 && value <= 0xDFFF)
2091 || value == 0xFFFE
2092 || value == 0xFFFF || value > 0x0010ffff) {
2093 fatal("illegal XML character reference U+"
2094 + Integer.toHexString(value));
2095 } else if (value >= 0x007F && value <= 0x009F) // 2006-11-13 hsivonen
2096 {
2097 handler.warn("Character reference expands to a control character: U+00"
2098 + Integer.toHexString(c) + ".");
2099 }
2100 if (isPrivateUse(value)) {
2101 warnAboutPrivateUseChar();
2102 }
2103 // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2104 // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2105 if (value > 0x0010ffff) {
2106 // too big for surrogate
2107 fatal("character reference " + value + " is too large for UTF-16",
2108 new Integer(value).toString(), null);
2109 }
2110
2111 }
2112
2113 /**
2114 * Read and interpret a character reference.
2115 *
2116 * <pre>
2117 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2118 * </pre>
2119 *
2120 * <p>
2121 * NOTE: the '&#' has already been read.
2122 */
2123 private void parseCharRef(boolean doFlush) throws SAXException, IOException {
2124 int value = 0;
2125 char c;
2126
2127 if (tryRead('x')) {
2128 loop1: while (true) {
2129 c = readCh();
2130 if (c == ';') {
2131 break loop1;
2132 } else {
2133 int n = Character.digit(c, 16);
2134 if (n == -1) {
2135 fatal("illegal character in character reference", c,
2136 null);
2137 break loop1;
2138 }
2139 value *= 16;
2140 value += n;
2141 }
2142 }
2143 } else {
2144 loop2: while (true) {
2145 c = readCh();
2146 if (c == ';') {
2147 break loop2;
2148 } else {
2149 int n = Character.digit(c, 10);
2150 if (n == -1) {
2151 fatal("illegal character in character reference", c,
2152 null);
2153 break loop2;
2154 }
2155 value *= 10;
2156 value += c - '0';
2157 }
2158 }
2159 }
2160
2161 // check for character refs being legal XML
2162 if ((value < 0x0020 && !(value == '\n' || value == '\t' || value == '\r'))
2163 || (value >= 0xD800 && value <= 0xDFFF)
2164 || value == 0xFFFE
2165 || value == 0xFFFF || value > 0x0010ffff) {
2166 fatal("illegal XML character reference U+"
2167 + Integer.toHexString(value));
2168 } else if (value >= 0x007F && value <= 0x009F) // 2006-11-13 hsivonen
2169 {
2170 handler.warn("Character reference expands to a control character: U+00"
2171 + Integer.toHexString(c) + ".");
2172 }
2173 if (isPrivateUse(value)) {
2174 warnAboutPrivateUseChar();
2175 }
2176
2177 // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2178 // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2179 if (value <= 0x0000ffff) {
2180 // no surrogates needed
2181 dataBufferAppend((char) value);
2182 } else if (value <= 0x0010ffff) {
2183 value -= 0x10000;
2184 // > 16 bits, surrogate needed
2185 dataBufferAppend((char) (0xd800 | (value >> 10)));
2186 dataBufferAppend((char) (0xdc00 | (value & 0x0003ff)));
2187 } else {
2188 // too big for surrogate
2189 fatal("character reference " + value + " is too large for UTF-16",
2190 new Integer(value).toString(), null);
2191 }
2192 if (doFlush) {
2193 dataBufferFlush();
2194 }
2195 }
2196
2197 /**
2198 * Parse and expand an entity reference.
2199 *
2200 * <pre>
2201 * [68] EntityRef ::= '&' Name ';'
2202 * </pre>
2203 *
2204 * <p>
2205 * NOTE: the '&' has already been read.
2206 *
2207 * @param externalAllowed
2208 * External entities are allowed here.
2209 */
2210 private void parseEntityRef(boolean externalAllowed) throws SAXException,
2211 IOException {
2212 String name;
2213
2214 name = readNmtoken(true);
2215 require(';');
2216 switch (getEntityType(name)) {
2217 case ENTITY_UNDECLARED:
2218 // NOTE: XML REC describes amazingly convoluted handling for
2219 // this case. Nothing as meaningful as being a WFness error
2220 // unless the processor might _legitimately_ not have seen a
2221 // declaration ... which is what this implements.
2222 String message;
2223
2224 message = "reference to undeclared general entity " + name;
2225 if (skippedPE && !docIsStandalone) {
2226 handler.verror(message);
2227 // we don't know this entity, and it might be external...
2228 if (externalAllowed) {
2229 handler.skippedEntity(name);
2230 }
2231 } else {
2232 fatal(message);
2233 }
2234 break;
2235 case ENTITY_INTERNAL:
2236 pushString(name, getEntityValue(name));
2237
2238 // workaround for possible input pop before marking
2239 // the buffer reading position
2240 char t = readCh();
2241 unread(t);
2242 int bufferPosMark = readBufferPos;
2243
2244 int end = readBufferPos + getEntityValue(name).length();
2245 for (int k = readBufferPos; k < end; k++) {
2246 t = readCh();
2247 if (t == '&') {
2248 t = readCh();
2249 if (t == '#') {
2250 // try to match a character ref
2251 tryReadCharRef();
2252
2253 // everything has been read
2254 if (readBufferPos >= end) {
2255 break;
2256 }
2257 k = readBufferPos;
2258 continue;
2259 } else if (Character.isLetter(t)) {
2260 // looks like an entity ref
2261 unread(t);
2262 readNmtoken(true);
2263 require(';');
2264
2265 // everything has been read
2266 if (readBufferPos >= end) {
2267 break;
2268 }
2269 k = readBufferPos;
2270 continue;
2271 }
2272 fatal(" malformed entity reference");
2273 }
2274
2275 }
2276 readBufferPos = bufferPosMark;
2277 break;
2278 case ENTITY_TEXT:
2279 if (externalAllowed) {
2280 pushURL(false, name, getEntityIds(name), null, null, null,
2281 true);
2282 } else {
2283 fatal("reference to external entity in attribute value.",
2284 name, null);
2285 }
2286 break;
2287 case ENTITY_NDATA:
2288 if (externalAllowed) {
2289 fatal("unparsed entity reference in content", name, null);
2290 } else {
2291 fatal("reference to external entity in attribute value.",
2292 name, null);
2293 }
2294 break;
2295 default:
2296 throw new RuntimeException();
2297 }
2298 }
2299
2300 /**
2301 * Parse and expand a parameter entity reference.
2302 *
2303 * <pre>
2304 * [69] PEReference ::= '%' Name ';'
2305 * </pre>
2306 *
2307 * <p>
2308 * NOTE: the '%' has already been read.
2309 */
2310 private void parsePEReference() throws SAXException, IOException {
2311 String name;
2312
2313 name = "%" + readNmtoken(true);
2314 require(';');
2315 switch (getEntityType(name)) {
2316 case ENTITY_UNDECLARED:
2317 // VC: Entity Declared
2318 handler.verror("reference to undeclared parameter entity "
2319 + name);
2320
2321 // we should disable handling of all subsequent declarations
2322 // unless this is a standalone document (info discarded)
2323 break;
2324 case ENTITY_INTERNAL:
2325 if (inLiteral) {
2326 pushString(name, getEntityValue(name));
2327 } else {
2328 pushString(name, ' ' + getEntityValue(name) + ' ');
2329 }
2330 break;
2331 case ENTITY_TEXT:
2332 if (!inLiteral) {
2333 pushString(null, " ");
2334 }
2335 pushURL(true, name, getEntityIds(name), null, null, null, true);
2336 if (!inLiteral) {
2337 pushString(null, " ");
2338 }
2339 break;
2340 }
2341 }
2342
2343 /**
2344 * Parse an entity declaration.
2345 *
2346 * <pre>
2347 * [70] EntityDecl ::= GEDecl | PEDecl
2348 * [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
2349 * [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
2350 * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
2351 * [74] PEDef ::= EntityValue | ExternalID
2352 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2353 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2354 * [76] NDataDecl ::= S 'NDATA' S Name
2355 * </pre>
2356 *
2357 * <p>
2358 * NOTE: the '<!ENTITY' has already been read.
2359 */
2360 private void parseEntityDecl() throws Exception {
2361 boolean peFlag = false;
2362 int flags = 0;
2363
2364 // Check for a parameter entity.
2365 expandPE = false;
2366 requireWhitespace();
2367 if (tryRead('%')) {
2368 peFlag = true;
2369 requireWhitespace();
2370 }
2371 expandPE = true;
2372
2373 // Read the entity name, and prepend
2374 // '%' if necessary.
2375 String name = readNmtoken(true);
2376 // NE08
2377 if (name.indexOf(':') >= 0) {
2378 fatal("Illegal character(':') in entity name ", name, null);
2379 }
2380 if (peFlag) {
2381 name = "%" + name;
2382 }
2383
2384 // Read the entity value.
2385 requireWhitespace();
2386 char c = readCh();
2387 unread(c);
2388 if (c == '"' || c == '\'') {
2389 // Internal entity ... replacement text has expanded refs
2390 // to characters and PEs, but not to general entities
2391 String value = readLiteral(flags);
2392 setInternalEntity(name, value);
2393 } else {
2394 // Read the external IDs
2395 ExternalIdentifiers ids = readExternalIds(false, false);
2396
2397 // Check for NDATA declaration.
2398 boolean white = tryWhitespace();
2399 if (!peFlag && tryRead("NDATA")) {
2400 if (!white) {
2401 fatal("whitespace required before NDATA");
2402 }
2403 requireWhitespace();
2404 String notationName = readNmtoken(true);
2405 if (!skippedPE) {
2406 setExternalEntity(name, ENTITY_NDATA, ids, notationName);
2407 handler.unparsedEntityDecl(name, ids.publicId,
2408 ids.systemId, ids.baseUri, notationName);
2409 }
2410 } else if (!skippedPE) {
2411 setExternalEntity(name, ENTITY_TEXT, ids, null);
2412 handler.getDeclHandler().externalEntityDecl(name, ids.publicId,
2413 handler.resolveURIs()
2414 // FIXME: ASSUMES not skipped
2415 // "false" forces error on bad URI
2416 ? handler.absolutize(ids.baseUri, ids.systemId, false)
2417 : ids.systemId);
2418 }
2419 }
2420
2421 // Finish the declaration.
2422 skipWhitespace();
2423 require('>');
2424 }
2425
2426 /**
2427 * Parse a notation declaration.
2428 *
2429 * <pre>
2430 * [82] NotationDecl ::= '<!NOTATION' S Name S
2431 * (ExternalID | PublicID) S? '>'
2432 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2433 * </pre>
2434 *
2435 * <P>
2436 * NOTE: the '<!NOTATION' has already been read.
2437 */
2438 private void parseNotationDecl() throws Exception {
2439 String nname;
2440 ExternalIdentifiers ids;
2441
2442 requireWhitespace();
2443 nname = readNmtoken(true);
2444 // NE08
2445 if (nname.indexOf(':') >= 0) {
2446 fatal("Illegal character(':') in notation name ", nname, null);
2447 }
2448 requireWhitespace();
2449
2450 // Read the external identifiers.
2451 ids = readExternalIds(true, false);
2452
2453 // Register the notation.
2454 setNotation(nname, ids);
2455
2456 skipWhitespace();
2457 require('>');
2458 }
2459
2460 /**
2461 * Parse character data.
2462 *
2463 * <pre>
2464 * [14] CharData ::= [ˆ<&]* - ([ˆ<&]* ']]>' [ˆ<&]*)
2465 * </pre>
2466 */
2467 private void parseCharData() throws Exception {
2468 char c;
2469 int state = 0;
2470 boolean pureWhite = false;
2471
2472 // assert (dataBufferPos == 0);
2473
2474 // are we expecting pure whitespace? it might be dirty...
2475 if ((currentElementContent == CONTENT_ELEMENTS)
2476 && !isDirtyCurrentElement) {
2477 pureWhite = true;
2478 }
2479
2480 // always report right out of readBuffer
2481 // to minimize (pointless) buffer copies
2482 while (true) {
2483 int i;
2484
2485 loop: for (i = readBufferPos; i < readBufferLength; i++) {
2486 advanceLocation();
2487 switch (c = readBuffer[i]) {
2488 case '\n':
2489 nextCharOnNewLine = true;
2490 // pureWhite unmodified
2491 break;
2492 case '\r': // should not happen!!
2493 case '\t':
2494 case ' ':
2495 // pureWhite unmodified
2496 break;
2497 case '&':
2498 case '<':
2499 // pureWhite unmodified
2500 // CLEAN end of text sequence
2501 state = 1;
2502 break loop;
2503 case ']':
2504 // that's not a whitespace char, and
2505 // can not terminate pure whitespace either
2506 pureWhite = false;
2507 if ((i + 2) < readBufferLength) {
2508 if (readBuffer[i + 1] == ']'
2509 && readBuffer[i + 2] == '>') {
2510 // ERROR end of text sequence
2511 state = 2;
2512 break loop;
2513 }
2514 } else {
2515 // FIXME missing two end-of-buffer cases
2516 }
2517 break;
2518 default:
2519 if ((c < 0x0020 || c > 0xFFFD)
2520 || ((c >= 0x007f) && (c <= 0x009f)
2521 && (c != 0x0085) && xmlVersion == XML_11)) {
2522 fatal("illegal XML character U+"
2523 + Integer.toHexString(c));
2524 } else if (c >= '\u007F' && c <= '\u009F') // 2006-04-25
2525 // hsivonen
2526 {
2527 handler.warn("Saw a control character: U+00"
2528 + Integer.toHexString(c) + ".");
2529 }
2530 // that's not a whitespace char
2531 pureWhite = false;
2532 }
2533 }
2534 rollbackLocation();
2535 // report characters/whitspace
2536 int length = i - readBufferPos;
2537
2538 if (length != 0) {
2539 int saveLine = line;
2540 int saveColumn = column;
2541 line = linePrev;
2542 column = columnPrev;
2543 if (pureWhite) {
2544 handler.ignorableWhitespace(readBuffer, readBufferPos,
2545 length);
2546 } else {
2547 handler.charData(readBuffer, readBufferPos, length);
2548 }
2549 line = saveLine;
2550 column = saveColumn;
2551 readBufferPos = i;
2552 }
2553
2554 if (state != 0) {
2555 break;
2556 }
2557
2558 // fill next buffer from this entity, or
2559 // pop stack and continue with previous entity
2560 unread(readCh());
2561 }
2562 if (!pureWhite) {
2563 isDirtyCurrentElement = true;
2564 }
2565 // finish, maybe with error
2566 if (state != 1) // finish, no error
2567 {
2568 fatal("character data may not contain ']]>'");
2569 }
2570 }
2571
2572 /**
2573 *
2574 */
2575 private void advanceLocation() {
2576 linePrev = line;
2577 columnPrev = column;
2578 if (nextCharOnNewLine) {
2579 line++;
2580 column = 1;
2581 } else {
2582 column++;
2583 }
2584 nextCharOnNewLine = false;
2585 }
2586
2587 // ////////////////////////////////////////////////////////////////////
2588 // High-level reading and scanning methods.
2589 // ////////////////////////////////////////////////////////////////////
2590
2591 /**
2592 * Require whitespace characters.
2593 */
2594 private void requireWhitespace() throws SAXException, IOException {
2595 char c = readCh();
2596 if (isWhitespace(c)) {
2597 skipWhitespace();
2598 } else {
2599 fatal("whitespace required", c, null);
2600 }
2601 }
2602
2603 /**
2604 * Skip whitespace characters.
2605 *
2606 * <pre>
2607 * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2608 * </pre>
2609 */
2610 private void skipWhitespace() throws SAXException, IOException {
2611 // Start with a little cheat. Most of
2612 // the time, the white space will fall
2613 // within the current read buffer; if
2614 // not, then fall through.
2615 if (USE_CHEATS) {
2616
2617 loop: for (int i = readBufferPos; i < readBufferLength; i++) {
2618 advanceLocation();
2619 switch (readBuffer[i]) {
2620 case ' ':
2621 case '\t':
2622 case '\r':
2623 break;
2624 case '\n':
2625 nextCharOnNewLine = true;
2626 break;
2627 case '%':
2628 if (expandPE) {
2629 break loop;
2630 }
2631 // else fall through...
2632 default:
2633 readBufferPos = i;
2634 return;
2635 }
2636 }
2637 }
2638
2639 // OK, do it the slow way.
2640 char c = readCh();
2641 while (isWhitespace(c)) {
2642 c = readCh();
2643 }
2644 unread(c);
2645 }
2646
2647 /**
2648 * Read a name or (when parsing an enumeration) name token.
2649 *
2650 * <pre>
2651 * [5] Name ::= (Letter | '_' | ':') (NameChar)*
2652 * [7] Nmtoken ::= (NameChar)+
2653 * </pre>
2654 */
2655 private String readNmtoken(boolean isName) throws SAXException, IOException {
2656 char c;
2657
2658 if (USE_CHEATS) {
2659 loop: for (int i = readBufferPos; i < readBufferLength; i++) {
2660 c = readBuffer[i];
2661 switch (c) {
2662 case '%':
2663 if (expandPE) {
2664 break loop;
2665 }
2666 // else fall through...
2667
2668 // What may legitimately come AFTER a name/nmtoken?
2669 case '<':
2670 case '>':
2671 case '&':
2672 case ',':
2673 case '|':
2674 case '*':
2675 case '+':
2676 case '?':
2677 case ')':
2678 case '=':
2679 case '\'':
2680 case '"':
2681 case '[':
2682 case ' ':
2683 case '\t':
2684 case '\r':
2685 case '\n':
2686 case ';':
2687 case '/':
2688 int start = readBufferPos;
2689 if (i == start) {
2690 fatal("name expected", readBuffer[i], null);
2691 }
2692 readBufferPos = i;
2693 return intern(readBuffer, start, i - start);
2694
2695 default:
2696 // FIXME ... per IBM's OASIS test submission, these:
2697 // ? U+06dd
2698 // Combining U+309B
2699 // these switches are kind of ugly but at least we won't
2700 // have to go over the whole lits for each char
2701 if (isName && i == readBufferPos) {
2702 char c2 = (char) (c & 0x00f0);
2703 switch (c & 0xff00) {
2704 // starting with 01
2705 case 0x0100:
2706 switch (c2) {
2707 case 0x0030:
2708 if (c == 0x0132 || c == 0x0133
2709 || c == 0x013f) {
2710 fatal("Not a name start character, U+"
2711 + Integer.toHexString(c));
2712 }
2713 break;
2714 case 0x0040:
2715 if (c == 0x0140 || c == 0x0149) {
2716 fatal("Not a name start character, U+"
2717 + Integer.toHexString(c));
2718 }
2719 break;
2720 case 0x00c0:
2721 if (c == 0x01c4 || c == 0x01cc) {
2722 fatal("Not a name start character, U+"
2723 + Integer.toHexString(c));
2724 }
2725 break;
2726 case 0x00f0:
2727 if (c == 0x01f1 || c == 0x01f3) {
2728 fatal("Not a name start character, U+"
2729 + Integer.toHexString(c));
2730 }
2731 break;
2732 case 0x00b0:
2733 if (c == 0x01f1 || c == 0x01f3) {
2734 fatal("Not a name start character, U+"
2735 + Integer.toHexString(c));
2736 }
2737 break;
2738 default:
2739 if (c == 0x017f) {
2740 fatal("Not a name start character, U+"
2741 + Integer.toHexString(c));
2742 }
2743 }
2744
2745 break;
2746 // starting with 11
2747 case 0x1100:
2748 switch (c2) {
2749 case 0x0000:
2750 if (c == 0x1104 || c == 0x1108
2751 || c == 0x110a
2752 || c == 0x110d) {
2753 fatal("Not a name start character, U+"
2754 + Integer.toHexString(c));
2755 }
2756 break;
2757 case 0x0030:
2758 if (c == 0x113b || c == 0x113f) {
2759 fatal("Not a name start character, U+"
2760 + Integer.toHexString(c));
2761 }
2762 break;
2763 case 0x0040:
2764 if (c == 0x1141 || c == 0x114d
2765 || c == 0x114f) {
2766 fatal("Not a name start character, U+"
2767 + Integer.toHexString(c));
2768 }
2769 break;
2770 case 0x0050:
2771 if (c == 0x1151 || c == 0x1156) {
2772 fatal("Not a name start character, U+"
2773 + Integer.toHexString(c));
2774 }
2775 break;
2776 case 0x0060:
2777 if (c == 0x1162 || c == 0x1164
2778 || c == 0x1166
2779 || c == 0x116b
2780 || c == 0x116f) {
2781 fatal("Not a name start character, U+"
2782 + Integer.toHexString(c));
2783 }
2784 break;
2785 case 0x00b0:
2786 if (c == 0x11b6 || c == 0x11b9
2787 || c == 0x11bb
2788 || c == 0x116f) {
2789 fatal("Not a name start character, U+"
2790 + Integer.toHexString(c));
2791 }
2792 break;
2793 default:
2794 if (c == 0x1174 || c == 0x119f
2795 || c == 0x11ac
2796 || c == 0x11c3
2797 || c == 0x11f1) {
2798 fatal("Not a name start character, U+"
2799 + Integer.toHexString(c));
2800 }
2801 }
2802 break;
2803 default:
2804 if (c == 0x0e46 || c == 0x1011
2805 || c == 0x212f || c == 0x0587
2806 || c == 0x0230) {
2807 fatal("Not a name start character, U+"
2808 + Integer.toHexString(c));
2809 }
2810 }
2811 }
2812 // punt on exact tests from Appendix A; approximate
2813 // them using the Unicode ID start/part rules
2814 if (i == readBufferPos && isName) {
2815 if (!Character.isUnicodeIdentifierStart(c)
2816 && c != ':' && c != '_') {
2817 fatal("Not a name start character, U+"
2818 + Integer.toHexString(c));
2819 }
2820 } else if (!Character.isUnicodeIdentifierPart(c)
2821 && c != '-' && c != ':' && c != '_' && c != '.'
2822 && !isExtender(c)) {
2823 fatal("Not a name character, U+"
2824 + Integer.toHexString(c));
2825 }
2826 }
2827 }
2828 }
2829
2830 nameBufferPos = 0;
2831
2832 // Read the first character.
2833 loop: while (true) {
2834 c = readCh();
2835 switch (c) {
2836 case '%':
2837 case '<':
2838 case '>':
2839 case '&':
2840 case ',':
2841 case '|':
2842 case '*':
2843 case '+':
2844 case '?':
2845 case ')':
2846 case '=':
2847 case '\'':
2848 case '"':
2849 case '[':
2850 case ' ':
2851 case '\t':
2852 case '\n':
2853 case '\r':
2854 case ';':
2855 case '/':
2856 unread(c);
2857 if (nameBufferPos == 0) {
2858 fatal("name expected");
2859 }
2860 // punt on exact tests from Appendix A, but approximate them
2861 if (isName
2862 && !Character.isUnicodeIdentifierStart(nameBuffer[0])
2863 && ":_".indexOf(nameBuffer[0]) == -1) {
2864 fatal("Not a name start character, U+"
2865 + Integer.toHexString(nameBuffer[0]));
2866 }
2867 String s = intern(nameBuffer, 0, nameBufferPos);
2868 nameBufferPos = 0;
2869 return s;
2870 default:
2871 // punt on exact tests from Appendix A, but approximate them
2872
2873 if ((nameBufferPos != 0 || !isName)
2874 && !Character.isUnicodeIdentifierPart(c)
2875 && ":-_.".indexOf(c) == -1 && !isExtender(c)) {
2876 fatal("Not a name character, U+"
2877 + Integer.toHexString(c));
2878 }
2879 if (nameBufferPos >= nameBuffer.length) {
2880 nameBuffer = (char[]) extendArray(nameBuffer,
2881 nameBuffer.length, nameBufferPos);
2882 }
2883 nameBuffer[nameBufferPos++] = c;
2884 }
2885 }
2886 }
2887
2888 private static boolean isExtender(char c) {
2889 // [88] Extender ::= ...
2890 return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
2891 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
2892 || (c >= 0x3031 && c <= 0x3035) || (c >= 0x309d && c <= 0x309e)
2893 || (c >= 0x30fc && c <= 0x30fe);
2894 }
2895
2896 /**
2897 * Read a literal. With matching single or double quotes as delimiters (and
2898 * not embedded!) this is used to parse:
2899 *
2900 * <pre>
2901 * [9] EntityValue ::= ... ([ˆ%&] | PEReference | Reference)* ...
2902 * [10] AttValue ::= ... ([ˆ<&] | Reference)* ...
2903 * [11] SystemLiteral ::= ... (URLchar - "'")* ...
2904 * [12] PubidLiteral ::= ... (PubidChar - "'")* ...
2905 * </pre>
2906 *
2907 * as well as the quoted strings in XML and text declarations (for version,
2908 * encoding, and standalone) which have their own constraints.
2909 */
2910 private String readLiteral(int flags) throws SAXException, IOException {
2911 char delim, c;
2912 int startLine = line;
2913 boolean saved = expandPE;
2914 boolean savedReport = doReport;
2915
2916 // Find the first delimiter.
2917 delim = readCh();
2918 if (delim != '"' && delim != '\'') {
2919 fatal("expected '\"' or \"'\"", delim, null);
2920 return null;
2921 }
2922 inLiteral = true;
2923 if ((flags & LIT_DISABLE_PE) != 0) {
2924 expandPE = false;
2925 }
2926 doReport = false;
2927
2928 // Each level of input source has its own buffer; remember
2929 // ours, so we won't read the ending delimiter from any
2930 // other input source, regardless of entity processing.
2931 char[] ourBuf = readBuffer;
2932
2933 // Read the literal.
2934 try {
2935 c = readCh();
2936 loop: while (!(c == delim && readBuffer == ourBuf)) {
2937 switch (c) {
2938 // attributes and public ids are normalized
2939 // in almost the same ways
2940 case '\n':
2941 case '\r':
2942 if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0) {
2943 c = ' ';
2944 }
2945 break;
2946 case '\t':
2947 if ((flags & LIT_ATTRIBUTE) != 0) {
2948 c = ' ';
2949 }
2950 break;
2951 case '&':
2952 c = readCh();
2953 // Char refs are expanded immediately, except for
2954 // all the cases where it's deferred.
2955 if (c == '#') {
2956 if ((flags & LIT_DISABLE_CREF) != 0) {
2957 dataBufferAppend('&');
2958 break;
2959 }
2960 parseCharRef(false /* Do not do flushDataBuffer */);
2961
2962 // exotic WFness risk: this is an entity literal,
2963 // dataBuffer [dataBufferPos - 1] == '&', and
2964 // following chars are a _partial_ entity/char ref
2965
2966 // It looks like an entity ref ...
2967 } else {
2968 unread(c);
2969 // Expand it?
2970 if ((flags & LIT_ENTITY_REF) > 0) {
2971 parseEntityRef(false);
2972 // Is it just data?
2973 } else if ((flags & LIT_DISABLE_EREF) != 0) {
2974 dataBufferAppend('&');
2975
2976 // OK, it will be an entity ref -- expanded
2977 // later.
2978 } else {
2979 String name = readNmtoken(true);
2980 require(';');
2981 dataBufferAppend('&');
2982 dataBufferAppend(name);
2983 dataBufferAppend(';');
2984 }
2985 }
2986 c = readCh();
2987 continue loop;
2988
2989 case '<':
2990 // and why? Perhaps so "&foo;" expands the same
2991 // inside and outside an attribute?
2992 if ((flags & LIT_ATTRIBUTE) != 0) {
2993 fatal("attribute values may not contain '<'");
2994 }
2995 break;
2996
2997 // We don't worry about case '%' and PE refs, readCh does.
2998
2999 default:
3000 break;
3001 }
3002 dataBufferAppend(c);
3003 c = readCh();
3004 }
3005 } catch (EOFException e) {
3006 fatal("end of input while looking for delimiter (started on line "
3007 + startLine + ')', null, new Character(delim).toString());
3008 }
3009 inLiteral = false;
3010 expandPE = saved;
3011 doReport = savedReport;
3012
3013 // Normalise whitespace if necessary.
3014 if ((flags & LIT_NORMALIZE) > 0) {
3015 dataBufferNormalize();
3016 }
3017
3018 // Return the value.
3019 return dataBufferToString();
3020 }
3021
3022 /**
3023 * Try reading external identifiers. A system identifier is not required for
3024 * notations.
3025 *
3026 * @param inNotation
3027 * Are we parsing a notation decl?
3028 * @param isSubset
3029 * Parsing external subset decl (may be omitted)?
3030 * @return A three-member String array containing the identifiers, or nulls.
3031 * Order: public, system, baseURI.
3032 */
3033 private ExternalIdentifiers readExternalIds(boolean inNotation,
3034 boolean isSubset) throws Exception {
3035 char c;
3036 ExternalIdentifiers ids = new ExternalIdentifiers();
3037 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
3038
3039 if (tryRead("PUBLIC")) {
3040 requireWhitespace();
3041 ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags);
3042 if (inNotation) {
3043 skipWhitespace();
3044 c = readCh();
3045 unread(c);
3046 if (c == '"' || c == '\'') {
3047 ids.systemId = readLiteral(flags);
3048 }
3049 } else {
3050 requireWhitespace();
3051 ids.systemId = readLiteral(flags);
3052 }
3053
3054 for (int i = 0; i < ids.publicId.length(); i++) {
3055 c = ids.publicId.charAt(i);
3056 if (c >= 'a' && c <= 'z') {
3057 continue;
3058 }
3059 if (c >= 'A' && c <= 'Z') {
3060 continue;
3061 }
3062 if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1) {
3063 continue;
3064 }
3065 fatal("illegal PUBLIC id character U+" + Integer.toHexString(c));
3066 }
3067 } else if (tryRead("SYSTEM")) {
3068 requireWhitespace();
3069 ids.systemId = readLiteral(flags);
3070 } else if (!isSubset) {
3071 fatal("missing SYSTEM or PUBLIC keyword");
3072 }
3073
3074 if (ids.systemId != null) {
3075 if (ids.systemId.indexOf('#') != -1) {
3076 handler.verror("SYSTEM id has a URI fragment: " + ids.systemId);
3077 }
3078 ids.baseUri = handler.getSystemId();
3079 if (ids.baseUri == null && uriWarnings) {
3080 handler.warn("No base URI; hope URI is absolute: "
3081 + ids.systemId);
3082 }
3083 }
3084
3085 return ids;
3086 }
3087
3088 /**
3089 * Test if a character is whitespace.
3090 *
3091 * <pre>
3092 * [3] S ::= (#x20 | #x9 | #xd | #xa)+
3093 * </pre>
3094 *
3095 * @param c
3096 * The character to test.
3097 * @return true if the character is whitespace.
3098 */
3099 private final boolean isWhitespace(char c) {
3100 if (c > 0x20) {
3101 return false;
3102 }
3103 if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d) {
3104 return true;
3105 }
3106 return false; // illegal ...
3107 }
3108
3109 // ////////////////////////////////////////////////////////////////////
3110 // Utility routines.
3111 // ////////////////////////////////////////////////////////////////////
3112
3113 /**
3114 * Add a character to the data buffer.
3115 */
3116 private void dataBufferAppend(char c) {
3117 // Expand buffer if necessary.
3118 if (dataBufferPos >= dataBuffer.length) {
3119 dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
3120 dataBufferPos);
3121 }
3122 dataBuffer[dataBufferPos++] = c;
3123 }
3124
3125 /**
3126 * Add a string to the data buffer.
3127 */
3128 private void dataBufferAppend(String s) {
3129 dataBufferAppend(s.toCharArray(), 0, s.length());
3130 }
3131
3132 /**
3133 * Append (part of) a character array to the data buffer.
3134 */
3135 private void dataBufferAppend(char[] ch, int start, int length) {
3136 dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
3137 dataBufferPos + length);
3138
3139 System.arraycopy(ch, start, dataBuffer, dataBufferPos, length);
3140 dataBufferPos += length;
3141 }
3142
3143 /**
3144 * Normalise space characters in the data buffer.
3145 */
3146 private void dataBufferNormalize() {
3147 int i = 0;
3148 int j = 0;
3149 int end = dataBufferPos;
3150
3151 // Skip spaces at the start.
3152 while (j < end && dataBuffer[j] == ' ') {
3153 j++;
3154 }
3155
3156 // Skip whitespace at the end.
3157 while (end > j && dataBuffer[end - 1] == ' ') {
3158 end--;
3159 }
3160
3161 // Start copying to the left.
3162 while (j < end) {
3163
3164 char c = dataBuffer[j++];
3165
3166 // Normalise all other spaces to
3167 // a single space.
3168 if (c == ' ') {
3169 while (j < end && dataBuffer[j++] == ' ') {
3170 continue;
3171 }
3172 dataBuffer[i++] = ' ';
3173 dataBuffer[i++] = dataBuffer[j - 1];
3174 } else {
3175 dataBuffer[i++] = c;
3176 }
3177 }
3178
3179 // The new length is <= the old one.
3180 dataBufferPos = i;
3181 }
3182
3183 /**
3184 * Convert the data buffer to a string.
3185 */
3186 private String dataBufferToString() {
3187 String s = new String(dataBuffer, 0, dataBufferPos);
3188 dataBufferPos = 0;
3189 return s;
3190 }
3191
3192 /**
3193 * Flush the contents of the data buffer to the handler, as appropriate, and
3194 * reset the buffer for new input.
3195 */
3196 private void dataBufferFlush() throws SAXException {
3197 int saveLine = line;
3198 int saveColumn = column;
3199 line = linePrev;
3200 column = columnPrev;
3201 if (currentElementContent == CONTENT_ELEMENTS && dataBufferPos > 0
3202 && !inCDATA) {
3203 // We can't just trust the buffer to be whitespace, there
3204 // are (error) cases when it isn't
3205 for (int i = 0; i < dataBufferPos; i++) {
3206 if (!isWhitespace(dataBuffer[i])) {
3207 handler.charData(dataBuffer, 0, dataBufferPos);
3208 dataBufferPos = 0;
3209 }
3210 }
3211 if (dataBufferPos > 0) {
3212 handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos);
3213 dataBufferPos = 0;
3214 }
3215 } else if (dataBufferPos > 0) {
3216 handler.charData(dataBuffer, 0, dataBufferPos);
3217 dataBufferPos = 0;
3218 }
3219 line = saveLine;
3220 column = saveColumn;
3221 }
3222
3223 /**
3224 * Require a string to appear, or throw an exception.
3225 * <p>
3226 * <em>Precondition:</em> Entity expansion is not required.
3227 * <p>
3228 * <em>Precondition:</em> data buffer has no characters that will get sent
3229 * to the application.
3230 */
3231 private void require(String delim) throws SAXException, IOException {
3232 int length = delim.length();
3233 char[] ch;
3234
3235 if (length < dataBuffer.length) {
3236 ch = dataBuffer;
3237 delim.getChars(0, length, ch, 0);
3238 } else {
3239 ch = delim.toCharArray();
3240 }
3241
3242 if (USE_CHEATS && length <= (readBufferLength - readBufferPos)) {
3243 int offset = readBufferPos;
3244
3245 for (int i = 0; i < length; i++, offset++) {
3246 if (ch[i] != readBuffer[offset]) {
3247 fatal("required string", null, delim);
3248 }
3249 }
3250 readBufferPos = offset;
3251
3252 } else {
3253 for (int i = 0; i < length; i++) {
3254 require(ch[i]);
3255 }
3256 }
3257 }
3258
3259 /**
3260 * Require a character to appear, or throw an exception.
3261 */
3262 private void require(char delim) throws SAXException, IOException {
3263 char c = readCh();
3264
3265 if (c != delim) {
3266 fatal("required character", c, new Character(delim).toString());
3267 }
3268 }
3269
3270 /**
3271 * Create an interned string from a character array. Ælfred uses this
3272 * method to create an interned version of all names and name tokens, so
3273 * that it can test equality with <code>==</code> instead of
3274 * <code>String.equals ()</code>.
3275 *
3276 * <p>
3277 * This is much more efficient than constructing a non-interned string
3278 * first, and then interning it.
3279 *
3280 * @param ch
3281 * an array of characters for building the string.
3282 * @param start
3283 * the starting position in the array.
3284 * @param length
3285 * the number of characters to place in the string.
3286 * @return an interned string.
3287 * @see #intern (String)
3288 * @see java.lang.String#intern
3289 */
3290 public String intern(char[] ch, int start, int length) {
3291 int index = 0;
3292 int hash = 0;
3293 Object[] bucket;
3294
3295 // Generate a hash code. This is a widely used string hash,
3296 // often attributed to Brian Kernighan.
3297 for (int i = start; i < start + length; i++) {
3298 hash = 31 * hash + ch[i];
3299 }
3300 hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
3301
3302 // Get the bucket -- consists of {array,String} pairs
3303 if ((bucket = symbolTable[hash]) == null) {
3304 // first string in this bucket
3305 bucket = new Object[8];
3306
3307 // Search for a matching tuple, and
3308 // return the string if we find one.
3309 } else {
3310 while (index < bucket.length) {
3311 char[] chFound = (char[]) bucket[index];
3312
3313 // Stop when we hit an empty entry.
3314 if (chFound == null) {
3315 break;
3316 }
3317
3318 // If they're the same length, check for a match.
3319 if (chFound.length == length) {
3320 for (int i = 0; i < chFound.length; i++) {
3321 // continue search on failure
3322 if (ch[start + i] != chFound[i]) {
3323 break;
3324 } else if (i == length - 1) {
3325 // That's it, we have a match!
3326 return (String) bucket[index + 1];
3327 }
3328 }
3329 }
3330 index += 2;
3331 }
3332 // Not found -- we'll have to add it.
3333
3334 // Do we have to grow the bucket?
3335 bucket = (Object[]) extendArray(bucket, bucket.length, index);
3336 }
3337 symbolTable[hash] = bucket;
3338
3339 // OK, add it to the end of the bucket -- "local" interning.
3340 // Intern "globally" to let applications share interning benefits.
3341 // That is, "!=" and "==" work on our strings, not just equals().
3342 String s = new String(ch, start, length).intern();
3343 bucket[index] = s.toCharArray();
3344 bucket[index + 1] = s;
3345 return s;
3346 }
3347
3348 /**
3349 * Ensure the capacity of an array, allocating a new one if necessary.
3350 * Usually extends only for name hash collisions.
3351 */
3352 private Object extendArray(Object array, int currentSize, int requiredSize) {
3353 if (requiredSize < currentSize) {
3354 return array;
3355 } else {
3356 System.err.println(requiredSize);
3357 System.err.flush();
3358 Object newArray = null;
3359 int newSize = currentSize * 2;
3360
3361 if (newSize <= requiredSize) {
3362 newSize = requiredSize + 1;
3363 }
3364
3365 if (array instanceof char[]) {
3366 newArray = new char[newSize];
3367 } else if (array instanceof Object[]) {
3368 newArray = new Object[newSize];
3369 } else {
3370 throw new RuntimeException();
3371 }
3372
3373 System.arraycopy(array, 0, newArray, 0, currentSize);
3374 return newArray;
3375 }
3376 }
3377
3378 // ////////////////////////////////////////////////////////////////////
3379 // XML query routines.
3380 // ////////////////////////////////////////////////////////////////////
3381
3382 boolean isStandalone() {
3383 return docIsStandalone;
3384 }
3385
3386 //
3387 // Elements
3388 //
3389
3390 private int getContentType(ElementDecl element, int defaultType) {
3391 int retval;
3392
3393 if (element == null) {
3394 return defaultType;
3395 }
3396 retval = element.contentType;
3397 if (retval == CONTENT_UNDECLARED) {
3398 retval = defaultType;
3399 }
3400 return retval;
3401 }
3402
3403 /**
3404 * Look up the content type of an element.
3405 *
3406 * @param name
3407 * The element type name.
3408 * @return An integer constant representing the content type.
3409 * @see #CONTENT_UNDECLARED
3410 * @see #CONTENT_ANY
3411 * @see #CONTENT_EMPTY
3412 * @see #CONTENT_MIXED
3413 * @see #CONTENT_ELEMENTS
3414 */
3415 public int getElementContentType(String name) {
3416 ElementDecl element = elementInfo.get(name);
3417 return getContentType(element, CONTENT_UNDECLARED);
3418 }
3419
3420 /**
3421 * Register an element. Array format: [0] element type name [1] content
3422 * model (mixed, elements only) [2] attribute hash table
3423 */
3424 private void setElement(String name, int contentType, String contentModel,
3425 HashMap<String, AttributeDecl> attributes) throws SAXException {
3426 if (skippedPE) {
3427 return;
3428 }
3429
3430 ElementDecl element = elementInfo.get(name);
3431
3432 // first <!ELEMENT ...> or <!ATTLIST ...> for this type?
3433 if (element == null) {
3434 element = new ElementDecl();
3435 element.contentType = contentType;
3436 element.contentModel = contentModel;
3437 element.attributes = attributes;
3438 elementInfo.put(name, element);
3439 return;
3440 }
3441
3442 // <!ELEMENT ...> declaration?
3443 if (contentType != CONTENT_UNDECLARED) {
3444 // ... following an associated <!ATTLIST ...>
3445 if (element.contentType == CONTENT_UNDECLARED) {
3446 element.contentType = contentType;
3447 element.contentModel = contentModel;
3448 } else {
3449 // VC: Unique Element Type Declaration
3450 handler.verror("multiple declarations for element type: "
3451 + name);
3452 }
3453 }
3454
3455 // first <!ATTLIST ...>, before <!ELEMENT ...> ?
3456 else if (attributes != null) {
3457 element.attributes = attributes;
3458 }
3459 }
3460
3461 /**
3462 * Look up the attribute hash table for an element. The hash table is the
3463 * second item in the element array.
3464 */
3465 private HashMap<String, AttributeDecl> getElementAttributes(String name) {
3466 ElementDecl element = elementInfo.get(name);
3467 return (element == null) ? null : element.attributes;
3468 }
3469
3470 //
3471 // Attributes
3472 //
3473
3474 /**
3475 * Get the declared attributes for an element type.
3476 *
3477 * @param elname
3478 * The name of the element type.
3479 * @return An iterator over all the attributes declared for a specific
3480 * element type. The results will be valid only after the DTD (if
3481 * any) has been parsed.
3482 * @see #getAttributeType
3483 * @see #getAttributeEnumeration
3484 * @see #getAttributeDefaultValueType
3485 * @see #getAttributeDefaultValue
3486 * @see #getAttributeExpandedValue
3487 */
3488 private Iterator<String> declaredAttributes(ElementDecl element) {
3489 HashMap<String, AttributeDecl> attlist;
3490
3491 if (element == null) {
3492 return null;
3493 }
3494 if ((attlist = element.attributes) == null) {
3495 return null;
3496 }
3497 return attlist.keySet().iterator();
3498 }
3499
3500 /**
3501 * Get the declared attributes for an element type.
3502 *
3503 * @param elname
3504 * The name of the element type.
3505 * @return An iterator over all the attributes declared for a specific
3506 * element type. The results will be valid only after the DTD (if
3507 * any) has been parsed.
3508 * @see #getAttributeType
3509 * @see #getAttributeEnumeration
3510 * @see #getAttributeDefaultValueType
3511 * @see #getAttributeDefaultValue
3512 * @see #getAttributeExpandedValue
3513 */
3514 public Iterator<String> declaredAttributes(String elname) {
3515 return declaredAttributes(elementInfo.get(elname));
3516 }
3517
3518 /**
3519 * Retrieve the declared type of an attribute.
3520 *
3521 * @param name
3522 * The name of the associated element.
3523 * @param aname
3524 * The name of the attribute.
3525 * @return An interend string denoting the type, or null indicating an
3526 * undeclared attribute.
3527 */
3528 public String getAttributeType(String name, String aname) {
3529 AttributeDecl attribute = getAttribute(name, aname);
3530 return (attribute == null) ? null : attribute.type;
3531 }
3532
3533 /**
3534 * Retrieve the allowed values for an enumerated attribute type.
3535 *
3536 * @param name
3537 * The name of the associated element.
3538 * @param aname
3539 * The name of the attribute.
3540 * @return A string containing the token list.
3541 */
3542 public String getAttributeEnumeration(String name, String aname) {
3543 AttributeDecl attribute = getAttribute(name, aname);
3544 // assert: attribute.enumeration is "ENUMERATION" or "NOTATION"
3545 return (attribute == null) ? null : attribute.enumeration;
3546 }
3547
3548 /**
3549 * Retrieve the default value of a declared attribute.
3550 *
3551 * @param name
3552 * The name of the associated element.
3553 * @param aname
3554 * The name of the attribute.
3555 * @return The default value, or null if the attribute was #IMPLIED or
3556 * simply undeclared and unspecified.
3557 * @see #getAttributeExpandedValue
3558 */
3559 public String getAttributeDefaultValue(String name, String aname) {
3560 AttributeDecl attribute = getAttribute(name, aname);
3561 return (attribute == null) ? null : attribute.value;
3562 }
3563
3564 /*
 * FIXME: Leaving this in until W3C finally resolves the confusion between
 * parts of the XML 2nd REC about when entity declarations are guaranteed
 * to be known. Current code matches what section 5.1 (conformance)
 * describes, but some readings of the self-contradicting text in 4.1 (the
 * "Entity Declared" WFC and VC) seem to expect that attribute
 * expansion/normalization must be deferred in some cases (just TRY to
 * identify them!).
3572 *
3573 * Retrieve the expanded value of a declared attribute. <p>General entities
3574 * (and char refs) will be expanded (once). @param name The name of the
3575 * associated element. @param aname The name of the attribute. @return The
3576 * expanded default value, or null if the attribute was #IMPLIED or simply
3577 * undeclared
3578 *
3579 * @see #getAttributeDefaultValue public String getAttributeExpandedValue
3580 * (String name, String aname) throws Exception { AttributeDecl
3581 * attribute = getAttribute (name, aname);
3582 *
3583 * if (attribute == null) { return null; } else if (attribute.defaultValue ==
3584 * null && attribute.value != null) { // we MUST use the same buf for both
3585 * quotes else the literal // can't be properly terminated char buf [] = new
3586 * char [1]; int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE; String type =
3587 * getAttributeType (name, aname);
3588 *
3589 * if (type != "CDATA" && type != null) flags |= LIT_NORMALIZE; buf [0] =
3590 * '"'; pushCharArray (null, buf, 0, 1); pushString (null, attribute.value);
3591 * pushCharArray (null, buf, 0, 1); attribute.defaultValue = readLiteral
3592 * (flags); } return attribute.defaultValue; }
3593 */
3594
3595 /**
3596 * Retrieve the default value mode of a declared attribute.
3597 *
3598 * @see #ATTRIBUTE_DEFAULT_SPECIFIED
3599 * @see #ATTRIBUTE_DEFAULT_IMPLIED
3600 * @see #ATTRIBUTE_DEFAULT_REQUIRED
3601 * @see #ATTRIBUTE_DEFAULT_FIXED
3602 */
3603 public int getAttributeDefaultValueType(String name, String aname) {
3604 AttributeDecl attribute = getAttribute(name, aname);
3605 return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED
3606 : attribute.valueType;
3607 }
3608
3609 /**
3610 * Register an attribute declaration for later retrieval. Format: - String
3611 * type - String default value - int value type - enumeration - processed
3612 * default value
3613 */
3614 private void setAttribute(String elName, String name, String type,
3615 String enumeration, String value, int valueType) throws Exception {
3616 HashMap<String, AttributeDecl> attlist;
3617
3618 if (skippedPE) {
3619 return;
3620 }
3621
3622 // Create a new hashtable if necessary.
3623 attlist = getElementAttributes(elName);
3624 if (attlist == null) {
3625 attlist = new HashMap<String, AttributeDecl>();
3626 }
3627
3628 // ignore multiple attribute declarations!
3629 if (attlist.get(name) != null) {
3630 // warn ...
3631 return;
3632 } else {
3633 AttributeDecl attribute = new AttributeDecl();
3634 attribute.type = type;
3635 attribute.value = value;
3636 attribute.valueType = valueType;
3637 attribute.enumeration = enumeration;
3638 attlist.put(name, attribute);
3639
3640 // save; but don't overwrite any existing <!ELEMENT ...>
3641 setElement(elName, CONTENT_UNDECLARED, null, attlist);
3642 }
3643 }
3644
3645 /**
3646 * Retrieve the attribute declaration for the given element name and name.
3647 */
3648 private AttributeDecl getAttribute(String elName, String name) {
3649 HashMap<String, AttributeDecl> attlist = getElementAttributes(elName);
3650 return (attlist == null) ? null : attlist.get(name);
3651 }
3652
3653 //
3654 // Entities
3655 //
3656
3657 /**
3658 * Find the type of an entity.
3659 *
3660 * @returns An integer constant representing the entity type.
3661 * @see #ENTITY_UNDECLARED
3662 * @see #ENTITY_INTERNAL
3663 * @see #ENTITY_NDATA
3664 * @see #ENTITY_TEXT
3665 */
3666 public int getEntityType(String ename) {
3667 EntityInfo entity = entityInfo.get(ename);
3668 return (entity == null) ? ENTITY_UNDECLARED : entity.type;
3669 }
3670
3671 /**
3672 * Return an external entity's identifiers.
3673 *
3674 * @param ename
3675 * The name of the external entity.
3676 * @return The entity's public identifier, system identifier, and base URI.
3677 * Null if the entity was not declared as an external entity.
3678 * @see #getEntityType
3679 */
3680 public ExternalIdentifiers getEntityIds(String ename) {
3681 EntityInfo entity = entityInfo.get(ename);
3682 return (entity == null) ? null : entity.ids;
3683 }
3684
3685 /**
3686 * Return an internal entity's replacement text.
3687 *
3688 * @param ename
3689 * The name of the internal entity.
3690 * @return The entity's replacement text, or null if the entity was not
3691 * declared as an internal entity.
3692 * @see #getEntityType
3693 */
3694 public String getEntityValue(String ename) {
3695 EntityInfo entity = entityInfo.get(ename);
3696 return (entity == null) ? null : entity.value;
3697 }
3698
3699 /**
3700 * Register an entity declaration for later retrieval.
3701 */
3702 private void setInternalEntity(String eName, String value)
3703 throws SAXException {
3704 if (skippedPE) {
3705 return;
3706 }
3707
3708 if (entityInfo.get(eName) == null) {
3709 EntityInfo entity = new EntityInfo();
3710 entity.type = ENTITY_INTERNAL;
3711 entity.value = value;
3712 entityInfo.put(eName, entity);
3713 }
3714 if (handler.stringInterning) {
3715 if ("lt" == eName || "gt" == eName || "quot" == eName
3716 || "apos" == eName || "amp" == eName) {
3717 return;
3718 }
3719 } else {
3720 if ("lt".equals(eName) || "gt".equals(eName)
3721 || "quot".equals(eName) || "apos".equals(eName)
3722 || "amp".equals(eName)) {
3723 return;
3724 }
3725 }
3726 handler.getDeclHandler().internalEntityDecl(eName, value);
3727 }
3728
3729 /**
3730 * Register an external entity declaration for later retrieval.
3731 */
3732 private void setExternalEntity(String eName, int eClass,
3733 ExternalIdentifiers ids, String nName) {
3734 if (entityInfo.get(eName) == null) {
3735 EntityInfo entity = new EntityInfo();
3736 entity.type = eClass;
3737 entity.ids = ids;
3738 entity.notationName = nName;
3739 entityInfo.put(eName, entity);
3740 }
3741 }
3742
3743 //
3744 // Notations.
3745 //
3746
3747 /**
3748 * Report a notation declaration, checking for duplicates.
3749 */
3750 private void setNotation(String nname, ExternalIdentifiers ids)
3751 throws SAXException {
3752 if (skippedPE) {
3753 return;
3754 }
3755
3756 handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri);
3757 if (notationInfo.get(nname) == null) {
3758 notationInfo.put(nname, nname);
3759 } else {
3760 // VC: Unique Notation Name
3761 handler.verror("Duplicate notation name decl: " + nname);
3762 }
3763 }
3764
3765 //
3766 // Location.
3767 //
3768
3769 /**
3770 * Return the current line number.
3771 */
3772 public int getLineNumber() {
3773 if (line > 0) {
3774 return line;
3775 } else {
3776 return -1;
3777 }
3778 }
3779
3780 /**
3781 * Return the current column number.
3782 */
3783 public int getColumnNumber() {
3784 if (column > 0) {
3785 return column;
3786 } else {
3787 return -1;
3788 }
3789 }
3790
3791 // ////////////////////////////////////////////////////////////////////
3792 // High-level I/O.
3793 // ////////////////////////////////////////////////////////////////////
3794
3795 /**
3796 * Read a single character from the readBuffer.
3797 * <p>
3798 * The readDataChunk () method maintains the buffer.
3799 * <p>
3800 * If we hit the end of an entity, try to pop the stack and keep going.
3801 * <p>
3802 * (This approach doesn't really enforce XML's rules about entity
3803 * boundaries, but this is not currently a validating parser).
3804 * <p>
3805 * This routine also attempts to keep track of the current position in
3806 * external entities, but it's not entirely accurate.
3807 *
3808 * @return The next available input character.
3809 * @see #unread (char)
3810 * @see #readDataChunk
3811 * @see #readBuffer
3812 * @see #line
3813 * @return The next character from the current input source.
3814 */
3815 private char readCh() throws SAXException, IOException {
3816 // As long as there's nothing in the
3817 // read buffer, try reading more data
3818 // (for an external entity) or popping
3819 // the entity stack (for either).
3820 while (readBufferPos >= readBufferLength) {
3821 switch (sourceType) {
3822 case INPUT_READER:
3823 readDataChunk();
3824 while (readBufferLength < 1) {
3825 popInput();
3826 if (readBufferLength < 1) {
3827 readDataChunk();
3828 }
3829 }
3830 break;
3831
3832 default:
3833
3834 popInput();
3835 break;
3836 }
3837 }
3838
3839 char c = readBuffer[readBufferPos++];
3840 advanceLocation();
3841 // copied from fi.iki.hsivonen.htmlparser
3842 if ((c & 0xFC00) == 0xDC00) {
3843 // Got a low surrogate. See if prev was high surrogate
3844 if ((prev & 0xFC00) == 0xD800) {
3845 int intVal = (prev << 10) + c + SURROGATE_OFFSET;
3846 if (isNonCharacter(intVal)) {
3847 handler.warn("Astral non-character.");
3848 }
3849 if (isAstralPrivateUse(intVal)) {
3850 warnAboutPrivateUseChar();
3851 }
3852 } else {
3853 fatal("Unmatched low surrogate.");
3854 }
3855 prev = c;
3856 } else {
3857 // see if there was a lone high surrogate
3858 if ((prev & 0xFC00) == 0xD800) {
3859 fatal("Unmatched high surrogate.");
3860 }
3861 }
3862
3863 if (c == '\n') {
3864 nextCharOnNewLine = true;
3865 } else {
3866 if (c == '<') {
3867 /* the most common return to parseContent () ... NOP */
3868 } else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
3869 || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) && xmlVersion == XML_11)) {
3870 fatal("illegal XML character U+" + Integer.toHexString(c));
3871 } else if (c >= '\u007F' && c <= '\u009F') // 2006-04-25 hsivonen
3872 {
3873 handler.warn("Saw a control character: U+00"
3874 + Integer.toHexString(c) + ".");
3875 }
3876
3877 if (isPrivateUse(c)) {
3878 warnAboutPrivateUseChar();
3879 }
3880 // If we're in the DTD and in a context where PEs get expanded,
3881 // do so ... 1/14/2000 errata identify those contexts. There
3882 // are also spots in the internal subset where PE refs are fatal
3883 // errors, hence yet another flag.
3884 else if (c == '%' && expandPE) {
3885 if (peIsError) {
3886 fatal("PE reference within decl in internal subset.");
3887 }
3888 parsePEReference();
3889 return readCh();
3890 }
3891 }
3892
3893 return c;
3894 }
3895
3896 /**
3897 * Push a single character back onto the current input stream.
3898 * <p>
3899 * This method usually pushes the character back onto the readBuffer.
3900 * <p>
3901 * I don't think that this would ever be called with readBufferPos = 0,
3902 * because the methods always reads a character before unreading it, but
3903 * just in case, I've added a boundary condition.
3904 *
3905 * @param c
3906 * The character to push back.
3907 * @see #readCh
3908 * @see #unread (char[])
3909 * @see #readBuffer
3910 */
3911 private void unread(char c) throws SAXException {
3912 rollbackLocation();
3913 if (readBufferPos > 0) {
3914 readBuffer[--readBufferPos] = c;
3915 } else {
3916 pushString(null, new Character(c).toString());
3917 }
3918 }
3919
3920 /**
3921 *
3922 */
3923 private void rollbackLocation() {
3924 assert (column != columnPrev) || (line != linePrev);
3925 if (column == 1) {
3926 nextCharOnNewLine = true;
3927 }
3928 line = linePrev;
3929 column = columnPrev;
3930 }
3931
3932 /**
3933 * Push a char array back onto the current input stream.
3934 * <p>
3935 * NOTE: you must <em>never</em> push back characters that you haven't
3936 * actually read: use pushString () instead.
3937 *
3938 * @see #readCh
3939 * @see #unread (char)
3940 * @see #readBuffer
3941 * @see #pushString
3942 */
3943 private void unread(char[] ch, int length) throws SAXException {
3944 if (length < readBufferPos) {
3945 readBufferPos -= length;
3946 } else {
3947 pushCharArray(null, ch, 0, length);
3948 }
3949 }
3950
3951 /**
3952 * Push, or skip, a new external input source. The source will be some kind
3953 * of parsed entity, such as a PE (including the external DTD subset) or
3954 * content for the body.
3955 *
3956 * @param url
3957 * The java.net.URL object for the entity.
3958 * @see SAXDriver#resolveEntity
3959 * @see #pushString
3960 * @see #sourceType
3961 * @see #pushInput
3962 * @see #detectEncoding
3963 * @see #sourceType
3964 * @see #readBuffer
3965 */
3966 private void pushURL(boolean isPE, String ename, ExternalIdentifiers ids,
3967 Reader aReader, InputStream aStream, String aEncoding,
3968 boolean doResolve) throws SAXException, IOException {
3969 // removed boolean ignoreEncoding -- 2006-02-03 hsivonen
3970 String systemId;
3971 InputSource source;
3972 InputSource scratch = new InputSource();
3973
3974 if (!isPE) {
3975 dataBufferFlush();
3976 }
3977
3978 scratch.setPublicId(ids.publicId);
3979 scratch.setSystemId(ids.systemId);
3980
3981 // See if we should skip or substitute the entity.
3982 // If we're not skipping, resolving reports startEntity()
3983 // and updates the (handler's) stack of URIs.
3984 if (doResolve) {
3985 // assert (stream == null && reader == null && encoding == null)
3986 source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri);
3987 if (source == null) {
3988 handler.warn("skipping entity: " + ename);
3989 handler.skippedEntity(ename);
3990 if (isPE) {
3991 skippedPE = true;
3992 }
3993 return;
3994 }
3995
3996 // we might be using alternate IDs/encoding
3997 systemId = source.getSystemId();
3998 // The following warning and setting systemId was deleted bcause
3999 // the application has the option of not setting systemId
4000 // provided that it has set the characte/byte stream.
4001 /*
4002 * if (systemId == null) { handler.warn ("missing system ID, using " +
4003 * ids.systemId); systemId = ids.systemId; }
4004 */
4005 } else {
4006 // "[document]", or "[dtd]" via getExternalSubset()
4007 scratch.setCharacterStream(aReader);
4008 scratch.setByteStream(aStream);
4009 scratch.setEncoding(aEncoding);
4010 source = scratch;
4011 systemId = ids.systemId;
4012 if (handler.stringInterning) {
4013 handler.startExternalEntity(ename, systemId,
4014 "[document]" == ename);
4015 } else {
4016 handler.startExternalEntity(ename, systemId,
4017 "[document]".equals(ename));
4018 }
4019 }
4020
4021 // Push the existing status.
4022 pushInput(ename);
4023
4024 // Create a new read buffer.
4025 // (Note the four-character margin)
4026 readBuffer = new char[READ_BUFFER_MAX + 4];
4027 readBufferPos = 0;
4028 readBufferLength = 0;
4029 readBufferOverflow = -1;
4030 is = null;
4031 reader = null;
4032 line = 0;
4033 column = 1;
4034 linePrev = 0;
4035 columnPrev = 1;
4036 nextCharOnNewLine = true;
4037 currentByteCount = 0;
4038
4039 // If there's an explicit character stream, just
4040 // ignore encoding declarations.
4041 if (source.getCharacterStream() != null) {
4042 sourceType = INPUT_READER;
4043 this.reader = source.getCharacterStream();
4044 // swallow UTF-8 BOM -- 2006-02-03 hsivonen
4045 if ("UTF-8".equalsIgnoreCase(source.getEncoding())) {
4046 char bom = readCh();
4047 if (bom != '\uFEFF') {
4048 unread(bom);
4049 }
4050 }
4051 tryEncodingDecl(source.getEncoding() == null ? ""
4052 : source.getEncoding());
4053 return;
4054 }
4055
4056 // Else we handle the conversion, and need to ensure
4057 // it's done right.
4058 if (source.getByteStream() != null) {
4059 is = source.getByteStream();
4060 } else {
4061 // Stop -- 2006-11-10 hsivonen
4062 fatal("The entity resolver didn't properly resolve the entity.");
4063 }
4064
4065 // If we get to here, there must be
4066 // an InputStream available.
4067 if (!is.markSupported()) {
4068 is = new BufferedInputStream(is);
4069 }
4070
4071 // Zapped bogus external encoding label code -- 2006-11-10 hsivonen
4072
4073 // if we got an external encoding label, use it ...
4074 if (source.getEncoding() != null) {
4075 draconianInputStreamReader(source.getEncoding(), is, false);
4076 if ("UTF-8".equalsIgnoreCase(source.getEncoding())) {
4077 char bom = readCh();
4078 if (bom != '\uFEFF') {
4079 unread(bom);
4080 }
4081 }
4082 tryEncodingDecl(source.getEncoding());
4083 // ... else autodetect from first bytes.
4084 } else {
4085 detectEncoding();
4086 // Read any XML or text declaration.
4087 String enc = tryEncodingDecl(null);
4088 if (enc == null && "UTF-32" == characterEncoding) {
4089 fatal("UTF-32 was sniffed from the BOM, but there was no matching encoding declaration. The omission of explicit encoding declaration is only allowed with UTF-8 and UTF-16.");
4090 }
4091 }
4092 }
4093
4094 /**
4095 * Check for an encoding declaration. This is the second part of the XML
4096 * encoding autodetection algorithm, relying on detectEncoding to get to the
4097 * point that this part can read any encoding declaration in the document
4098 * (using only US-ASCII characters).
4099 *
4100 * <p>
4101 * Because this part starts to fill parser buffers with this data, it's
4102 * tricky to setup a reader so that Java's built-in decoders can be used for
4103 * the character encodings that aren't built in to this parser (such as
4104 * EUC-JP, KOI8-R, Big5, etc).
4105 *
4106 * @return any encoding in the declaration, uppercased; or null
4107 * @see detectEncoding
4108 */
4109 private String tryEncodingDecl(String encoding) throws SAXException,
4110 IOException {
4111 // Read the XML/text declaration.
4112 if (tryRead("<?xml")) {
4113 if (tryWhitespace()) {
4114 if (inputStack.size() > 0) {
4115 return parseTextDecl(encoding);
4116 } else {
4117 return parseXMLDecl(encoding);
4118 }
4119 } else {
4120 // <?xml-stylesheet ...?> or similar
4121 unread('l');
4122 unread('m');
4123 unread('x');
4124 unread('?');
4125 unread('<');
4126 }
4127 }
4128 // 2006-02-03 hsivonen
4129 warnAboutLackOfEncodingDecl(encoding);
4130 return null;
4131 }
4132
4133 /**
4134 * @param characterEncoding
4135 * @throws SAXException
4136 */
4137 private void warnAboutLackOfEncodingDecl(String encoding)
4138 throws SAXException {
4139 if (!(encoding == null || "".equals(encoding)
4140 || "UTF-8".equalsIgnoreCase(encoding) || "UTF-16".equalsIgnoreCase(encoding))) {
4141 handler.warn("External encoding information specified a non-UTF-8/non-UTF-16 encoding ("
4142 + encoding
4143 + "), but there was no matching internal encoding declaration. The well-formedness status of this document may change when decoupled from the external encoding information.");
4144 }
4145 }
4146
4147 /**
4148 * Attempt to detect the encoding of an entity.
4149 * <p>
4150 * The trick here (as suggested in the XML standard) is that any entity not
4151 * in UTF-8, or in UCS-2 with a byte-order mark, <b>must</b> begin with an
4152 * XML declaration or an encoding declaration; we simply have to look for
4153 * "<?xml" in various encodings.
4154 * <p>
4155 * This method has no way to distinguish among 8-bit encodings. Instead, it
4156 * sets up for UTF-8, then (possibly) revises its assumption later in
4157 * setupDecoding (). Any ASCII-derived 8-bit encoding should work, but most
4158 * will be rejected later by setupDecoding ().
4159 *
4160 * @see #tryEncoding (byte[], byte, byte, byte, byte)
4161 * @see #tryEncoding (byte[], byte, byte)
4162 * @see #setupDecoding
4163 */
4164 private void detectEncoding() throws SAXException, IOException {
4165 byte[] signature = new byte[4];
4166
4167 // Read the first four bytes for
4168 // autodetection.
4169 is.mark(4);
4170 is.read(signature);
4171 is.reset();
4172
4173 //
4174 // FIRST: four byte encodings (who uses these?)
4175 //
4176 if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, (byte) 0x00,
4177 (byte) 0x3c)) {
4178 // UCS-4 must begin with "<?xml"
4179 // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
4180 // "UTF-32BE"
4181 draconianInputStreamReader("UTF-32BE", is, false);
4182 } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4183 (byte) 0x00, (byte) 0x00)) {
4184 // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
4185 // "UTF-32LE"
4186 draconianInputStreamReader("UTF-32LE", is, false);
4187 } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4188 (byte) 0x3c, (byte) 0x00)) {
4189 // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
4190 fatal("Unsupported 32-bit encoding. (XML processors are only required to support UTF-8 and UTF-16.)"); // 2006-02-03
4191 // hsivonen
4192 } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4193 (byte) 0x00, (byte) 0x00)) {
4194 // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
4195 fatal("Unsupported 32-bit encoding. (XML processors are only required to support UTF-8 and UTF-16.)"); // 2006-02-03
4196 // hsivonen
4197 } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4198 (byte) 0xfe, (byte) 0xff)) {
4199 // 00 00 fe ff UCS_4_1234 (with BOM)
4200 is.read();
4201 is.read();
4202 is.read();
4203 is.read();
4204 draconianInputStreamReader("UTF-32BE", is, false, "UTF-32");
4205 } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4206 (byte) 0x00, (byte) 0x00)) {
4207 // ff fe 00 00 UCS_4_4321 (with BOM)
4208 is.read();
4209 is.read();
4210 is.read();
4211 is.read();
4212 draconianInputStreamReader("UTF-32LE", is, false, "UTF-32");
4213 }
4214 // SECOND: two byte encodings
4215 // note ... with 1/14/2000 errata the XML spec identifies some
4216 // more "broken UTF-16" autodetection cases, with no XML decl,
4217 // which we don't handle here (that's legal too).
4218 //
4219 else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff)) {
4220 // UCS-2 with a byte-order marker. (UTF-16)
4221 // 0xfe 0xff: UCS-2, big-endian (12)
4222 is.read();
4223 is.read();
4224 draconianInputStreamReader("UTF-16BE", is, false, "UTF-16");
4225 } else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe)) {
4226 // UCS-2 with a byte-order marker. (UTF-16)
4227 // 0xff 0xfe: UCS-2, little-endian (21)
4228 is.read();
4229 is.read();
4230 draconianInputStreamReader("UTF-16LE", is, false, "UTF-16");
4231 } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4232 (byte) 0x00, (byte) 0x3f)) {
4233 // UTF-16BE (otherwise, malformed UTF-16)
4234 // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
4235 fatal("no byte-order mark for UTF-16 entity"); // s/UCS-2/UTF-16/
4236 // -- 2006-02-03
4237 // hsivonen
4238 } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4239 (byte) 0x3f, (byte) 0x00)) {
4240 // UTF-16LE (otherwise, malformed UTF-16)
4241 // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
4242 fatal("no byte-order mark for UTF-16 entity"); // s/UCS-2/UTF-16/
4243 // -- 2006-02-03
4244 // hsivonen
4245 }
4246 //
4247 // THIRD: EBCDIC
4248 //
4249 else if (tryEncoding(signature, (byte) 0x4c, (byte) 0x6f, (byte) 0xa7,
4250 (byte) 0x94)) {
4251 // 4c 6f a7 94 ... we don't understand EBCDIC flavors
4252 fatal("Unsupported EBCDIC encoding. (XML processors are only required to support UTF-8 and UTF-16.)");
4253 }
4254 //
4255 // FOURTH: ASCII-derived encodings, fixed and variable lengths
4256 //
4257 else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f, (byte) 0x78,
4258 (byte) 0x6d)) {
4259 // ASCII derived
4260 // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
4261 characterEncoding = null;
4262 prefetchASCIIEncodingDecl();
4263 } else if (signature[0] == (byte) 0xef && signature[1] == (byte) 0xbb
4264 && signature[2] == (byte) 0xbf) {
4265 // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text)
4266 // this un-needed notion slipped into XML 2nd ed through a
4267 // "non-normative" erratum; now required by MSFT and UDDI,
4268 // and E22 made it normative.
4269 is.read();
4270 is.read();
4271 is.read();
4272 draconianInputStreamReader("UTF-8", is, false);
4273 } else {
4274 // (default) UTF-8 without encoding/XML declaration
4275 draconianInputStreamReader("UTF-8", is, false);
4276 }
4277 }
4278
4279 /**
4280 * Check for a four-byte signature.
4281 * <p>
4282 * Utility routine for detectEncoding ().
4283 * <p>
4284 * Always looks for some part of "<?XML" in a specific encoding.
4285 *
4286 * @param sig
4287 * The first four bytes read.
4288 * @param b1
4289 * The first byte of the signature
4290 * @param b2
4291 * The second byte of the signature
4292 * @param b3
4293 * The third byte of the signature
4294 * @param b4
4295 * The fourth byte of the signature
4296 * @see #detectEncoding
4297 */
4298 private static boolean tryEncoding(byte[] sig, byte b1, byte b2, byte b3,
4299 byte b4) {
4300 return (sig[0] == b1 && sig[1] == b2 && sig[2] == b3 && sig[3] == b4);
4301 }
4302
4303 /**
4304 * Check for a two-byte signature.
4305 * <p>
4306 * Looks for a UCS-2 byte-order mark.
4307 * <p>
4308 * Utility routine for detectEncoding ().
4309 *
4310 * @param sig
4311 * The first four bytes read.
4312 * @param b1
4313 * The first byte of the signature
4314 * @param b2
4315 * The second byte of the signature
4316 * @see #detectEncoding
4317 */
4318 private static boolean tryEncoding(byte[] sig, byte b1, byte b2) {
4319 return ((sig[0] == b1) && (sig[1] == b2));
4320 }
4321
4322 /**
4323 * This method pushes a string back onto input.
4324 * <p>
4325 * It is useful either as the expansion of an internal entity, or for
4326 * backtracking during the parse.
4327 * <p>
4328 * Call pushCharArray () to do the actual work.
4329 *
4330 * @param s
4331 * The string to push back onto input.
4332 * @see #pushCharArray
4333 */
4334 private void pushString(String ename, String s) throws SAXException {
4335 char[] ch = s.toCharArray();
4336 pushCharArray(ename, ch, 0, ch.length);
4337 }
4338
4339 /**
4340 * Push a new internal input source.
4341 * <p>
4342 * This method is useful for expanding an internal entity, or for unreading
4343 * a string of characters. It creates a new readBuffer containing the
4344 * characters in the array, instead of characters converted from an input
4345 * byte stream.
4346 *
4347 * @param ch
4348 * The char array to push.
4349 * @see #pushString
4350 * @see #pushURL
4351 * @see #readBuffer
4352 * @see #sourceType
4353 * @see #pushInput
4354 */
4355 private void pushCharArray(String ename, char[] ch, int start, int length)
4356 throws SAXException {
4357 // Push the existing status
4358 pushInput(ename);
4359 if (ename != null && doReport) {
4360 dataBufferFlush();
4361 handler.startInternalEntity(ename);
4362 }
4363 sourceType = INPUT_INTERNAL;
4364 readBuffer = ch;
4365 readBufferPos = start;
4366 readBufferLength = length;
4367 readBufferOverflow = -1;
4368 }
4369
4370 /**
4371 * Save the current input source onto the stack.
4372 * <p>
4373 * This method saves all of the global variables associated with the current
4374 * input source, so that they can be restored when a new input source has
4375 * finished. It also tests for entity recursion.
4376 * <p>
4377 * The method saves the following global variables onto a stack using a
4378 * fixed-length array:
4379 * <ol>
4380 * <li>sourceType
4381 * <li>externalEntity
4382 * <li>readBuffer
4383 * <li>readBufferPos
4384 * <li>readBufferLength
4385 * <li>line
4386 * <li>characterEncoding
4387 * </ol>
4388 *
4389 * @param ename
4390 * The name of the entity (if any) causing the new input.
4391 * @see #popInput
4392 * @see #sourceType
4393 * @see #externalEntity
4394 * @see #readBuffer
4395 * @see #readBufferPos
4396 * @see #readBufferLength
4397 * @see #line
4398 * @see #characterEncoding
4399 */
4400 private void pushInput(String ename) throws SAXException {
4401 // Protect against billion laughs -- 2006-12-28 hsivonen
4402 if (entityStack.size() > 16) {
4403 fatal("Entity recursion too deep. Stopping to protect against denial of service attacks.");
4404 }
4405
4406 // Check for entity recursion.
4407 if (ename != null) {
4408 Iterator<String> entities = entityStack.iterator();
4409 while (entities.hasNext()) {
4410 String e = entities.next();
4411 if (e != null && e == ename) {
4412 fatal("recursive reference to entity", ename, null);
4413 }
4414 }
4415 }
4416 entityStack.addLast(ename);
4417
4418 // Don't bother if there is no current input.
4419 if (sourceType == INPUT_NONE) {
4420 return;
4421 }
4422
4423 // Set up a snapshot of the current
4424 // input source.
4425 Input input = new Input();
4426
4427 input.sourceType = sourceType;
4428 input.readBuffer = readBuffer;
4429 input.readBufferPos = readBufferPos;
4430 input.readBufferLength = readBufferLength;
4431 input.line = line;
4432 input.linePrev = linePrev;
4433 input.charecterEncoding = characterEncoding;
4434 input.readBufferOverflow = readBufferOverflow;
4435 input.is = is;
4436 input.currentByteCount = currentByteCount;
4437 input.column = column;
4438 input.columnPrev = columnPrev;
4439 input.nextCharOnNewLine = nextCharOnNewLine;
4440 input.reader = reader;
4441 input.prev = prev;
4442 input.normalizationChecker = normalizationChecker;
4443 input.characterHandler = characterHandler;
4444 characterHandler = null;
4445
4446 // Push it onto the stack.
4447 inputStack.addLast(input);
4448 }
4449
4450 /**
4451 * Restore a previous input source.
4452 * <p>
4453 * This method restores all of the global variables associated with the
4454 * current input source.
4455 *
4456 * @exception java.io.EOFException
4457 * If there are no more entries on the input stack.
4458 * @see #pushInput
4459 * @see #sourceType
4460 * @see #readBuffer
4461 * @see #readBufferPos
4462 * @see #readBufferLength
4463 * @see #line
4464 * @see #characterEncoding
4465 */
4466 private void popInput() throws SAXException, IOException {
4467 String ename = entityStack.removeLast();
4468
4469 if (ename != null && doReport) {
4470 dataBufferFlush();
4471 }
4472 switch (sourceType) {
4473 case INPUT_READER:
4474 handler.endExternalEntity(ename);
4475 reader.close();
4476 break;
4477 case INPUT_INTERNAL:
4478 if (ename != null && doReport) {
4479 handler.endInternalEntity(ename);
4480 }
4481 break;
4482 }
4483 if (characterHandler != null) {
4484 characterHandler.end();
4485 }
4486 if (normalizationChecker != null) {
4487 normalizationChecker.end();
4488 }
4489
4490 // Throw an EOFException if there
4491 // is nothing else to pop.
4492 if (inputStack.isEmpty()) {
4493 throw new EOFException("no more input");
4494 }
4495
4496 Input input = inputStack.removeLast();
4497
4498 sourceType = input.sourceType;
4499 readBuffer = input.readBuffer;
4500 readBufferPos = input.readBufferPos;
4501 readBufferLength = input.readBufferLength;
4502 line = input.line;
4503 linePrev = input.linePrev;
4504 characterEncoding = input.charecterEncoding;
4505 readBufferOverflow = input.readBufferOverflow;
4506 is = input.is;
4507 currentByteCount = input.currentByteCount;
4508 column = input.column;
4509 columnPrev = input.columnPrev;
4510 nextCharOnNewLine = input.nextCharOnNewLine;
4511 reader = input.reader;
4512 prev = input.prev;
4513 normalizationChecker = input.normalizationChecker;
4514 characterHandler = input.characterHandler;
4515 }
4516
4517 /**
4518 * Return true if we can read the expected character.
4519 * <p>
4520 * Note that the character will be removed from the input stream on success,
4521 * but will be put back on failure. Do not attempt to read the character
4522 * again if the method succeeds.
4523 *
4524 * @param delim
4525 * The character that should appear next. For a insensitive
4526 * match, you must supply this in upper-case.
4527 * @return true if the character was successfully read, or false if it was
4528 * not.
4529 * @see #tryRead (String)
4530 */
4531 private boolean tryRead(char delim) throws SAXException, IOException {
4532 char c;
4533
4534 // Read the character
4535 c = readCh();
4536
4537 // Test for a match, and push the character
4538 // back if the match fails.
4539 if (c == delim) {
4540 return true;
4541 } else {
4542 unread(c);
4543 return false;
4544 }
4545 }
4546
4547 /**
4548 * Return true if we can read the expected string.
4549 * <p>
4550 * This is simply a convenience method.
4551 * <p>
4552 * Note that the string will be removed from the input stream on success,
4553 * but will be put back on failure. Do not attempt to read the string again
4554 * if the method succeeds.
4555 * <p>
4556 * This method will push back a character rather than an array whenever
4557 * possible (probably the majority of cases).
4558 *
4559 * @param delim
4560 * The string that should appear next.
4561 * @return true if the string was successfully read, or false if it was not.
4562 * @see #tryRead (char)
4563 */
4564 private boolean tryRead(String delim) throws SAXException, IOException {
4565 return tryRead(delim.toCharArray());
4566 }
4567
4568 private boolean tryRead(char[] ch) throws SAXException, IOException {
4569 char c;
4570
4571 // Compare the input, character-
4572 // by character.
4573 int saveLine = line;
4574 int saveColumn = column;
4575 int saveLinePrev = linePrev;
4576 int saveColumnPrev = columnPrev;
4577 boolean saveNextCharOnNewLine = nextCharOnNewLine;
4578
4579 for (int i = 0; i < ch.length; i++) {
4580 c = readCh();
4581 if (c != ch[i]) {
4582 unread(c);
4583 if (i != 0) {
4584 unread(ch, i);
4585 }
4586 line = saveLine;
4587 column = saveColumn;
4588 linePrev = saveLinePrev;
4589 columnPrev = saveColumnPrev;
4590 nextCharOnNewLine = saveNextCharOnNewLine;
4591 return false;
4592 }
4593 }
4594 return true;
4595 }
4596
4597 /**
4598 * Return true if we can read some whitespace.
4599 * <p>
4600 * This is simply a convenience method.
4601 * <p>
4602 * This method will push back a character rather than an array whenever
4603 * possible (probably the majority of cases).
4604 *
4605 * @return true if whitespace was found.
4606 */
4607 private boolean tryWhitespace() throws SAXException, IOException {
4608 char c;
4609 c = readCh();
4610 if (isWhitespace(c)) {
4611 skipWhitespace();
4612 return true;
4613 } else {
4614 unread(c);
4615 return false;
4616 }
4617 }
4618
4619 private void parseUntil(char[] delim) throws SAXException, IOException {
4620 char c;
4621 int startLine = line;
4622
4623 try {
4624 while (!tryRead(delim)) {
4625 c = readCh();
4626 dataBufferAppend(c);
4627 }
4628 } catch (EOFException e) {
4629 fatal("end of input while looking for delimiter "
4630 + "(started on line " + startLine + ')', null, new String(
4631 delim));
4632 }
4633 }
4634
4635 // ////////////////////////////////////////////////////////////////////
4636 // Low-level I/O.
4637 // ////////////////////////////////////////////////////////////////////
4638
4639 /**
4640 * Prefetch US-ASCII XML/text decl from input stream into read buffer.
4641 * Doesn't buffer more than absolutely needed, so that when an encoding decl
4642 * says we need to create an InputStreamReader, we can discard our buffer
4643 * and reset(). Caller knows the first chars of the decl exist in the input
4644 * stream.
4645 */
4646 private void prefetchASCIIEncodingDecl() throws SAXException, IOException {
4647 int ch;
4648 readBufferPos = readBufferLength = 0;
4649
4650 is.mark(readBuffer.length);
4651 while (true) {
4652 ch = is.read();
4653 readBuffer[readBufferLength++] = (char) ch;
4654 switch (ch) {
4655 case (int) '>':
4656 return;
4657 case -1:
4658 fatal(
4659 "file ends before end of XML or encoding declaration.",
4660 null, "?>");
4661 }
4662 if (readBuffer.length == readBufferLength) {
4663 fatal("unfinished XML or encoding declaration");
4664 }
4665 }
4666 }
4667
4668 /**
4669 * Read a chunk of data from an external input source.
4670 * <p>This is simply a front-end that fills the rawReadBuffer
4671 * with bytes, then calls the appropriate encoding handler.
4672 * @see #characterEncoding
4673 * @see #rawReadBuffer
4674 * @see #readBuffer
4675 * @see #filterCR
4676 * @see #copyUtf8ReadBuffer
4677 * @see #copyIso8859_1ReadBuffer
4678 * @see #copyUcs_2ReadBuffer
4679 * @see #copyUcs_4ReadBuffer
4680 */
4681 private void readDataChunk() throws SAXException, IOException {
4682 int count;
4683
4684 // See if we have any overflow (filterCR sets for CR at end)
4685 if (readBufferOverflow > -1) {
4686 readBuffer[0] = (char) readBufferOverflow;
4687 readBufferOverflow = -1;
4688 readBufferPos = 1;
4689 sawCR = true;
4690 } else {
4691 readBufferPos = 0;
4692 sawCR = false;
4693 }
4694
4695 try {
4696 count = reader.read(readBuffer, readBufferPos, READ_BUFFER_MAX
4697 - readBufferPos);
4698 } catch (CharacterCodingException cce) {
4699 // 2006-04-25 hsivonen
4700 fatal("Input data does not conform to the input encoding. The input encoding was "
4701 + characterEncoding + ".");
4702 return; // never happens
4703 }
4704 if (characterHandler != null && count > 0) {
4705 characterHandler.characters(readBuffer, readBufferPos, count);
4706 }
4707 if (normalizationChecker != null && count > 0) {
4708 normalizationChecker.characters(readBuffer, readBufferPos, count);
4709 }
4710 if (count < 0) {
4711 readBufferLength = readBufferPos;
4712 } else {
4713 readBufferLength = readBufferPos + count;
4714 }
4715 if (readBufferLength > 0) {
4716 filterCR(count >= 0);
4717 }
4718 sawCR = false;
4719 }
4720
4721 /**
4722 * Filter carriage returns in the read buffer.
4723 * CRLF becomes LF; CR becomes LF.
4724 * @param moreData true iff more data might come from the same source
4725 * @see #readDataChunk
4726 * @see #readBuffer
4727 * @see #readBufferOverflow
4728 */
4729 private void filterCR(boolean moreData) {
4730 int i, j;
4731
4732 readBufferOverflow = -1;
4733
4734 loop: for (i = j = readBufferPos; j < readBufferLength; i++, j++) {
4735 switch (readBuffer[j]) {
4736 case '\r':
4737 if (j == readBufferLength - 1) {
4738 if (moreData) {
4739 readBufferOverflow = '\r';
4740 readBufferLength--;
4741 } else // CR at end of buffer
4742 {
4743 readBuffer[i++] = '\n';
4744 }
4745 break loop;
4746 } else if (readBuffer[j + 1] == '\n') {
4747 j++;
4748 }
4749 readBuffer[i] = '\n';
4750 break;
4751
4752 case '\n':
4753 default:
4754 readBuffer[i] = readBuffer[j];
4755 break;
4756 }
4757 }
4758 readBufferLength = i;
4759 }
4760
4761 private void warnAboutPrivateUseChar() throws SAXException {
4762 if (!alreadyWarnedAboutPrivateUseCharacters) {
4763 handler.warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. (Charmod C073)");
4764 alreadyWarnedAboutPrivateUseCharacters = true;
4765 }
4766 }
4767
4768 // copied from fi.iki.hsivonen.htmlparser
4769
4770 private boolean isPrivateUse(char c) {
4771 return c >= '\uE000' && c <= '\uF8FF';
4772 }
4773
4774 private boolean isPrivateUse(int c) {
4775 return (c >= 0xE000 && c <= 0xF8FF) || (c >= 0xF0000 && c <= 0xFFFFD)
4776 || (c >= 0x100000 && c <= 0x10FFFD);
4777 }
4778
4779 private boolean isAstralPrivateUse(int c) {
4780 return (c >= 0xF0000 && c <= 0xFFFFD)
4781 || (c >= 0x100000 && c <= 0x10FFFD);
4782 }
4783
4784 private boolean isNonCharacter(int c) {
4785 return (c & 0xFFFE) == 0xFFFE;
4786 }
4787
4788 //////////////////////////////////////////////////////////////////////
4789 // Local Variables.
4790 //////////////////////////////////////////////////////////////////////
4791
4792 /**
4793 * Re-initialize the variables for each parse.
4794 * @throws SAXException
4795 */
4796 private void initializeVariables() throws SAXException {
4797 prev = '\u0000';
4798 // First line
4799 line = 0;
4800 column = 1;
4801 linePrev = 0;
4802 columnPrev = 1;
4803 nextCharOnNewLine = true;
4804
4805 // Set up the buffers for data and names
4806 dataBufferPos = 0;
4807 dataBuffer = new char[DATA_BUFFER_INITIAL];
4808 nameBufferPos = 0;
4809 nameBuffer = new char[NAME_BUFFER_INITIAL];
4810
4811 // Set up the DTD hash tables
4812 elementInfo = new HashMap<String, ElementDecl>();
4813 entityInfo = new HashMap<String, EntityInfo>();
4814 notationInfo = new HashMap<String, String>();
4815 skippedPE = false;
4816
4817 // Set up the variables for the current
4818 // element context.
4819 currentElement = null;
4820 currentElementContent = CONTENT_UNDECLARED;
4821
4822 // Set up the input variables
4823 sourceType = INPUT_NONE;
4824 inputStack = new LinkedList<Input>();
4825 entityStack = new LinkedList<String>();
4826 tagAttributePos = 0;
4827 tagAttributes = new String[100];
4828 rawReadBuffer = new byte[READ_BUFFER_MAX];
4829 readBufferOverflow = -1;
4830
4831 inLiteral = false;
4832 expandPE = false;
4833 peIsError = false;
4834
4835 doReport = false;
4836
4837 inCDATA = false;
4838
4839 symbolTable = new Object[SYMBOL_TABLE_LENGTH][];
4840
4841 if (handler.checkNormalization) {
4842 normalizationChecker = new NormalizationChecker(handler);
4843 normalizationChecker.setErrorHandler(handler.getErrorHandler());
4844 normalizationChecker.start();
4845 } else {
4846 normalizationChecker = null;
4847 }
4848 if (handler.characterHandler != null) {
4849 characterHandler = handler.characterHandler;
4850 handler.characterHandler = null;
4851 characterHandler.start();
4852 } else {
4853 characterHandler = null;
4854 }
4855 }
4856
4857 static class ExternalIdentifiers {
4858
4859 String publicId;
4860
4861 String systemId;
4862
4863 String baseUri;
4864
4865 ExternalIdentifiers() {
4866 }
4867
4868 ExternalIdentifiers(String publicId, String systemId, String baseUri) {
4869 this.publicId = publicId;
4870 this.systemId = systemId;
4871 this.baseUri = baseUri;
4872 }
4873
4874 }
4875
4876 static class EntityInfo {
4877
4878 int type;
4879
4880 ExternalIdentifiers ids;
4881
4882 String value;
4883
4884 String notationName;
4885
4886 }
4887
4888 static class AttributeDecl {
4889
4890 String type;
4891
4892 String value;
4893
4894 int valueType;
4895
4896 String enumeration;
4897
4898 String defaultValue;
4899
4900 }
4901
4902 static class ElementDecl {
4903
4904 int contentType;
4905
4906 String contentModel;
4907
4908 HashMap<String, AttributeDecl> attributes;
4909
4910 }
4911
4912 static class Input {
4913 CharacterHandler characterHandler;
4914
4915 boolean nextCharOnNewLine;
4916
4917 int columnPrev;
4918
4919 int linePrev;
4920
4921 char prev;
4922
4923 int sourceType;
4924
4925 char[] readBuffer;
4926
4927 int readBufferPos;
4928
4929 int readBufferLength;
4930
4931 int line;
4932
4933 String charecterEncoding;
4934
4935 int readBufferOverflow;
4936
4937 InputStream is;
4938
4939 int currentByteCount;
4940
4941 int column;
4942
4943 Reader reader;
4944
4945 NormalizationChecker normalizationChecker;
4946 }
4947
4948 }