001    /* XmlParser.java -- 
002     Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
003     Portions Copyright 2006 Henri Sivonen.
004    
005     This file is part of GNU JAXP.
006    
007     GNU JAXP is free software; you can redistribute it and/or modify
008     it under the terms of the GNU General Public License as published by
009     the Free Software Foundation; either version 2, or (at your option)
010     any later version.
011    
012     GNU JAXP is distributed in the hope that it will be useful, but
013     WITHOUT ANY WARRANTY; without even the implied warranty of
014     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
015     General Public License for more details.
016    
017     You should have received a copy of the GNU General Public License
018     along with GNU JAXP; see the file COPYING.  If not, write to the
019     Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
020     02111-1307 USA.
021    
022     Linking this library statically or dynamically with other modules is
023     making a combined work based on this library.  Thus, the terms and
024     conditions of the GNU General Public License cover the whole
025     combination.
026    
027     As a special exception, the copyright holders of this library give you
028     permission to link this library with independent modules to produce an
029     executable, regardless of the license terms of these independent
030     modules, and to copy and distribute the resulting executable under
031     terms of your choice, provided that you also meet, for each linked
032     independent module, the terms and conditions of the license of that
033     module.  An independent module is a module which is not derived from
034     or based on this library.  If you modify this library, you may extend
035     this exception to your version of the library, but you are not
036     obligated to do so.  If you do not wish to do so, delete this
037     exception statement from your version.
038    
039     Partly derived from code which carried the following notice:
040    
041     Copyright (c) 1997, 1998 by Microstar Software Ltd.
042    
043     AElfred is free for both commercial and non-commercial use and
044     redistribution, provided that Microstar's copyright and disclaimer are
045     retained intact.  You are free to modify AElfred for your own use and
046     to redistribute AElfred with your modifications, provided that the
047     modifications are clearly documented.
048    
049     This program is distributed in the hope that it will be useful, but
050     WITHOUT ANY WARRANTY; without even the implied warranty of
051     merchantability or fitness for a particular purpose.  Please use it AT
052     YOUR OWN RISK.
053     */
054    
055    package nu.validator.gnu.xml.aelfred2;
056    
057    import java.io.BufferedInputStream;
058    import java.io.EOFException;
059    import java.io.IOException;
060    import java.io.InputStream;
061    import java.io.InputStreamReader;
062    import java.io.Reader;
063    import java.nio.charset.CharacterCodingException;
064    import java.nio.charset.Charset;
065    import java.nio.charset.CharsetDecoder;
066    import java.nio.charset.CodingErrorAction;
067    import java.nio.charset.IllegalCharsetNameException;
068    import java.nio.charset.UnsupportedCharsetException;
069    import java.util.HashMap;
070    import java.util.Iterator;
071    import java.util.LinkedList;
072    
073    import nu.validator.htmlparser.impl.CharacterHandler;
074    import nu.validator.htmlparser.impl.NormalizationChecker;
075    import nu.validator.io.EncodingInfo;
076    
077    import org.xml.sax.InputSource;
078    import org.xml.sax.SAXException;
079    
080    // Organized imports -- 2005-08-20 hsivonen
081    
082    /**
083     * Parse XML documents and return parse events through call-backs. Use the
084     * <code>SAXDriver</code> class as your entry point, as all internal parser
085     * interfaces are subject to change.
086     * 
087     * @author Written by David Megginson &lt;dmeggins@microstar.com&gt; (version
088     *         1.2a with bugfixes)
089     * @author Updated by David Brownell &lt;dbrownell@users.sourceforge.net&gt;
090     * @author Modified by Henri Sivonen &lt;hsivonen@iki.fi&gt;
091     * @see SAXDriver
092     */
093    final class XmlParser {
094    
095        // avoid slow per-character readCh()
096        private final static boolean USE_CHEATS = false;
097    
098        // //////////////////////////////////////////////////////////////////////
099        // Constants.
100        // //////////////////////////////////////////////////////////////////////
101    
102        private static final int SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;
103    
104        //
105        // Constants for element content type.
106        //
107    
108        /**
109         * Constant: an element has not been declared.
110         * 
111         * @see #getElementContentType
112         */
113        public final static int CONTENT_UNDECLARED = 0;
114    
115        /**
116         * Constant: the element has a content model of ANY.
117         * 
118         * @see #getElementContentType
119         */
120        public final static int CONTENT_ANY = 1;
121    
122        /**
123         * Constant: the element has declared content of EMPTY.
124         * 
125         * @see #getElementContentType
126         */
127        public final static int CONTENT_EMPTY = 2;
128    
129        /**
130         * Constant: the element has mixed content.
131         * 
132         * @see #getElementContentType
133         */
134        public final static int CONTENT_MIXED = 3;
135    
136        /**
137         * Constant: the element has element content.
138         * 
139         * @see #getElementContentType
140         */
141        public final static int CONTENT_ELEMENTS = 4;
142    
143        //
144        // Constants for the entity type.
145        //
146    
147        /**
148         * Constant: the entity has not been declared.
149         * 
150         * @see #getEntityType
151         */
152        public final static int ENTITY_UNDECLARED = 0;
153    
154        /**
155         * Constant: the entity is internal.
156         * 
157         * @see #getEntityType
158         */
159        public final static int ENTITY_INTERNAL = 1;
160    
161        /**
162         * Constant: the entity is external, non-parsable data.
163         * 
164         * @see #getEntityType
165         */
166        public final static int ENTITY_NDATA = 2;
167    
168        /**
169         * Constant: the entity is external XML data.
170         * 
171         * @see #getEntityType
172         */
173        public final static int ENTITY_TEXT = 3;
174    
175        //
176        // Attribute type constants are interned literal strings.
177        //
178    
179        //
180        // Constants for attribute default value.
181        //
182    
183        /**
184         * Constant: the attribute is not declared.
185         * 
186         * @see #getAttributeDefaultValueType
187         */
188        public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
189    
190        /**
191         * Constant: the attribute has a literal default value specified.
192         * 
193         * @see #getAttributeDefaultValueType
194         * @see #getAttributeDefaultValue
195         */
196        public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
197    
198        /**
199         * Constant: the attribute was declared #IMPLIED.
200         * 
201         * @see #getAttributeDefaultValueType
202         */
203        public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
204    
205        /**
206         * Constant: the attribute was declared #REQUIRED.
207         * 
208         * @see #getAttributeDefaultValueType
209         */
210        public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
211    
212        /**
213         * Constant: the attribute was declared #FIXED.
214         * 
215         * @see #getAttributeDefaultValueType
216         * @see #getAttributeDefaultValue
217         */
218        public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
219    
220        //
221        // Constants for input.
222        //
223        private final static int INPUT_NONE = 0;
224    
225        private final static int INPUT_INTERNAL = 1;
226    
227        private final static int INPUT_READER = 5;
228    
229        //
230        // Flags for reading literals.
231        //
232        // expand general entity refs (attribute values in dtd and content)
233        private final static int LIT_ENTITY_REF = 2;
234    
235        // normalize this value (space chars) (attributes, public ids)
236        private final static int LIT_NORMALIZE = 4;
237    
238        // literal is an attribute value
239        private final static int LIT_ATTRIBUTE = 8;
240    
241        // don't expand parameter entities
242        private final static int LIT_DISABLE_PE = 16;
243    
244        // don't expand [or parse] character refs
245        private final static int LIT_DISABLE_CREF = 32;
246    
247        // don't parse general entity refs
248        private final static int LIT_DISABLE_EREF = 64;
249    
250        // literal is a public ID value
251        private final static int LIT_PUBID = 256;
252    
253        //
254        // Flags affecting PE handling in DTDs (if expandPE is true).
255        // PEs expand with space padding, except inside literals.
256        //
257        private final static int CONTEXT_NORMAL = 0;
258    
259        private final static int CONTEXT_LITERAL = 1;
260    
261        // Emit warnings for relative URIs with no base URI.
262        static boolean uriWarnings;
263        static {
264            String key = "gnu.xml.aelfred2.XmlParser.uriWarnings";
265            try {
266                uriWarnings = "true".equals(System.getProperty(key));
267            } catch (SecurityException e) {
268                uriWarnings = false;
269            }
270        }
271    
272        //
273        // The current XML handler interface.
274        //
275        private SAXDriver handler;
276    
277        //
278        // I/O information.
279        //
280        private Reader reader; // current reader
281    
282        private InputStream is; // current input stream
283    
284        private int line; // current line number
285    
286        private int linePrev; // the line of the previous character -- hsivonen
287                                // 2007-09-28
288    
289        private int column; // current column number
290    
291        private int columnPrev; // the column of the previous character -- hsivonen
292                                // 2007-09-28
293    
294        private boolean nextCharOnNewLine; // indicates whether the next character
295                                            // is on the next line -- hsivonen
296                                            // 2007-09-28
297        
298        private int sourceType; // type of input source
299    
300        private LinkedList<Input> inputStack; // stack of input soruces
301    
302        private String characterEncoding; // current character encoding
303    
304        private int currentByteCount; // bytes read from current source
305    
306        private InputSource scratch; // temporary
307    
308        //
309        // Buffers for decoded but unparsed character input.
310        //
311        private char[] readBuffer;
312    
313        private int readBufferPos;
314    
315        private int readBufferLength;
316    
317        private int readBufferOverflow; // overflow from last data chunk.
318    
319        //
320        // Buffer for undecoded raw byte input.
321        //
322        private final static int READ_BUFFER_MAX = 16384;
323    
324        private byte[] rawReadBuffer;
325    
326        //
327        // Buffer for attribute values, char refs, DTD stuff.
328        //
329        private static int DATA_BUFFER_INITIAL = 4096;
330    
331        private char[] dataBuffer;
332    
333        private int dataBufferPos;
334    
335        //
336        // Buffer for parsed names.
337        //
338        private static int NAME_BUFFER_INITIAL = 1024;
339    
340        private char[] nameBuffer;
341    
342        private int nameBufferPos;
343    
344        //
345        // Save any standalone flag
346        //
347        private boolean docIsStandalone;
348    
349        //
350        // Hashtables for DTD information on elements, entities, and notations.
351        // Populated until we start ignoring decls (because of skipping a PE)
352        //
353        private HashMap<String, ElementDecl> elementInfo;
354    
355        private HashMap<String, EntityInfo> entityInfo;
356    
357        private HashMap<String, String> notationInfo;
358    
359        private boolean skippedPE;
360    
361        //
362        // Element type currently in force.
363        //
364        private String currentElement;
365    
366        private int currentElementContent;
367    
368        //
369        // Stack of entity names, to detect recursion.
370        //
371        private LinkedList<String> entityStack;
372    
373        //
374        // PE expansion is enabled in most chunks of the DTD, not all.
375        // When it's enabled, literals are treated differently.
376        //
377        private boolean inLiteral;
378    
379        private boolean expandPE;
380    
381        private boolean peIsError;
382    
383        //
384        // can't report entity expansion inside two constructs:
385        // - attribute expansions (internal entities only)
386        // - markup declarations (parameter entities only)
387        //
388        private boolean doReport;
389    
390        //
391        // Symbol table, for caching interned names.
392        //
393        // These show up wherever XML names or nmtokens are used: naming elements,
394        // attributes, PIs, notations, entities, and enumerated attribute values.
395        //
396        // NOTE: This hashtable doesn't grow. The default size is intended to be
397        // rather large for most documents. Example: one snapshot of the DocBook
398        // XML 4.1 DTD used only about 350 such names. As a rule, only pathological
399        // documents (ones that don't reuse names) should ever see much collision.
400        //
401        // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing.
402        // "2039" keeps the hash table size at about two memory pages on typical
403        // 32 bit hardware.
404        //
405        private final static int SYMBOL_TABLE_LENGTH = 2039;
406    
407        private Object[][] symbolTable;
408    
409        //
410        // Hash table of attributes found in current start tag.
411        //
412        private String[] tagAttributes;
413    
414        private int tagAttributePos;
415    
416        //
417        // Utility flag: have we noticed a CR while reading the last
418        // data chunk? If so, we will have to go back and normalise
419        // CR or CR/LF line ends.
420        //
421        private boolean sawCR;
422    
423        //
424        // Utility flag: are we in CDATA? If so, whitespace isn't ignorable.
425        // 
426        private boolean inCDATA;
427    
428        //
429        // Xml version.
430        //  
431        private static final int XML_10 = 0;
432    
433        private static final int XML_11 = 1;
434    
435        private int xmlVersion = XML_10;
436    
437        //
438        // Normalization checking
439        //
440    
441        private NormalizationChecker normalizationChecker;
442        
443        private CharacterHandler characterHandler;
444    
445        // ////////////////////////////////////////////////////////////////////
446        // Constructors.
447        // //////////////////////////////////////////////////////////////////////
448    
449        /**
450         * Construct a new parser with no associated handler.
451         * 
452         * @see #setHandler
453         * @see #parse
454         */
455        // package private
456        XmlParser() {
457        }
458    
459        /**
460         * Set the handler that will receive parsing events.
461         * 
462         * @param handler
463         *            The handler to receive callback events.
464         * @see #parse
465         */
466        // package private
467        void setHandler(SAXDriver handler) {
468            this.handler = handler;
469        }
470    
471        /**
472         * Parse an XML document from the character stream, byte stream, or URI that
473         * you provide (in that order of preference). Any URI that you supply will
474         * become the base URI for resolving relative URI, and may be used to
475         * acquire a reader or byte stream.
476         * 
477         * <p>
478         * Only one thread at a time may use this parser; since it is private to
479         * this package, post-parse cleanup is done by the caller, which MUST NOT
480         * REUSE the parser (just null it).
481         * 
482         * @param systemId
483         *            Absolute URI of the document; should never be null, but may be
484         *            so iff a reader <em>or</em> a stream is provided.
485         * @param publicId
486         *            The public identifier of the document, or null.
487         * @param reader
488         *            A character stream; must be null if stream isn't.
489         * @param stream
490         *            A byte input stream; must be null if reader isn't.
491         * @param characterEncoding
492         *            The suggested encoding, or null if unknown.
493         * @exception java.lang.Exception
494         *                Basically SAXException or IOException
495         */
496        // package private
497        void doParse(String systemId, String publicId, Reader reader,
498                InputStream stream, String encoding) throws Exception {
499            if (handler == null) {
500                throw new IllegalStateException("no callback handler");
501            }
502    
503            alreadyWarnedAboutPrivateUseCharacters = false;
504            initializeVariables();
505    
506            // predeclare the built-in entities here (replacement texts)
507            // we don't need to intern(), since we're guaranteed literals
508            // are always (globally) interned.
509            setInternalEntity("amp", "&#38;");
510            setInternalEntity("lt", "&#60;");
511            setInternalEntity("gt", "&#62;");
512            setInternalEntity("apos", "&#39;");
513            setInternalEntity("quot", "&#34;");
514    
515            try {
516                // pushURL first to ensure locator is correct in startDocument
517                // ... it might report an IO or encoding exception.
518                handler.startDocument();
519                pushURL(false, "[document]",
520                        // default baseURI: null
521                        new ExternalIdentifiers(publicId, systemId, null), reader,
522                        stream, encoding, false);
523    
524                parseDocument();
525            } catch (EOFException e) {
526                // empty input
527                fatal("empty document, with no root element.");
528            } finally {
529                if (reader != null) {
530                    try {
531                        reader.close();
532                    } catch (IOException e) {
533                        /* ignore */
534                    }
535                }
536                if (stream != null) {
537                    try {
538                        stream.close();
539                    } catch (IOException e) {
540                        /* ignore */
541                    }
542                }
543                if (is != null) {
544                    try {
545                        is.close();
546                    } catch (IOException e) {
547                        /* ignore */
548                    }
549                }
550            }
551        }
552    
553        // ////////////////////////////////////////////////////////////////////
554        // Error reporting.
555        // ////////////////////////////////////////////////////////////////////
556    
557        /**
558         * Report an error.
559         * 
560         * @param message
561         *            The error message.
562         * @param textFound
563         *            The text that caused the error (or null).
564         * @see SAXDriver#error
565         * @see #line
566         */
567        private void fatal(String message, String textFound, String textExpected)
568                throws SAXException {
569            // smart quotes -- 2005-08-20 hsivonen
570            if (textFound != null) {
571                message = message + " (found \u201C" + textFound + "\u201D)";
572            }
573            if (textExpected != null) {
574                message = message + " (expected \u201C" + textExpected + "\u201D)";
575            }
576            handler.fatal(message);
577    
578            // "can't happen"
579            throw new SAXException(message);
580        }
581    
582        /**
583         * Report a serious error.
584         * 
585         * @param message
586         *            The error message.
587         * @param textFound
588         *            The text that caused the error (or null).
589         */
590        private void fatal(String message, char textFound, String textExpected)
591                throws SAXException {
592            fatal(message, new Character(textFound).toString(), textExpected);
593        }
594    
595        /**
596         * Report typical case fatal errors.
597         */
598        private void fatal(String message) throws SAXException {
599            handler.fatal(message);
600        }
601    
602        /**
603         * Report non-fatal errors.
604         */
605        private void err(String message) throws SAXException {
606            handler.verror(message);
607        }
608    
609        // ////////////////////////////////////////////////////////////////////
610        // Major syntactic productions.
611        // ////////////////////////////////////////////////////////////////////
612    
613        /**
614         * Parse an XML document.
615         * 
616         * <pre>
617         *  [1] document ::= prolog element Misc*
618         * </pre>
619         * 
620         * <p>
621         * This is the top-level parsing function for a single XML document. As a
622         * minimum, a well-formed document must have a document element, and a valid
623         * document must have a prolog (one with doctype) as well.
624         */
625        private void parseDocument() throws Exception {
626            try { // added by MHK
627                boolean sawDTD = parseProlog();
628                require('<');
629                parseElement(!sawDTD);
630            } catch (EOFException ee) { // added by MHK
631                fatal("premature end of file", "[EOF]", null);
632            }
633    
634            try {
635                parseMisc(); // skip all white, PIs, and comments
636                char c = readCh(); // if this doesn't throw an exception...
637                fatal("unexpected characters after document end", c, null);
638            } catch (EOFException e) {
639                if (characterHandler != null) {
640                    characterHandler.end();
641                }
642                if (normalizationChecker != null) {
643                    normalizationChecker.end();
644                }
645                return;
646            }
647        }
648    
649        static final char[] startDelimComment = { '<', '!', '-', '-' };
650    
651        static final char[] endDelimComment = { '-', '-' };
652    
653        /**
654         * Skip a comment.
655         * 
656         * <pre>
657         *  [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* &quot;--&gt;&quot;
658         * </pre>
659         * 
660         * <p>
661         * (The <code>&lt;!--</code> has already been read.)
662         */
663        private void parseComment() throws Exception {
664            boolean saved = expandPE;
665    
666            expandPE = false;
667            parseUntil(endDelimComment);
668            require('>');
669            expandPE = saved;
670            handler.comment(dataBuffer, 0, dataBufferPos);
671            dataBufferPos = 0;
672        }
673    
674        static final char[] startDelimPI = { '<', '?' };
675    
676        static final char[] endDelimPI = { '?', '>' };
677    
678        /**
679         * Parse a processing instruction and do a call-back.
680         * 
681         * <pre>
682         *  [16] PI ::= '&lt;?' PITarget
683         *     (S (Char* - (Char* '?&gt;' Char*)))?
684         *     '?&gt;'
685         *  [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
686         * </pre>
687         * 
688         * <p>
689         * (The <code>&lt;?</code> has already been read.)
690         */
691        private void parsePI() throws SAXException, IOException {
692            String name;
693            boolean saved = expandPE;
694    
695            expandPE = false;
696            name = readNmtoken(true);
697            // NE08
698            if (name.indexOf(':') >= 0) {
699                fatal("Illegal character(':') in processing instruction name ",
700                        name, null);
701            }
702            if ("xml".equalsIgnoreCase(name)) {
703                fatal("Illegal processing instruction target", name, null);
704            }
705            if (!tryRead(endDelimPI)) {
706                requireWhitespace();
707                parseUntil(endDelimPI);
708            }
709            expandPE = saved;
710            handler.processingInstruction(name, dataBufferToString());
711        }
712    
713        static final char[] endDelimCDATA = { ']', ']', '>' };
714    
715        private boolean isDirtyCurrentElement;
716    
717        private boolean alreadyWarnedAboutPrivateUseCharacters;
718    
719        private char prev;
720    
721        /**
722         * Parse a CDATA section.
723         * 
724         * <pre>
725         *  [18] CDSect ::= CDStart CData CDEnd
726         *  [19] CDStart ::= '&lt;![CDATA['
727         *  [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
728         *  [21] CDEnd ::= ']]&gt;'
729         * </pre>
730         * 
731         * <p>
732         * (The '&lt;![CDATA[' has already been read.)
733         */
734        private void parseCDSect() throws Exception {
735            parseUntil(endDelimCDATA);
736            dataBufferFlush();
737        }
738    
739        /**
740         * Parse the prolog of an XML document.
741         * 
742         * <pre>
743         *  [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
744         * </pre>
745         * 
746         * <p>
747         * We do not look for the XML declaration here, because it was handled by
748         * pushURL ().
749         * 
750         * @see pushURL
751         * @return true if a DTD was read.
752         */
753        private boolean parseProlog() throws Exception {
754            parseMisc();
755    
756            if (tryRead("<!DOCTYPE")) {
757                parseDoctypedecl();
758                parseMisc();
759                return true;
760            }
761            return false;
762        }
763    
764        private void checkLegalVersion(String version) throws SAXException {
765            int len = version.length();
766            for (int i = 0; i < len; i++) {
767                char c = version.charAt(i);
768                if ('0' <= c && c <= '9') {
769                    continue;
770                }
771                if (c == '_' || c == '.' || c == ':' || c == '-') {
772                    continue;
773                }
774                if ('a' <= c && c <= 'z') {
775                    continue;
776                }
777                if ('A' <= c && c <= 'Z') {
778                    continue;
779                }
780                fatal("illegal character in version", version, "1.0");
781            }
782        }
783    
784        /**
785         * Parse the XML declaration.
786         * 
787         * <pre>
788         *  [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
789         *  [24] VersionInfo ::= S 'version' Eq
790         *     (&quot;'&quot; VersionNum &quot;'&quot; | '&quot;' VersionNum '&quot;' )
791         *  [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
792         *  [32] SDDecl ::= S 'standalone' Eq
793         *     ( &quot;'&quot;&quot; ('yes' | 'no') &quot;'&quot;&quot; | '&quot;' (&quot;yes&quot; | &quot;no&quot;) '&quot;' )
794         *  [80] EncodingDecl ::= S 'encoding' Eq
795         *     ( &quot;'&quot; EncName &quot;'&quot; | &quot;'&quot; EncName &quot;'&quot; )
796         *  [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
797         * </pre>
798         * 
799         * <p>
800         * (The <code>&lt;?xml</code> and whitespace have already been read.)
801         * 
802         * @return the encoding in the declaration, uppercased; or null
803         * @see #parseTextDecl
804         * @see #setupDecoding
805         */
806        private String parseXMLDecl(String encoding) throws SAXException,
807                IOException {
808            String version;
809            String encodingName = null;
810            String standalone = null;
811            int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
812    
813            // Read the version.
814            require("version");
815            parseEq();
816            checkLegalVersion(version = readLiteral(flags));
817            if (!version.equals("1.0")) {
818                if (version.equals("1.1")) {
819                    fatal("XML 1.1 not supported."); // 2006-04-24 hsivonen
820                } else {
821                    fatal("illegal XML version", version, "1.0"); // removed 1.1
822                                                                    // -- 2006-04-24
823                                                                    // hsivonen
824                }
825            } else {
826                xmlVersion = XML_10;
827            }
828            // Try reading an encoding declaration.
829            boolean white = tryWhitespace();
830    
831            if (tryRead("encoding")) {
832                if (!white) {
833                    fatal("whitespace required before 'encoding='");
834                }
835                parseEq();
836                encodingName = readLiteral(flags);
837                checkEncodingLiteral(encodingName); // 2006-04-28 hsivonen
838                if (reader == null) {
839                    draconianInputStreamReader(encodingName, is, true);
840                } else {
841                    checkEncodingMatch(encoding, encodingName);
842                }
843            }
844    
845            // Try reading a standalone declaration
846            if (encodingName != null) {
847                white = tryWhitespace();
848            } else {
849                if (encoding == null) {
850                    draconianInputStreamReader("UTF-8", is, false); // 2006-04-24
851                                                                    // hsivonen
852                }
853                warnAboutLackOfEncodingDecl(encoding);
854            }
855            if (tryRead("standalone")) {
856                if (!white) {
857                    fatal("whitespace required before 'standalone='");
858                }
859                parseEq();
860                standalone = readLiteral(flags);
861                if ("yes".equals(standalone)) {
862                    docIsStandalone = true;
863                } else if (!"no".equals(standalone)) {
864                    fatal("standalone flag must be 'yes' or 'no'");
865                }
866            }
867    
868            skipWhitespace();
869            require("?>");
870    
871            return encodingName;
872        }
873    
874        // hsivonen 2006-04-28
875        private void checkEncodingLiteral(String encodingName) throws SAXException {
876            if (encodingName == null) {
877                return;
878            }
879            if (encodingName.length() == 0) {
880                fatal("The empty string does not a legal encoding name.");
881            }
882            char c = encodingName.charAt(0);
883            if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
884                fatal("The encoding name must start with an ASCII letter.");
885            }
886            for (int i = 1; i < encodingName.length(); i++) {
887                c = encodingName.charAt(i);
888                if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
889                        || (c >= '0' && c <= '9') || (c == '.') || (c == '_') || (c == '-'))) {
890                    fatal("Illegal character in encoding name: U+"
891                            + Integer.toHexString(c) + ".");
892                }
893            }
894        }
895    
896        /**
897         * Parse a text declaration.
898         * 
899         * <pre>
900         *  [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
901         *  [80] EncodingDecl ::= S 'encoding' Eq
902         *     ( '&quot;' EncName '&quot;' | &quot;'&quot; EncName &quot;'&quot; )
903         *  [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
904         * </pre>
905         * 
906         * <p>
907         * (The <code>&lt;?xml</code>' and whitespace have already been read.)
908         * 
909         * @return the encoding in the declaration, uppercased; or null
910         * @see #parseXMLDecl
911         * @see #setupDecoding
912         */
913        private String parseTextDecl(String encoding) throws SAXException,
914                IOException {
915            String encodingName = null;
916            int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
917    
918            // Read an optional version.
919            if (tryRead("version")) {
920                String version;
921                parseEq();
922                checkLegalVersion(version = readLiteral(flags));
923                if (!version.equals("1.0")) {
924                    if (version.equals("1.1")) {
925                        fatal("XML 1.1 not supported."); // 2006-04-24 hsivonen
926                    } else {
927                        fatal("illegal XML version", version, "1.0"); // removed
928                                                                        // 1.1 --
929                                                                        // 2006-04-24
930                                                                        // hsivonen
931                    }
932                }
933                requireWhitespace();
934            }
935    
936            // Read the encoding.
937            require("encoding");
938            parseEq();
939            encodingName = readLiteral(flags);
940            checkEncodingLiteral(encodingName); // 2006-04-28 hsivonen
941            if (reader == null) {
942                draconianInputStreamReader(encodingName, is, true);
943            } else {
944                checkEncodingMatch(encoding, encodingName);
945            }
946            skipWhitespace();
947            require("?>");
948    
949            return encodingName;
950        }
951    
952        private void checkEncodingMatch(String used, String detected)
953                throws SAXException {
954            // method added -- 2006-02-03 hsivonen
955            if (used == null) {
956                if (!characterEncoding.equalsIgnoreCase(detected)) {
957                    fatal(
958                            "Declared character encoding was not the one sniffed from the BOM.",
959                            detected, characterEncoding);
960                }
961            } else {
962                if (!"".equals(used) && !used.equalsIgnoreCase(detected)) {
963                    handler.warn("External encoding information specified "
964                            + used
965                            + ", but XML declaration specified "
966                            + detected
967                            + ". Allowing external to override per RFC 3023. The well-formedness status of this document may change when decoupled from the external character encoding information.");
968                }
969            }
970        }
971    
972        private void draconianInputStreamReader(String encoding,
973                InputStream stream, boolean requireAsciiSuperset)
974                throws SAXException, IOException {
975            draconianInputStreamReader(encoding, stream, requireAsciiSuperset,
976                    encoding);
977        }
978    
979        private void draconianInputStreamReader(String encoding,
980                InputStream stream, boolean requireAsciiSuperset, String actualName)
981                throws SAXException, IOException {
982            // method added -- 2005-08-21 hsivonen
983            sourceType = INPUT_READER;
984            characterEncoding = actualName.toUpperCase();
985            encoding = encoding.toUpperCase();
986            try {
987                Charset cs = Charset.forName(encoding);
988                String canonName = cs.name();
989                if (requireAsciiSuperset) {
990                    if (!EncodingInfo.isAsciiSuperset(canonName)) {
991                        fatal("The encoding \u201C"
992                                + encoding
993                                + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration.");
994                    }
995                }
996                if (canonName.startsWith("X-") || canonName.startsWith("x-")
997                        || canonName.startsWith("Mac")) {
998                    if (encoding.startsWith("X-")) {
999                        err(encoding
1000                                + " is not an IANA-registered encoding. (Charmod C022)");
1001                    } else {
1002                        err(encoding
1003                                + "is not an IANA-registered encoding and did\u2019t start with \u201CX-\u201D. (Charmod C023)");
1004                    }
1005                } else if (!canonName.equalsIgnoreCase(encoding)) {
1006                    err(encoding
1007                            + " is not the preferred name of the character encoding in use. The preferred name is "
1008                            + canonName + ". (Charmod C024)");
1009                }
1010                if (!("UTF-8".equals(encoding) || "UTF-16".equals(encoding)
1011                        || "UTF-16BE".equals(encoding)
1012                        || "UTF-16LE".equals(encoding)
1013                        || "ISO-8859-1".equals(encoding) || "US-ASCII".equals(encoding))) {
1014                    handler.warn("XML processors are required to support the UTF-8 and UTF-16 character encodings. The encoding was "
1015                            + actualName
1016                            + " instead, which is an incompatibility risk.");
1017                }
1018                CharsetDecoder decoder = cs.newDecoder();
1019                decoder.onMalformedInput(CodingErrorAction.REPORT);
1020                decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
1021                this.reader = new InputStreamReader(stream, decoder);
1022            } catch (IllegalCharsetNameException e) {
1023                fatal("Illegal character encoding name: " + encoding);
1024            } catch (UnsupportedCharsetException e) {
1025                handler.fatal("Unsupported character encoding: " + encoding);
1026            }
1027        }
1028    
1029        /**
1030         * Parse miscellaneous markup outside the document element and DOCTYPE
1031         * declaration.
1032         * 
1033         * <pre>
1034         *  [27] Misc ::= Comment | PI | S
1035         * </pre>
1036         */
1037        private void parseMisc() throws Exception {
1038            while (true) {
1039                skipWhitespace();
1040                if (tryRead(startDelimPI)) {
1041                    parsePI();
1042                } else if (tryRead(startDelimComment)) {
1043                    parseComment();
1044                } else {
1045                    return;
1046                }
1047            }
1048        }
1049    
1050        /**
1051         * Parse a document type declaration.
1052         * 
1053         * <pre>
1054         *  [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
1055         *     ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
1056         * </pre>
1057         * 
1058         * <p>
1059         * (The <code>&lt;!DOCTYPE</code> has already been read.)
1060         */
1061        private void parseDoctypedecl() throws Exception {
1062            String rootName;
1063            ExternalIdentifiers ids;
1064    
1065            // Read the document type name.
1066            requireWhitespace();
1067            rootName = readNmtoken(true);
1068    
1069            // Read the External subset's IDs
1070            skipWhitespace();
1071            ids = readExternalIds(false, true);
1072    
1073            // report (a) declaration of name, (b) lexical info (ids)
1074            handler.doctypeDecl(rootName, ids.publicId, ids.systemId);
1075    
1076            // Internal subset is parsed first, if present
1077            skipWhitespace();
1078            if (tryRead('[')) {
1079    
1080                // loop until the subset ends
1081                while (true) {
1082                    doReport = expandPE = true;
1083                    skipWhitespace();
1084                    doReport = expandPE = false;
1085                    if (tryRead(']')) {
1086                        break; // end of subset
1087                    } else {
1088                        // WFC, PEs in internal subset (only between decls)
1089                        peIsError = expandPE = true;
1090                        parseMarkupdecl();
1091                        peIsError = expandPE = false;
1092                    }
1093                }
1094            }
1095            skipWhitespace();
1096            require('>');
1097    
1098            // Read the external subset, if any
1099            InputSource subset;
1100    
1101            if (ids.systemId == null) {
1102                subset = handler.getExternalSubset(rootName, handler.getSystemId());
1103            } else {
1104                subset = null;
1105            }
1106            if (ids.systemId != null || subset != null) {
1107                pushString(null, ">");
1108    
1109                // NOTE: [dtd] is so we say what SAX2 expects,
1110                // though it's misleading (subset, not entire dtd)
1111                if (ids.systemId != null) {
1112                    pushURL(true, "[dtd]", ids, null, null, null, true);
1113                } else {
1114                    handler.warn("modifying document by adding external subset");
1115                    pushURL(true, "[dtd]", new ExternalIdentifiers(
1116                            subset.getPublicId(), subset.getSystemId(), null),
1117                            subset.getCharacterStream(), subset.getByteStream(),
1118                            subset.getEncoding(), false);
1119                }
1120    
1121                // Loop until we end up back at '>'
1122                while (true) {
1123                    doReport = expandPE = true;
1124                    skipWhitespace();
1125                    doReport = expandPE = false;
1126                    if (tryRead('>')) {
1127                        break;
1128                    } else {
1129                        expandPE = true;
1130                        parseMarkupdecl();
1131                        expandPE = false;
1132                    }
1133                }
1134    
1135                // the ">" string isn't popped yet
1136                if (inputStack.size() != 1) {
1137                    fatal("external subset has unmatched '>'");
1138                }
1139            }
1140    
1141            // done dtd
1142            handler.endDoctype();
1143            expandPE = false;
1144            doReport = true;
1145        }
1146    
1147        /**
1148         * Parse a markup declaration in the internal or external DTD subset.
1149         * 
1150         * <pre>
1151         *  [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
1152         *     | NotationDecl | PI | Comment
1153         *  [30] extSubsetDecl ::= (markupdecl | conditionalSect
1154         *     | PEReference | S) *
1155         * </pre>
1156         * 
1157         * <p>
1158         * Reading toplevel PE references is handled as a lexical issue by the
1159         * caller, as is whitespace.
1160         */
1161        private void parseMarkupdecl() throws Exception {
1162            char[] saved = null;
1163            boolean savedPE = expandPE;
1164    
1165            // prevent "<%foo;" and ensures saved entity is right
1166            require('<');
1167            unread('<');
1168            expandPE = false;
1169    
1170            if (tryRead("<!ELEMENT")) {
1171                saved = readBuffer;
1172                expandPE = savedPE;
1173                parseElementDecl();
1174            } else if (tryRead("<!ATTLIST")) {
1175                saved = readBuffer;
1176                expandPE = savedPE;
1177                parseAttlistDecl();
1178            } else if (tryRead("<!ENTITY")) {
1179                saved = readBuffer;
1180                expandPE = savedPE;
1181                parseEntityDecl();
1182            } else if (tryRead("<!NOTATION")) {
1183                saved = readBuffer;
1184                expandPE = savedPE;
1185                parseNotationDecl();
1186            } else if (tryRead(startDelimPI)) {
1187                saved = readBuffer;
1188                expandPE = savedPE;
1189                parsePI();
1190            } else if (tryRead(startDelimComment)) {
1191                saved = readBuffer;
1192                expandPE = savedPE;
1193                parseComment();
1194            } else if (tryRead("<![")) {
1195                saved = readBuffer;
1196                expandPE = savedPE;
1197                if (inputStack.size() > 0) {
1198                    parseConditionalSect(saved);
1199                } else {
1200                    fatal("conditional sections illegal in internal subset");
1201                }
1202            } else {
1203                fatal("expected markup declaration");
1204            }
1205    
1206            // VC: Proper Decl/PE Nesting
1207            if (readBuffer != saved) {
1208                handler.verror("Illegal Declaration/PE nesting");
1209            }
1210        }
1211    
1212        /**
1213         * Parse an element, with its tags.
1214         * 
1215         * <pre>
1216         *  [39] element ::= EmptyElementTag | STag content ETag
1217         *  [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
1218         *  [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
1219         * </pre>
1220         * 
1221         * <p>
1222         * (The '&lt;' has already been read.)
1223         * <p>
1224         * NOTE: this method actually chains onto parseContent (), if necessary, and
1225         * parseContent () will take care of calling parseETag ().
1226         */
1227        private void parseElement(boolean maybeGetSubset) throws Exception {
1228            String gi;
1229            char c;
1230            int oldElementContent = currentElementContent;
1231            String oldElement = currentElement;
1232            ElementDecl element;
1233    
1234            // This is the (global) counter for the
1235            // array of specified attributes.
1236            tagAttributePos = 0;
1237    
1238            // Read the element type name.
1239            gi = readNmtoken(true);
1240    
1241            // If we saw no DTD, and this is the document root element,
1242            // let the application modify the input stream by providing one.
1243            if (maybeGetSubset) {
1244                InputSource subset = handler.getExternalSubset(gi,
1245                        handler.getSystemId());
1246                if (subset != null) {
1247                    String publicId = subset.getPublicId();
1248                    String systemId = subset.getSystemId();
1249    
1250                    handler.warn("modifying document by adding DTD");
1251                    handler.doctypeDecl(gi, publicId, systemId);
1252                    pushString(null, ">");
1253    
1254                    // NOTE: [dtd] is so we say what SAX2 expects,
1255                    // though it's misleading (subset, not entire dtd)
1256                    pushURL(true, "[dtd]", new ExternalIdentifiers(publicId,
1257                            systemId, null), subset.getCharacterStream(),
1258                            subset.getByteStream(), subset.getEncoding(), false);
1259    
1260                    // Loop until we end up back at '>'
1261                    while (true) {
1262                        doReport = expandPE = true;
1263                        skipWhitespace();
1264                        doReport = expandPE = false;
1265                        if (tryRead('>')) {
1266                            break;
1267                        } else {
1268                            expandPE = true;
1269                            parseMarkupdecl();
1270                            expandPE = false;
1271                        }
1272                    }
1273    
1274                    // the ">" string isn't popped yet
1275                    if (inputStack.size() != 1) {
1276                        fatal("external subset has unmatched '>'");
1277                    }
1278    
1279                    handler.endDoctype();
1280                }
1281            }
1282    
1283            // Determine the current content type.
1284            currentElement = gi;
1285            element = elementInfo.get(gi);
1286            currentElementContent = getContentType(element, CONTENT_ANY);
1287    
1288            // Read the attributes, if any.
1289            // After this loop, "c" is the closing delimiter.
1290            boolean white = tryWhitespace();
1291            c = readCh();
1292            while (c != '/' && c != '>') {
1293                unread(c);
1294                if (!white) {
1295                    fatal("need whitespace between attributes");
1296                }
1297                parseAttribute(gi);
1298                white = tryWhitespace();
1299                c = readCh();
1300            }
1301    
1302            // Supply any defaulted attributes.
1303            Iterator<String> atts = declaredAttributes(element);
1304            if (atts != null) {
1305                String aname;
1306                loop: while (atts.hasNext()) {
1307                    aname = atts.next();
1308                    // See if it was specified.
1309                    for (int i = 0; i < tagAttributePos; i++) {
1310                        if (tagAttributes[i] == aname) {
1311                            continue loop;
1312                        }
1313                    }
1314                    // ... or has a default
1315                    String value = getAttributeDefaultValue(gi, aname);
1316    
1317                    if (value == null) {
1318                        continue;
1319                    }
1320                    handler.attribute(aname, value, false);
1321                }
1322            }
1323    
1324            // Figure out if this is a start tag
1325            // or an empty element, and dispatch an
1326            // event accordingly.
1327            switch (c) {
1328                case '>':
1329                    handler.startElement(gi);
1330                    parseContent();
1331                    break;
1332                case '/':
1333                    require('>');
1334                    handler.startElement(gi);
1335                    handler.endElement(gi);
1336                    break;
1337            }
1338    
1339            // Restore the previous state.
1340            currentElement = oldElement;
1341            currentElementContent = oldElementContent;
1342        }
1343    
1344        /**
1345         * Parse an attribute assignment.
1346         * 
1347         * <pre>
1348         *  [41] Attribute ::= Name Eq AttValue
1349         * </pre>
1350         * 
1351         * @param name
1352         *            The name of the attribute's element.
1353         * @see SAXDriver#attribute
1354         */
1355        private void parseAttribute(String name) throws Exception {
1356            String aname;
1357            String type;
1358            String value;
1359            int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF;
1360    
1361            // Read the attribute name.
1362            aname = readNmtoken(true);
1363            type = getAttributeType(name, aname);
1364    
1365            // Parse '='
1366            parseEq();
1367    
1368            // Read the value, normalizing whitespace
1369            // unless it is CDATA.
1370            if (handler.stringInterning) {
1371                if (type == "CDATA" || type == null) {
1372                    value = readLiteral(flags);
1373                } else {
1374                    value = readLiteral(flags | LIT_NORMALIZE);
1375                }
1376            } else {
1377                if (type.equals("CDATA") || type == null) {
1378                    value = readLiteral(flags);
1379                } else {
1380                    value = readLiteral(flags | LIT_NORMALIZE);
1381                }
1382            }
1383    
1384            // WFC: no duplicate attributes
1385            for (int i = 0; i < tagAttributePos; i++) {
1386                if (aname.equals(tagAttributes[i])) {
1387                    fatal("duplicate attribute", aname, null);
1388                }
1389            }
1390    
1391            // Inform the handler about the
1392            // attribute.
1393            handler.attribute(aname, value, true);
1394            dataBufferPos = 0;
1395    
1396            // Note that the attribute has been
1397            // specified.
1398            if (tagAttributePos == tagAttributes.length) {
1399                String newAttrib[] = new String[tagAttributes.length * 2];
1400                System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
1401                tagAttributes = newAttrib;
1402            }
1403            tagAttributes[tagAttributePos++] = aname;
1404        }
1405    
1406        /**
1407         * Parse an equals sign surrounded by optional whitespace.
1408         * 
1409         * <pre>
1410         *  [25] Eq ::= S? '=' S?
1411         * </pre>
1412         */
1413        private void parseEq() throws SAXException, IOException {
1414            skipWhitespace();
1415            require('=');
1416            skipWhitespace();
1417        }
1418    
1419        /**
1420         * Parse an end tag.
1421         * 
1422         * <pre>
1423         *  [42] ETag ::= '&lt;/' Name S? '&gt;'
1424         * </pre>
1425         * 
1426         * <p>
1427         * NOTE: parseContent () chains to here, we already read the "&lt;/".
1428         */
1429        private void parseETag() throws Exception {
1430            require(currentElement);
1431            skipWhitespace();
1432            require('>');
1433            handler.endElement(currentElement);
1434            // not re-reporting any SAXException re bogus end tags,
1435            // even though that diagnostic might be clearer ...
1436        }
1437    
1438        /**
1439         * Parse the content of an element.
1440         * 
1441         * <pre>
1442         *  [43] content ::= (element | CharData | Reference
1443         *     | CDSect | PI | Comment)*
1444         *  [67] Reference ::= EntityRef | CharRef
1445         * </pre>
1446         * 
1447         * <p>
1448         * NOTE: consumes ETtag.
1449         */
1450        private void parseContent() throws Exception {
1451            char c;
1452    
1453            while (true) {
1454                // consume characters (or ignorable whitspace) until delimiter
1455                parseCharData();
1456    
1457                // Handle delimiters
1458                c = readCh();
1459                switch (c) {
1460                    case '&': // Found "&"
1461                        c = readCh();
1462                        if (c == '#') {
1463                            parseCharRef();
1464                        } else {
1465                            unread(c);
1466                            parseEntityRef(true);
1467                        }
1468                        isDirtyCurrentElement = true;
1469                        break;
1470    
1471                    case '<': // Found "<"
1472                        dataBufferFlush();
1473                        c = readCh();
1474                        switch (c) {
1475                            case '!': // Found "<!"
1476                                c = readCh();
1477                                switch (c) {
1478                                    case '-': // Found "<!-"
1479                                        require('-');
1480                                        isDirtyCurrentElement = false;
1481                                        parseComment();
1482                                        break;
1483                                    case '[': // Found "<!["
1484                                        isDirtyCurrentElement = false;
1485                                        require("CDATA[");
1486                                        handler.startCDATA();
1487                                        inCDATA = true;
1488                                        parseCDSect();
1489                                        inCDATA = false;
1490                                        handler.endCDATA();
1491                                        break;
1492                                    default:
1493                                        fatal("expected comment or CDATA section",
1494                                                c, null);
1495                                        break;
1496                                }
1497                                break;
1498    
1499                            case '?': // Found "<?"
1500                                isDirtyCurrentElement = false;
1501                                parsePI();
1502                                break;
1503    
1504                            case '/': // Found "</"
1505                                isDirtyCurrentElement = false;
1506                                parseETag();
1507                                return;
1508    
1509                            default: // Found "<" followed by something else
1510                                isDirtyCurrentElement = false;
1511                                unread(c);
1512                                parseElement(false);
1513                                break;
1514                        }
1515                }
1516            }
1517        }
1518    
1519        /**
1520         * Parse an element type declaration.
1521         * 
1522         * <pre>
1523         *  [45] elementdecl ::= '&lt;!ELEMENT' S Name S contentspec S? '&gt;'
1524         * </pre>
1525         * 
1526         * <p>
1527         * NOTE: the '&lt;!ELEMENT' has already been read.
1528         */
1529        private void parseElementDecl() throws Exception {
1530            String name;
1531    
1532            requireWhitespace();
1533            // Read the element type name.
1534            name = readNmtoken(true);
1535    
1536            requireWhitespace();
1537            // Read the content model.
1538            parseContentspec(name);
1539    
1540            skipWhitespace();
1541            require('>');
1542        }
1543    
1544        /**
1545         * Content specification.
1546         * 
1547         * <pre>
1548         *  [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1549         * </pre>
1550         */
1551        private void parseContentspec(String name) throws Exception {
1552            // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ...
1553            if (tryRead("EMPTY")) {
1554                setElement(name, CONTENT_EMPTY, null, null);
1555                if (!skippedPE) {
1556                    handler.getDeclHandler().elementDecl(name, "EMPTY");
1557                }
1558                return;
1559            } else if (tryRead("ANY")) {
1560                setElement(name, CONTENT_ANY, null, null);
1561                if (!skippedPE) {
1562                    handler.getDeclHandler().elementDecl(name, "ANY");
1563                }
1564                return;
1565            } else {
1566                String model;
1567                char[] saved;
1568    
1569                require('(');
1570                saved = readBuffer;
1571                dataBufferAppend('(');
1572                skipWhitespace();
1573                if (tryRead("#PCDATA")) {
1574                    dataBufferAppend("#PCDATA");
1575                    parseMixed(saved);
1576                    model = dataBufferToString();
1577                    setElement(name, CONTENT_MIXED, model, null);
1578                } else {
1579                    parseElements(saved);
1580                    model = dataBufferToString();
1581                    setElement(name, CONTENT_ELEMENTS, model, null);
1582                }
1583                if (!skippedPE) {
1584                    handler.getDeclHandler().elementDecl(name, model);
1585                }
1586            }
1587        }
1588    
1589        /**
1590         * Parse an element-content model.
1591         * 
1592         * <pre>
1593         *  [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1594         *  [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1595         *  [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1596         * </pre>
1597         * 
1598         * <p>
1599         * NOTE: the opening '(' and S have already been read.
1600         * 
1601         * @param saved
1602         *            Buffer for entity that should have the terminal ')'
1603         */
1604        private void parseElements(char[] saved) throws Exception {
1605            char c;
1606            char sep;
1607    
1608            // Parse the first content particle
1609            skipWhitespace();
1610            parseCp();
1611    
1612            // Check for end or for a separator.
1613            skipWhitespace();
1614            c = readCh();
1615            switch (c) {
1616                case ')':
1617                    // VC: Proper Group/PE Nesting
1618                    if (readBuffer != saved) {
1619                        handler.verror("Illegal Group/PE nesting");
1620                    }
1621    
1622                    dataBufferAppend(')');
1623                    c = readCh();
1624                    switch (c) {
1625                        case '*':
1626                        case '+':
1627                        case '?':
1628                            dataBufferAppend(c);
1629                            break;
1630                        default:
1631                            unread(c);
1632                    }
1633                    return;
1634                case ',': // Register the separator.
1635                case '|':
1636                    sep = c;
1637                    dataBufferAppend(c);
1638                    break;
1639                default:
1640                    fatal("bad separator in content model", c, null);
1641                    return;
1642            }
1643    
1644            // Parse the rest of the content model.
1645            while (true) {
1646                skipWhitespace();
1647                parseCp();
1648                skipWhitespace();
1649                c = readCh();
1650                if (c == ')') {
1651                    // VC: Proper Group/PE Nesting
1652                    if (readBuffer != saved) {
1653                        handler.verror("Illegal Group/PE nesting");
1654                    }
1655    
1656                    dataBufferAppend(')');
1657                    break;
1658                } else if (c != sep) {
1659                    fatal("bad separator in content model", c, null);
1660                    return;
1661                } else {
1662                    dataBufferAppend(c);
1663                }
1664            }
1665    
1666            // Check for the occurrence indicator.
1667            c = readCh();
1668            switch (c) {
1669                case '?':
1670                case '*':
1671                case '+':
1672                    dataBufferAppend(c);
1673                    return;
1674                default:
1675                    unread(c);
1676                    return;
1677            }
1678        }
1679    
1680        /**
1681         * Parse a content particle.
1682         * 
1683         * <pre>
1684         *  [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1685         * </pre>
1686         */
1687        private void parseCp() throws Exception {
1688            if (tryRead('(')) {
1689                dataBufferAppend('(');
1690                parseElements(readBuffer);
1691            } else {
1692                dataBufferAppend(readNmtoken(true));
1693                char c = readCh();
1694                switch (c) {
1695                    case '?':
1696                    case '*':
1697                    case '+':
1698                        dataBufferAppend(c);
1699                        break;
1700                    default:
1701                        unread(c);
1702                        break;
1703                }
1704            }
1705        }
1706    
1707        /**
1708         * Parse mixed content.
1709         * 
1710         * <pre>
1711         *  [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1712         *         | '(' S? ('#PCDATA') S? ')'
1713         * </pre>
1714         * 
1715         * @param saved
1716         *            Buffer for entity that should have the terminal ')'
1717         */
1718        private void parseMixed(char[] saved) throws Exception {
1719            // Check for PCDATA alone.
1720            skipWhitespace();
1721            if (tryRead(')')) {
1722                // VC: Proper Group/PE Nesting
1723                if (readBuffer != saved) {
1724                    handler.verror("Illegal Group/PE nesting");
1725                }
1726    
1727                dataBufferAppend(")*");
1728                tryRead('*');
1729                return;
1730            }
1731    
1732            // Parse mixed content.
1733            skipWhitespace();
1734            while (!tryRead(")")) {
1735                require('|');
1736                dataBufferAppend('|');
1737                skipWhitespace();
1738                dataBufferAppend(readNmtoken(true));
1739                skipWhitespace();
1740            }
1741    
1742            // VC: Proper Group/PE Nesting
1743            if (readBuffer != saved) {
1744                handler.verror("Illegal Group/PE nesting");
1745            }
1746    
1747            require('*');
1748            dataBufferAppend(")*");
1749        }
1750    
1751        /**
1752         * Parse an attribute list declaration.
1753         * 
1754         * <pre>
1755         *  [52] AttlistDecl ::= '&lt;!ATTLIST' S Name AttDef* S? '&gt;'
1756         * </pre>
1757         * 
1758         * <p>
1759         * NOTE: the '&lt;!ATTLIST' has already been read.
1760         */
1761        private void parseAttlistDecl() throws Exception {
1762            String elementName;
1763    
1764            requireWhitespace();
1765            elementName = readNmtoken(true);
1766            boolean white = tryWhitespace();
1767            while (!tryRead('>')) {
1768                if (!white) {
1769                    fatal("whitespace required before attribute definition");
1770                }
1771                parseAttDef(elementName);
1772                white = tryWhitespace();
1773            }
1774        }
1775    
1776        /**
1777         * Parse a single attribute definition.
1778         * 
1779         * <pre>
1780         *  [53] AttDef ::= S Name S AttType S DefaultDecl
1781         * </pre>
1782         */
1783        private void parseAttDef(String elementName) throws Exception {
1784            String name;
1785            String type;
1786            String enumer = null;
1787    
1788            // Read the attribute name.
1789            name = readNmtoken(true);
1790    
1791            // Read the attribute type.
1792            requireWhitespace();
1793            type = readAttType();
1794    
1795            // Get the string of enumerated values if necessary.
1796            if (handler.stringInterning) {
1797                if ("ENUMERATION" == type || "NOTATION" == type) {
1798                    enumer = dataBufferToString();
1799                }
1800            } else {
1801                if ("ENUMERATION".equals(type) || "NOTATION".equals(type)) {
1802                    enumer = dataBufferToString();
1803                }
1804            }
1805    
1806            // Read the default value.
1807            requireWhitespace();
1808            parseDefault(elementName, name, type, enumer);
1809        }
1810    
1811        /**
1812         * Parse the attribute type.
1813         * 
1814         * <pre>
1815         *  [54] AttType ::= StringType | TokenizedType | EnumeratedType
1816         *  [55] StringType ::= 'CDATA'
1817         *  [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1818         *     | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1819         *  [57] EnumeratedType ::= NotationType | Enumeration
1820         * </pre>
1821         */
1822        private String readAttType() throws Exception {
1823            if (tryRead('(')) {
1824                parseEnumeration(false);
1825                return "ENUMERATION";
1826            } else {
1827                String typeString = readNmtoken(true);
1828                if (handler.stringInterning) {
1829                    if ("NOTATION" == typeString) {
1830                        parseNotationType();
1831                        return typeString;
1832                    } else if ("CDATA" == typeString || "ID" == typeString
1833                            || "IDREF" == typeString || "IDREFS" == typeString
1834                            || "ENTITY" == typeString || "ENTITIES" == typeString
1835                            || "NMTOKEN" == typeString || "NMTOKENS" == typeString) {
1836                        return typeString;
1837                    }
1838                } else {
1839                    if ("NOTATION".equals(typeString)) {
1840                        parseNotationType();
1841                        return typeString;
1842                    } else if ("CDATA".equals(typeString)
1843                            || "ID".equals(typeString)
1844                            || "IDREF".equals(typeString)
1845                            || "IDREFS".equals(typeString)
1846                            || "ENTITY".equals(typeString)
1847                            || "ENTITIES".equals(typeString)
1848                            || "NMTOKEN".equals(typeString)
1849                            || "NMTOKENS".equals(typeString)) {
1850                        return typeString;
1851                    }
1852                }
1853                fatal("illegal attribute type", typeString, null);
1854                return null;
1855            }
1856        }
1857    
1858        /**
1859         * Parse an enumeration.
1860         * 
1861         * <pre>
1862         *  [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1863         * </pre>
1864         * 
1865         * <p>
1866         * NOTE: the '(' has already been read.
1867         */
1868        private void parseEnumeration(boolean isNames) throws Exception {
1869            dataBufferAppend('(');
1870    
1871            // Read the first token.
1872            skipWhitespace();
1873            dataBufferAppend(readNmtoken(isNames));
1874            // Read the remaining tokens.
1875            skipWhitespace();
1876            while (!tryRead(')')) {
1877                require('|');
1878                dataBufferAppend('|');
1879                skipWhitespace();
1880                dataBufferAppend(readNmtoken(isNames));
1881                skipWhitespace();
1882            }
1883            dataBufferAppend(')');
1884        }
1885    
1886        /**
1887         * Parse a notation type for an attribute.
1888         * 
1889         * <pre>
1890         *  [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
1891         *     (S? '|' S? name)* S? ')'
1892         * </pre>
1893         * 
1894         * <p>
1895         * NOTE: the 'NOTATION' has already been read
1896         */
1897        private void parseNotationType() throws Exception {
1898            requireWhitespace();
1899            require('(');
1900    
1901            parseEnumeration(true);
1902        }
1903    
1904        /**
1905         * Parse the default value for an attribute.
1906         * 
1907         * <pre>
1908         *  [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
1909         *     | (('#FIXED' S)? AttValue)
1910         * </pre>
1911         */
1912        private void parseDefault(String elementName, String name, String type,
1913                String enumer) throws Exception {
1914            int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1915            String value = null;
1916            int flags = LIT_ATTRIBUTE;
1917            boolean saved = expandPE;
1918            String defaultType = null;
1919    
1920            // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace
1921            // chars to spaces (doesn't matter when that's done if it doesn't
1922            // interfere with char refs expanding to whitespace).
1923    
1924            if (!skippedPE) {
1925                flags |= LIT_ENTITY_REF;
1926                if (handler.stringInterning) {
1927                    if ("CDATA" != type) {
1928                        flags |= LIT_NORMALIZE;
1929                    }
1930                } else {
1931                    if (!"CDATA".equals(type)) {
1932                        flags |= LIT_NORMALIZE;
1933                    }
1934                }
1935            }
1936    
1937            expandPE = false;
1938            if (tryRead('#')) {
1939                if (tryRead("FIXED")) {
1940                    defaultType = "#FIXED";
1941                    valueType = ATTRIBUTE_DEFAULT_FIXED;
1942                    requireWhitespace();
1943                    value = readLiteral(flags);
1944                } else if (tryRead("REQUIRED")) {
1945                    defaultType = "#REQUIRED";
1946                    valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1947                } else if (tryRead("IMPLIED")) {
1948                    defaultType = "#IMPLIED";
1949                    valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1950                } else {
1951                    fatal("illegal keyword for attribute default value");
1952                }
1953            } else {
1954                value = readLiteral(flags);
1955            }
1956            expandPE = saved;
1957            setAttribute(elementName, name, type, enumer, value, valueType);
1958            if (handler.stringInterning) {
1959                if ("ENUMERATION" == type) {
1960                    type = enumer;
1961                } else if ("NOTATION" == type) {
1962                    type = "NOTATION " + enumer;
1963                }
1964            } else {
1965                if ("ENUMERATION".equals(type)) {
1966                    type = enumer;
1967                } else if ("NOTATION".equals(type)) {
1968                    type = "NOTATION " + enumer;
1969                }
1970            }
1971            if (!skippedPE) {
1972                handler.getDeclHandler().attributeDecl(elementName, name, type,
1973                        defaultType, value);
1974            }
1975        }
1976    
1977        /**
1978         * Parse a conditional section.
1979         * 
1980         * <pre>
1981         *  [61] conditionalSect ::= includeSect || ignoreSect
1982         *  [62] includeSect ::= '&lt;![' S? 'INCLUDE' S? '['
1983         *     extSubsetDecl ']]&gt;'
1984         *  [63] ignoreSect ::= '&lt;![' S? 'IGNORE' S? '['
1985         *     ignoreSectContents* ']]&gt;'
1986         *  [64] ignoreSectContents ::= Ignore
1987         *     ('&lt;![' ignoreSectContents* ']]&gt;' Ignore )*
1988         *  [65] Ignore ::= Char* - (Char* ( '&lt;![' | ']]&gt;') Char* )
1989         * </pre>
1990         * 
1991         * <p>
1992         * NOTE: the '&gt;![' has already been read.
1993         */
1994        private void parseConditionalSect(char[] saved) throws Exception {
1995            skipWhitespace();
1996            if (tryRead("INCLUDE")) {
1997                skipWhitespace();
1998                require('[');
1999                // VC: Proper Conditional Section/PE Nesting
2000                if (readBuffer != saved) {
2001                    handler.verror("Illegal Conditional Section/PE nesting");
2002                }
2003                skipWhitespace();
2004                while (!tryRead("]]>")) {
2005                    parseMarkupdecl();
2006                    skipWhitespace();
2007                }
2008            } else if (tryRead("IGNORE")) {
2009                skipWhitespace();
2010                require('[');
2011                // VC: Proper Conditional Section/PE Nesting
2012                if (readBuffer != saved) {
2013                    handler.verror("Illegal Conditional Section/PE nesting");
2014                }
2015                char c;
2016                expandPE = false;
2017                for (int nest = 1; nest > 0;) {
2018                    c = readCh();
2019                    switch (c) {
2020                        case '<':
2021                            if (tryRead("![")) {
2022                                nest++;
2023                            }
2024                        case ']':
2025                            if (tryRead("]>")) {
2026                                nest--;
2027                            }
2028                    }
2029                }
2030                expandPE = true;
2031            } else {
2032                fatal("conditional section must begin with INCLUDE or IGNORE");
2033            }
2034        }
2035    
2036        private void parseCharRef() throws SAXException, IOException {
2037            parseCharRef(true /* do flushDataBuffer by default */);
2038        }
2039    
2040        /**
2041         * Try to read a character reference without consuming data from buffer.
2042         * 
2043         * <pre>
2044         *  [66] CharRef ::= '&amp;#' [0-9]+ ';' | '&amp;#x' [0-9a-fA-F]+ ';'
2045         * </pre>
2046         * 
2047         * <p>
2048         * NOTE: the '&#' has already been read.
2049         */
2050        private void tryReadCharRef() throws SAXException, IOException {
2051            int value = 0;
2052            char c;
2053    
2054            if (tryRead('x')) {
2055                loop1: while (true) {
2056                    c = readCh();
2057                    if (c == ';') {
2058                        break loop1;
2059                    } else {
2060                        int n = Character.digit(c, 16);
2061                        if (n == -1) {
2062                            fatal("illegal character in character reference", c,
2063                                    null);
2064                            break loop1;
2065                        }
2066                        value *= 16;
2067                        value += n;
2068                    }
2069                }
2070            } else {
2071                loop2: while (true) {
2072                    c = readCh();
2073                    if (c == ';') {
2074                        break loop2;
2075                    } else {
2076                        int n = Character.digit(c, 10);
2077                        if (n == -1) {
2078                            fatal("illegal character in character reference", c,
2079                                    null);
2080                            break loop2;
2081                        }
2082                        value *= 10;
2083                        value += n;
2084                    }
2085                }
2086            }
2087    
2088            // check for character refs being legal XML
2089            if ((value < 0x0020 && !(value == '\n' || value == '\t' || value == '\r'))
2090                    || (value >= 0xD800 && value <= 0xDFFF)
2091                    || value == 0xFFFE
2092                    || value == 0xFFFF || value > 0x0010ffff) {
2093                fatal("illegal XML character reference U+"
2094                        + Integer.toHexString(value));
2095            } else if (value >= 0x007F && value <= 0x009F) // 2006-11-13 hsivonen
2096            {
2097                handler.warn("Character reference expands to a control character: U+00"
2098                        + Integer.toHexString(c) + ".");
2099            }
2100            if (isPrivateUse(value)) {
2101                warnAboutPrivateUseChar();
2102            }
2103            // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2104            // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2105            if (value > 0x0010ffff) {
2106                // too big for surrogate
2107                fatal("character reference " + value + " is too large for UTF-16",
2108                        new Integer(value).toString(), null);
2109            }
2110    
2111        }
2112    
2113        /**
2114         * Read and interpret a character reference.
2115         * 
2116         * <pre>
2117         *  [66] CharRef ::= '&amp;#' [0-9]+ ';' | '&amp;#x' [0-9a-fA-F]+ ';'
2118         * </pre>
2119         * 
2120         * <p>
2121         * NOTE: the '&#' has already been read.
2122         */
2123        private void parseCharRef(boolean doFlush) throws SAXException, IOException {
2124            int value = 0;
2125            char c;
2126    
2127            if (tryRead('x')) {
2128                loop1: while (true) {
2129                    c = readCh();
2130                    if (c == ';') {
2131                        break loop1;
2132                    } else {
2133                        int n = Character.digit(c, 16);
2134                        if (n == -1) {
2135                            fatal("illegal character in character reference", c,
2136                                    null);
2137                            break loop1;
2138                        }
2139                        value *= 16;
2140                        value += n;
2141                    }
2142                }
2143            } else {
2144                loop2: while (true) {
2145                    c = readCh();
2146                    if (c == ';') {
2147                        break loop2;
2148                    } else {
2149                        int n = Character.digit(c, 10);
2150                        if (n == -1) {
2151                            fatal("illegal character in character reference", c,
2152                                    null);
2153                            break loop2;
2154                        }
2155                        value *= 10;
2156                        value += c - '0';
2157                    }
2158                }
2159            }
2160    
2161            // check for character refs being legal XML
2162            if ((value < 0x0020 && !(value == '\n' || value == '\t' || value == '\r'))
2163                    || (value >= 0xD800 && value <= 0xDFFF)
2164                    || value == 0xFFFE
2165                    || value == 0xFFFF || value > 0x0010ffff) {
2166                fatal("illegal XML character reference U+"
2167                        + Integer.toHexString(value));
2168            } else if (value >= 0x007F && value <= 0x009F) // 2006-11-13 hsivonen
2169            {
2170                handler.warn("Character reference expands to a control character: U+00"
2171                        + Integer.toHexString(c) + ".");
2172            }
2173            if (isPrivateUse(value)) {
2174                warnAboutPrivateUseChar();
2175            }
2176    
2177            // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2178            // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2179            if (value <= 0x0000ffff) {
2180                // no surrogates needed
2181                dataBufferAppend((char) value);
2182            } else if (value <= 0x0010ffff) {
2183                value -= 0x10000;
2184                // > 16 bits, surrogate needed
2185                dataBufferAppend((char) (0xd800 | (value >> 10)));
2186                dataBufferAppend((char) (0xdc00 | (value & 0x0003ff)));
2187            } else {
2188                // too big for surrogate
2189                fatal("character reference " + value + " is too large for UTF-16",
2190                        new Integer(value).toString(), null);
2191            }
2192            if (doFlush) {
2193                dataBufferFlush();
2194            }
2195        }
2196    
2197        /**
2198         * Parse and expand an entity reference.
2199         * 
2200         * <pre>
2201         *  [68] EntityRef ::= '&amp;' Name ';'
2202         * </pre>
2203         * 
2204         * <p>
2205         * NOTE: the '&amp;' has already been read.
2206         * 
2207         * @param externalAllowed
2208         *            External entities are allowed here.
2209         */
2210        private void parseEntityRef(boolean externalAllowed) throws SAXException,
2211                IOException {
2212            String name;
2213    
2214            name = readNmtoken(true);
2215            require(';');
2216            switch (getEntityType(name)) {
2217                case ENTITY_UNDECLARED:
2218                    // NOTE: XML REC describes amazingly convoluted handling for
2219                    // this case. Nothing as meaningful as being a WFness error
2220                    // unless the processor might _legitimately_ not have seen a
2221                    // declaration ... which is what this implements.
2222                    String message;
2223    
2224                    message = "reference to undeclared general entity " + name;
2225                    if (skippedPE && !docIsStandalone) {
2226                        handler.verror(message);
2227                        // we don't know this entity, and it might be external...
2228                        if (externalAllowed) {
2229                            handler.skippedEntity(name);
2230                        }
2231                    } else {
2232                        fatal(message);
2233                    }
2234                    break;
2235                case ENTITY_INTERNAL:
2236                    pushString(name, getEntityValue(name));
2237    
2238                    // workaround for possible input pop before marking
2239                    // the buffer reading position
2240                    char t = readCh();
2241                    unread(t);
2242                    int bufferPosMark = readBufferPos;
2243    
2244                    int end = readBufferPos + getEntityValue(name).length();
2245                    for (int k = readBufferPos; k < end; k++) {
2246                        t = readCh();
2247                        if (t == '&') {
2248                            t = readCh();
2249                            if (t == '#') {
2250                                // try to match a character ref
2251                                tryReadCharRef();
2252    
2253                                // everything has been read
2254                                if (readBufferPos >= end) {
2255                                    break;
2256                                }
2257                                k = readBufferPos;
2258                                continue;
2259                            } else if (Character.isLetter(t)) {
2260                                // looks like an entity ref
2261                                unread(t);
2262                                readNmtoken(true);
2263                                require(';');
2264    
2265                                // everything has been read
2266                                if (readBufferPos >= end) {
2267                                    break;
2268                                }
2269                                k = readBufferPos;
2270                                continue;
2271                            }
2272                            fatal(" malformed entity reference");
2273                        }
2274    
2275                    }
2276                    readBufferPos = bufferPosMark;
2277                    break;
2278                case ENTITY_TEXT:
2279                    if (externalAllowed) {
2280                        pushURL(false, name, getEntityIds(name), null, null, null,
2281                                true);
2282                    } else {
2283                        fatal("reference to external entity in attribute value.",
2284                                name, null);
2285                    }
2286                    break;
2287                case ENTITY_NDATA:
2288                    if (externalAllowed) {
2289                        fatal("unparsed entity reference in content", name, null);
2290                    } else {
2291                        fatal("reference to external entity in attribute value.",
2292                                name, null);
2293                    }
2294                    break;
2295                default:
2296                    throw new RuntimeException();
2297            }
2298        }
2299    
2300        /**
2301         * Parse and expand a parameter entity reference.
2302         * 
2303         * <pre>
2304         *  [69] PEReference ::= '%' Name ';'
2305         * </pre>
2306         * 
2307         * <p>
2308         * NOTE: the '%' has already been read.
2309         */
2310        private void parsePEReference() throws SAXException, IOException {
2311            String name;
2312    
2313            name = "%" + readNmtoken(true);
2314            require(';');
2315            switch (getEntityType(name)) {
2316                case ENTITY_UNDECLARED:
2317                    // VC: Entity Declared
2318                    handler.verror("reference to undeclared parameter entity "
2319                            + name);
2320    
2321                    // we should disable handling of all subsequent declarations
2322                    // unless this is a standalone document (info discarded)
2323                    break;
2324                case ENTITY_INTERNAL:
2325                    if (inLiteral) {
2326                        pushString(name, getEntityValue(name));
2327                    } else {
2328                        pushString(name, ' ' + getEntityValue(name) + ' ');
2329                    }
2330                    break;
2331                case ENTITY_TEXT:
2332                    if (!inLiteral) {
2333                        pushString(null, " ");
2334                    }
2335                    pushURL(true, name, getEntityIds(name), null, null, null, true);
2336                    if (!inLiteral) {
2337                        pushString(null, " ");
2338                    }
2339                    break;
2340            }
2341        }
2342    
2343        /**
2344         * Parse an entity declaration.
2345         * 
2346         * <pre>
2347         *  [70] EntityDecl ::= GEDecl | PEDecl
2348         *  [71] GEDecl ::= '&lt;!ENTITY' S Name S EntityDef S? '&gt;'
2349         *  [72] PEDecl ::= '&lt;!ENTITY' S '%' S Name S PEDef S? '&gt;'
2350         *  [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
2351         *  [74] PEDef ::= EntityValue | ExternalID
2352         *  [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2353         *        | 'PUBLIC' S PubidLiteral S SystemLiteral
2354         *  [76] NDataDecl ::= S 'NDATA' S Name
2355         * </pre>
2356         * 
2357         * <p>
2358         * NOTE: the '&lt;!ENTITY' has already been read.
2359         */
2360        private void parseEntityDecl() throws Exception {
2361            boolean peFlag = false;
2362            int flags = 0;
2363    
2364            // Check for a parameter entity.
2365            expandPE = false;
2366            requireWhitespace();
2367            if (tryRead('%')) {
2368                peFlag = true;
2369                requireWhitespace();
2370            }
2371            expandPE = true;
2372    
2373            // Read the entity name, and prepend
2374            // '%' if necessary.
2375            String name = readNmtoken(true);
2376            // NE08
2377            if (name.indexOf(':') >= 0) {
2378                fatal("Illegal character(':') in entity name ", name, null);
2379            }
2380            if (peFlag) {
2381                name = "%" + name;
2382            }
2383    
2384            // Read the entity value.
2385            requireWhitespace();
2386            char c = readCh();
2387            unread(c);
2388            if (c == '"' || c == '\'') {
2389                // Internal entity ... replacement text has expanded refs
2390                // to characters and PEs, but not to general entities
2391                String value = readLiteral(flags);
2392                setInternalEntity(name, value);
2393            } else {
2394                // Read the external IDs
2395                ExternalIdentifiers ids = readExternalIds(false, false);
2396    
2397                // Check for NDATA declaration.
2398                boolean white = tryWhitespace();
2399                if (!peFlag && tryRead("NDATA")) {
2400                    if (!white) {
2401                        fatal("whitespace required before NDATA");
2402                    }
2403                    requireWhitespace();
2404                    String notationName = readNmtoken(true);
2405                    if (!skippedPE) {
2406                        setExternalEntity(name, ENTITY_NDATA, ids, notationName);
2407                        handler.unparsedEntityDecl(name, ids.publicId,
2408                                ids.systemId, ids.baseUri, notationName);
2409                    }
2410                } else if (!skippedPE) {
2411                    setExternalEntity(name, ENTITY_TEXT, ids, null);
2412                    handler.getDeclHandler().externalEntityDecl(name, ids.publicId,
2413                            handler.resolveURIs()
2414                            // FIXME: ASSUMES not skipped
2415                            // "false" forces error on bad URI
2416                            ? handler.absolutize(ids.baseUri, ids.systemId, false)
2417                                    : ids.systemId);
2418                }
2419            }
2420    
2421            // Finish the declaration.
2422            skipWhitespace();
2423            require('>');
2424        }
2425    
2426        /**
2427         * Parse a notation declaration.
2428         * 
2429         * <pre>
2430         *  [82] NotationDecl ::= '&lt;!NOTATION' S Name S
2431         *     (ExternalID | PublicID) S? '&gt;'
2432         *  [83] PublicID ::= 'PUBLIC' S PubidLiteral
2433         * </pre>
2434         * 
2435         * <P>
2436         * NOTE: the '&lt;!NOTATION' has already been read.
2437         */
2438        private void parseNotationDecl() throws Exception {
2439            String nname;
2440            ExternalIdentifiers ids;
2441    
2442            requireWhitespace();
2443            nname = readNmtoken(true);
2444            // NE08
2445            if (nname.indexOf(':') >= 0) {
2446                fatal("Illegal character(':') in notation name ", nname, null);
2447            }
2448            requireWhitespace();
2449    
2450            // Read the external identifiers.
2451            ids = readExternalIds(true, false);
2452    
2453            // Register the notation.
2454            setNotation(nname, ids);
2455    
2456            skipWhitespace();
2457            require('>');
2458        }
2459    
2460        /**
2461         * Parse character data.
2462         * 
2463         * <pre>
2464         *  [14] CharData ::= [&circ;&lt;&amp;]* - ([&circ;&lt;&amp;]* ']]&gt;' [&circ;&lt;&amp;]*)
2465         * </pre>
2466         */
2467        private void parseCharData() throws Exception {
                // Scans character data straight out of readBuffer and hands each
                // run to the handler -- as ignorable whitespace or as text --
                // until a '&', a '<' or an illegal "]]>" is reached.
2468            char c;
                // 0 = keep scanning, 1 = clean stop at '&' or '<',
                // 2 = stopped because "]]>" was seen
2469            int state = 0;
2470            boolean pureWhite = false;
2471    
2472            // assert (dataBufferPos == 0);
2473    
2474            // are we expecting pure whitespace? it might be dirty...
2475            if ((currentElementContent == CONTENT_ELEMENTS)
2476                    && !isDirtyCurrentElement) {
2477                pureWhite = true;
2478            }
2479    
2480            // always report right out of readBuffer
2481            // to minimize (pointless) buffer copies
2482            while (true) {
2483                int i;
2484    
                    // Walk the unread part of the buffer; i ends up at the first
                    // character that is NOT part of this run of text.
2485                loop: for (i = readBufferPos; i < readBufferLength; i++) {
2486                    advanceLocation();
2487                    switch (c = readBuffer[i]) {
2488                        case '\n':
2489                            nextCharOnNewLine = true;
2490                            // pureWhite unmodified
2491                            break;
2492                        case '\r': // should not happen!!
2493                        case '\t':
2494                        case ' ':
2495                            // pureWhite unmodified
2496                            break;
2497                        case '&':
2498                        case '<':
2499                            // pureWhite unmodified
2500                            // CLEAN end of text sequence
2501                            state = 1;
2502                            break loop;
2503                        case ']':
2504                            // that's not a whitespace char, and
2505                            // can not terminate pure whitespace either
2506                            pureWhite = false;
                            // only look ahead when both following characters
                            // are already in the buffer
2507                            if ((i + 2) < readBufferLength) {
2508                                if (readBuffer[i + 1] == ']'
2509                                        && readBuffer[i + 2] == '>') {
2510                                    // ERROR end of text sequence
2511                                    state = 2;
2512                                    break loop;
2513                                }
2514                            } else {
2515                                // FIXME missing two end-of-buffer cases:
                                    // a "]]>" whose second or third character lies
                                    // beyond readBufferLength is not detected here
2516                            }
2517                            break;
2518                        default:
                            // Always fatal: anything below U+0020 that was not
                            // handled above, and anything above U+FFFD.
                            // U+007F..U+009F (except U+0085) is fatal only when
                            // xmlVersion == XML_11; otherwise it just draws the
                            // warning below.
2519                            if ((c < 0x0020 || c > 0xFFFD)
2520                                    || ((c >= 0x007f) && (c <= 0x009f)
2521                                            && (c != 0x0085) && xmlVersion == XML_11)) {
2522                                fatal("illegal XML character U+"
2523                                        + Integer.toHexString(c));
2524                            } else if (c >= '\u007F' && c <= '\u009F') // 2006-04-25
2525                                                                        // hsivonen
2526                            {
2527                                handler.warn("Saw a control character: U+00"
2528                                        + Integer.toHexString(c) + ".");
2529                            }
2530                            // that's not a whitespace char
2531                            pureWhite = false;
2532                    }
2533                }
                    // NOTE(review): presumably undoes the last advanceLocation()
                    // above -- confirm against rollbackLocation()
2534                rollbackLocation();
2535                // report characters/whitespace
2536                int length = i - readBufferPos;
2537    
2538                if (length != 0) {
                        // While the handler runs, line/column temporarily hold
                        // linePrev/columnPrev; the real values are put back
                        // afterwards.
2539                    int saveLine = line;
2540                    int saveColumn = column;
2541                    line = linePrev;
2542                    column = columnPrev;
2543                    if (pureWhite) {
2544                        handler.ignorableWhitespace(readBuffer, readBufferPos,
2545                                length);
2546                    } else {
2547                        handler.charData(readBuffer, readBufferPos, length);
2548                    }
2549                    line = saveLine;
2550                    column = saveColumn;
                        // the reported run is now consumed
2551                    readBufferPos = i;
2552                }
2553    
                    // a terminator was found; otherwise the buffer simply ran out
2554                if (state != 0) {
2555                    break;
2556                }
2557    
2558                // fill next buffer from this entity, or
2559                // pop stack and continue with previous entity
2560                unread(readCh());
2561            }
                // anything other than pure whitespace marks the element as dirty
2562            if (!pureWhite) {
2563                isDirtyCurrentElement = true;
2564            }
2565            // finish, maybe with error
2566            if (state != 1) // state 1 is the only clean way out of the loop
2567            {
2568                fatal("character data may not contain ']]>'");
2569            }
2570        }
2571    
2572        /**
2573         * Save the current line/column, then step the location one character on.
2574         */
2575        private void advanceLocation() {
2576            linePrev = line;
2577            columnPrev = column;
2578            if (nextCharOnNewLine) {
2579                line++;
2580                column = 1;
2581            } else {
2582                column++;
2583            }
2584            nextCharOnNewLine = false;
2585        }
2586    
2587        // ////////////////////////////////////////////////////////////////////
2588        // High-level reading and scanning methods.
2589        // ////////////////////////////////////////////////////////////////////
2590    
2591        /**
2592         * Require whitespace characters.
2593         */
2594        private void requireWhitespace() throws SAXException, IOException {
2595            char c = readCh();
2596            if (isWhitespace(c)) {
2597                skipWhitespace();
2598            } else {
2599                fatal("whitespace required", c, null);
2600            }
2601        }
2602    
2603        /**
2604         * Skip whitespace characters.
2605         * 
2606         * <pre>
2607         *  [3] S ::= (#x20 | #x9 | #xd | #xa)+
2608         * </pre>
2609         */
2610        private void skipWhitespace() throws SAXException, IOException {
2611            // Start with a little cheat. Most of
2612            // the time, the white space will fall
2613            // within the current read buffer; if
2614            // not, then fall through.
2615            if (USE_CHEATS) {
2616    
2617                loop: for (int i = readBufferPos; i < readBufferLength; i++) {
2618                    advanceLocation();
2619                    switch (readBuffer[i]) {
2620                        case ' ':
2621                        case '\t':
2622                        case '\r':
2623                            break;
2624                        case '\n':
2625                            nextCharOnNewLine = true;
2626                            break;
2627                        case '%':
2628                            if (expandPE) {
2629                                break loop;
2630                            }
2631                            // else fall through...
2632                        default:
2633                            readBufferPos = i;
2634                            return;
2635                    }
2636                }
2637            }
2638    
2639            // OK, do it the slow way.
2640            char c = readCh();
2641            while (isWhitespace(c)) {
2642                c = readCh();
2643            }
2644            unread(c);
2645        }
2646    
2647        /**
2648         * Read a name or (when parsing an enumeration) name token.
2649         * 
2650         * <pre>
2651         *  [5] Name ::= (Letter | '_' | ':') (NameChar)*
2652         *  [7] Nmtoken ::= (NameChar)+
2653         * </pre>
2654         */
2655        private String readNmtoken(boolean isName) throws SAXException, IOException {
                // Two implementations follow. With USE_CHEATS the token is scanned
                // in place in readBuffer and returned from there. If that scan runs
                // off the end of the buffer, or meets '%' while expandPE is set, it
                // leaves readBufferPos untouched and control drops to the slower
                // loop further down, which reads through readCh() into nameBuffer.
                // isName selects the stricter Name rules for the first character.
2656            char c;
2657    
2658            if (USE_CHEATS) {
2659                loop: for (int i = readBufferPos; i < readBufferLength; i++) {
2660                    c = readBuffer[i];
2661                    switch (c) {
2662                        case '%':
2663                            if (expandPE) {
2664                                break loop;
2665                            }
2666                            // else fall through...
2667    
2668                            // What may legitimately come AFTER a name/nmtoken?
2669                        case '<':
2670                        case '>':
2671                        case '&':
2672                        case ',':
2673                        case '|':
2674                        case '*':
2675                        case '+':
2676                        case '?':
2677                        case ')':
2678                        case '=':
2679                        case '\'':
2680                        case '"':
2681                        case '[':
2682                        case ' ':
2683                        case '\t':
2684                        case '\r':
2685                        case '\n':
2686                        case ';':
2687                        case '/':
                            // Delimiter found: the token is readBuffer[start..i).
2688                            int start = readBufferPos;
2689                            if (i == start) {
2690                                fatal("name expected", readBuffer[i], null);
2691                            }
2692                            readBufferPos = i;
2693                            return intern(readBuffer, start, i - start);
2694    
2695                        default:
2696                            // FIXME ... per IBM's OASIS test submission, these:
2697                            // ? U+06dd
2698                            // Combining U+309B
2699                            // these switches are kind of ugly but at least we won't
2700                            // have to go over the whole list for each char
                            // First character of a Name only: reject individual
                            // code points, dispatching on the high byte of c and
                            // then on bits 4-7 (c2).
2701                            if (isName && i == readBufferPos) {
2702                                char c2 = (char) (c & 0x00f0);
2703                                switch (c & 0xff00) {
2704                                    // starting with 01
2705                                    case 0x0100:
2706                                        switch (c2) {
2707                                            case 0x0030:
2708                                                if (c == 0x0132 || c == 0x0133
2709                                                        || c == 0x013f) {
2710                                                    fatal("Not a name start character, U+"
2711                                                            + Integer.toHexString(c));
2712                                                }
2713                                                break;
2714                                            case 0x0040:
2715                                                if (c == 0x0140 || c == 0x0149) {
2716                                                    fatal("Not a name start character, U+"
2717                                                            + Integer.toHexString(c));
2718                                                }
2719                                                break;
2720                                            case 0x00c0:
2721                                                if (c == 0x01c4 || c == 0x01cc) {
2722                                                    fatal("Not a name start character, U+"
2723                                                            + Integer.toHexString(c));
2724                                                }
2725                                                break;
2726                                            case 0x00f0:
2727                                                if (c == 0x01f1 || c == 0x01f3) {
2728                                                    fatal("Not a name start character, U+"
2729                                                            + Integer.toHexString(c));
2730                                                }
2731                                                break;
2732                                            case 0x00b0:
                                                // NOTE(review): this test can never
                                                // be true -- here c lies in
                                                // U+01B0..U+01BF. It duplicates the
                                                // 0x00f0 case above; the intended
                                                // code points are unknown.
2733                                                if (c == 0x01f1 || c == 0x01f3) {
2734                                                    fatal("Not a name start character, U+"
2735                                                            + Integer.toHexString(c));
2736                                                }
2737                                                break;
2738                                            default:
2739                                                if (c == 0x017f) {
2740                                                    fatal("Not a name start character, U+"
2741                                                            + Integer.toHexString(c));
2742                                                }
2743                                        }
2744    
2745                                        break;
2746                                    // starting with 11
2747                                    case 0x1100:
2748                                        switch (c2) {
2749                                            case 0x0000:
2750                                                if (c == 0x1104 || c == 0x1108
2751                                                        || c == 0x110a
2752                                                        || c == 0x110d) {
2753                                                    fatal("Not a name start character, U+"
2754                                                            + Integer.toHexString(c));
2755                                                }
2756                                                break;
2757                                            case 0x0030:
2758                                                if (c == 0x113b || c == 0x113f) {
2759                                                    fatal("Not a name start character, U+"
2760                                                            + Integer.toHexString(c));
2761                                                }
2762                                                break;
2763                                            case 0x0040:
2764                                                if (c == 0x1141 || c == 0x114d
2765                                                        || c == 0x114f) {
2766                                                    fatal("Not a name start character, U+"
2767                                                            + Integer.toHexString(c));
2768                                                }
2769                                                break;
2770                                            case 0x0050:
2771                                                if (c == 0x1151 || c == 0x1156) {
2772                                                    fatal("Not a name start character, U+"
2773                                                            + Integer.toHexString(c));
2774                                                }
2775                                                break;
2776                                            case 0x0060:
2777                                                if (c == 0x1162 || c == 0x1164
2778                                                        || c == 0x1166
2779                                                        || c == 0x116b
2780                                                        || c == 0x116f) {
2781                                                    fatal("Not a name start character, U+"
2782                                                            + Integer.toHexString(c));
2783                                                }
2784                                                break;
2785                                            case 0x00b0:
                                                // NOTE(review): the last operand,
                                                // c == 0x116f, cannot match here (c
                                                // is in U+11B0..U+11BF); U+116F is
                                                // already tested in the 0x0060 case.
2786                                                if (c == 0x11b6 || c == 0x11b9
2787                                                        || c == 0x11bb
2788                                                        || c == 0x116f) {
2789                                                    fatal("Not a name start character, U+"
2790                                                            + Integer.toHexString(c));
2791                                                }
2792                                                break;
2793                                            default:
2794                                                if (c == 0x1174 || c == 0x119f
2795                                                        || c == 0x11ac
2796                                                        || c == 0x11c3
2797                                                        || c == 0x11f1) {
2798                                                    fatal("Not a name start character, U+"
2799                                                            + Integer.toHexString(c));
2800                                                }
2801                                        }
2802                                        break;
2803                                    default:
2804                                        if (c == 0x0e46 || c == 0x1011
2805                                                || c == 0x212f || c == 0x0587
2806                                                || c == 0x0230) {
2807                                            fatal("Not a name start character, U+"
2808                                                    + Integer.toHexString(c));
2809                                        }
2810                                }
2811                            }
2812                            // punt on exact tests from Appendix A; approximate
2813                            // them using the Unicode ID start/part rules
2814                            if (i == readBufferPos && isName) {
2815                                if (!Character.isUnicodeIdentifierStart(c)
2816                                        && c != ':' && c != '_') {
2817                                    fatal("Not a name start character, U+"
2818                                            + Integer.toHexString(c));
2819                                }
2820                            } else if (!Character.isUnicodeIdentifierPart(c)
2821                                    && c != '-' && c != ':' && c != '_' && c != '.'
2822                                    && !isExtender(c)) {
2823                                fatal("Not a name character, U+"
2824                                        + Integer.toHexString(c));
2825                            }
2826                    }
2827                }
2828            }
2829    
                // Slow path: collect the token in nameBuffer, starting from empty.
2830            nameBufferPos = 0;
2831    
2832            // Read one character at a time until a delimiter turns up.
2833            loop: while (true) {
2834                c = readCh();
2835                switch (c) {
                        // unlike the fast path, '%' always ends the token here
2836                    case '%':
2837                    case '<':
2838                    case '>':
2839                    case '&':
2840                    case ',':
2841                    case '|':
2842                    case '*':
2843                    case '+':
2844                    case '?':
2845                    case ')':
2846                    case '=':
2847                    case '\'':
2848                    case '"':
2849                    case '[':
2850                    case ' ':
2851                    case '\t':
2852                    case '\n':
2853                    case '\r':
2854                    case ';':
2855                    case '/':
                            // the delimiter is not part of the token: push it back
2856                        unread(c);
2857                        if (nameBufferPos == 0) {
2858                            fatal("name expected");
2859                        }
2860                        // punt on exact tests from Appendix A, but approximate them
                            // (the first character of a Name is only checked now,
                            // once the whole token has been collected)
2861                        if (isName
2862                                && !Character.isUnicodeIdentifierStart(nameBuffer[0])
2863                                && ":_".indexOf(nameBuffer[0]) == -1) {
2864                            fatal("Not a name start character, U+"
2865                                    + Integer.toHexString(nameBuffer[0]));
2866                        }
2867                        String s = intern(nameBuffer, 0, nameBufferPos);
2868                        nameBufferPos = 0;
2869                        return s;
2870                    default:
2871                        // punt on exact tests from Appendix A, but approximate them
2872    
                            // applied to every character except the first one of
                            // a Name, which is checked at the delimiter above
2873                        if ((nameBufferPos != 0 || !isName)
2874                                && !Character.isUnicodeIdentifierPart(c)
2875                                && ":-_.".indexOf(c) == -1 && !isExtender(c)) {
2876                            fatal("Not a name character, U+"
2877                                    + Integer.toHexString(c));
2878                        }
                            // nameBuffer is full: replace it with the array that
                            // extendArray() returns before storing c
2879                        if (nameBufferPos >= nameBuffer.length) {
2880                            nameBuffer = (char[]) extendArray(nameBuffer,
2881                                    nameBuffer.length, nameBufferPos);
2882                        }
2883                        nameBuffer[nameBufferPos++] = c;
2884                }
2885            }
2886        }
2887    
2888        private static boolean isExtender(char c) {
2889            // [88] Extender ::= ...
2890            return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
2891                    || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
2892                    || (c >= 0x3031 && c <= 0x3035) || (c >= 0x309d && c <= 0x309e)
2893                    || (c >= 0x30fc && c <= 0x30fe);
2894        }
2895    
2896        /**
2897         * Read a literal. With matching single or double quotes as delimiters (and
2898         * not embedded!) this is used to parse:
2899         * 
2900         * <pre>
2901         *   [9] EntityValue ::= ... ([&circ;%&amp;] | PEReference | Reference)* ...
2902         *   [10] AttValue ::= ... ([&circ;&lt;&amp;] | Reference)* ...
2903         *   [11] SystemLiteral ::= ... (URLchar - &quot;'&quot;)* ...
2904         *   [12] PubidLiteral ::= ... (PubidChar - &quot;'&quot;)* ...
2905         * </pre>
2906         * 
2907         * as well as the quoted strings in XML and text declarations (for version,
2908         * encoding, and standalone) which have their own constraints.
2909         */
2910        private String readLiteral(int flags) throws SAXException, IOException {
                // Reads one quoted literal and returns its text with the delimiters
                // removed. The LIT_* bits in flags select whitespace normalization
                // and how character, general-entity and parameter-entity references
                // inside the literal are treated. The text is accumulated in the
                // data buffer and returned via dataBufferToString().
2911            char delim, c;
                // remembered only for the end-of-input error message
2912            int startLine = line;
                // parser state that is changed below and restored before returning
2913            boolean saved = expandPE;
2914            boolean savedReport = doReport;
2915    
2916            // Find the first delimiter.
2917            delim = readCh();
2918            if (delim != '"' && delim != '\'') {
2919                fatal("expected '\"' or \"'\"", delim, null);
                    // only reached if fatal() returns normally
2920                return null;
2921            }
2922            inLiteral = true;
2923            if ((flags & LIT_DISABLE_PE) != 0) {
2924                expandPE = false;
2925            }
2926            doReport = false;
2927    
2928            // Each level of input source has its own buffer; remember
2929            // ours, so we won't read the ending delimiter from any
2930            // other input source, regardless of entity processing.
2931            char[] ourBuf = readBuffer;
2932    
2933            // Read the literal.
2934            try {
2935                c = readCh();
                    // stop only at the matching quote, and only when it is read
                    // from the buffer the opening quote came from
2936                loop: while (!(c == delim && readBuffer == ourBuf)) {
2937                    switch (c) {
2938                        // attributes and public ids are normalized
2939                        // in almost the same ways
2940                        case '\n':
2941                        case '\r':
2942                            if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0) {
2943                                c = ' ';
2944                            }
2945                            break;
2946                        case '\t':
2947                            if ((flags & LIT_ATTRIBUTE) != 0) {
2948                                c = ' ';
2949                            }
2950                            break;
2951                        case '&':
2952                            c = readCh();
2953                            // Char refs are expanded immediately, except for
2954                            // all the cases where it's deferred.
2955                            if (c == '#') {
2956                                if ((flags & LIT_DISABLE_CREF) != 0) {
                                        // keep "&#" literally: '&' is appended
                                        // here and the '#' held in c is appended
                                        // after the switch
2957                                    dataBufferAppend('&');
2958                                    break;
2959                                }
2960                                parseCharRef(false /* Do not do flushDataBuffer */);
2961    
2962                                // exotic WFness risk: this is an entity literal,
2963                                // dataBuffer [dataBufferPos - 1] == '&', and
2964                                // following chars are a _partial_ entity/char ref
2965    
2966                                // It looks like an entity ref ...
2967                            } else {
                                    // give back the character after '&' so the
                                    // code below sees the reference name from
                                    // its start
2968                                unread(c);
2969                                // Expand it?
2970                                if ((flags & LIT_ENTITY_REF) > 0) {
2971                                    parseEntityRef(false);
2972                                    // Is it just data?
2973                                } else if ((flags & LIT_DISABLE_EREF) != 0) {
2974                                    dataBufferAppend('&');
2975    
2976                                    // OK, it will be an entity ref -- expanded
2977                                    // later.
2978                                } else {
                                        // check that it is a well-formed "&name;"
                                        // and copy it through unchanged
2979                                    String name = readNmtoken(true);
2980                                    require(';');
2981                                    dataBufferAppend('&');
2982                                    dataBufferAppend(name);
2983                                    dataBufferAppend(';');
2984                                }
2985                            }
                                // the reference has been dealt with: fetch the
                                // next character and skip the append after the
                                // switch
2986                            c = readCh();
2987                            continue loop;
2988    
2989                        case '<':
2990                            // and why? Perhaps so "&foo;" expands the same
2991                            // inside and outside an attribute?
2992                            if ((flags & LIT_ATTRIBUTE) != 0) {
2993                                fatal("attribute values may not contain '<'");
2994                            }
2995                            break;
2996    
2997                        // We don't worry about case '%' and PE refs, readCh does.
2998    
2999                        default:
3000                            break;
3001                    }
                        // ordinary (possibly normalized) character: keep it
3002                    dataBufferAppend(c);
3003                    c = readCh();
3004                }
3005            } catch (EOFException e) {
3006                fatal("end of input while looking for delimiter (started on line "
3007                        + startLine + ')', null, new Character(delim).toString());
3008            }
                // NOTE(review): these restores are not in a finally block, so they
                // are skipped when an exception other than EOFException leaves the
                // try block above.
3009            inLiteral = false;
3010            expandPE = saved;
3011            doReport = savedReport;
3012    
3013            // Normalise whitespace if necessary.
3014            if ((flags & LIT_NORMALIZE) > 0) {
3015                dataBufferNormalize();
3016            }
3017    
3018            // Return the value.
3019            return dataBufferToString();
3020        }
3021    
3022        /**
3023         * Try reading external identifiers. A system identifier is not required for
3024         * notations.
3025         * 
3026         * @param inNotation
3027         *            Are we parsing a notation decl?
3028         * @param isSubset
3029         *            Parsing external subset decl (may be omitted)?
3030         * @return An ExternalIdentifiers object; its publicId, systemId and
3031         *         baseUri members are only assigned for identifiers that were read.
3032         */
3033        private ExternalIdentifiers readExternalIds(boolean inNotation,
3034                boolean isSubset) throws Exception {
3035            char c;
3036            ExternalIdentifiers ids = new ExternalIdentifiers();
3037            int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
3038    
3039            if (tryRead("PUBLIC")) {
3040                requireWhitespace();
3041                ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags);
3042                if (inNotation) {
3043                    skipWhitespace();
3044                    c = readCh();
3045                    unread(c);
3046                    if (c == '"' || c == '\'') {
3047                        ids.systemId = readLiteral(flags);
3048                    }
3049                } else {
3050                    requireWhitespace();
3051                    ids.systemId = readLiteral(flags);
3052                }
3053    
3054                for (int i = 0; i < ids.publicId.length(); i++) {
3055                    c = ids.publicId.charAt(i);
3056                    if (c >= 'a' && c <= 'z') {
3057                        continue;
3058                    }
3059                    if (c >= 'A' && c <= 'Z') {
3060                        continue;
3061                    }
3062                    if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1) {
3063                        continue;
3064                    }
3065                    fatal("illegal PUBLIC id character U+" + Integer.toHexString(c));
3066                }
3067            } else if (tryRead("SYSTEM")) {
3068                requireWhitespace();
3069                ids.systemId = readLiteral(flags);
3070            } else if (!isSubset) {
3071                fatal("missing SYSTEM or PUBLIC keyword");
3072            }
3073    
3074            if (ids.systemId != null) {
3075                if (ids.systemId.indexOf('#') != -1) {
3076                    handler.verror("SYSTEM id has a URI fragment: " + ids.systemId);
3077                }
3078                ids.baseUri = handler.getSystemId();
3079                if (ids.baseUri == null && uriWarnings) {
3080                    handler.warn("No base URI; hope URI is absolute: "
3081                            + ids.systemId);
3082                }
3083            }
3084    
3085            return ids;
3086        }
3087    
3088        /**
3089         * Test if a character is whitespace.
3090         * 
3091         * <pre>
3092         *  [3] S ::= (#x20 | #x9 | #xd | #xa)+
3093         * </pre>
3094         * 
3095         * @param c
3096         *            The character to test.
3097         * @return true if the character is whitespace.
3098         */
3099        private final boolean isWhitespace(char c) {
3100            if (c > 0x20) {
3101                return false;
3102            }
3103            if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d) {
3104                return true;
3105            }
3106            return false; // illegal ...
3107        }
3108    
3109        // ////////////////////////////////////////////////////////////////////
3110        // Utility routines.
3111        // ////////////////////////////////////////////////////////////////////
3112    
3113        /**
3114         * Add a character to the data buffer.
3115         */
3116        private void dataBufferAppend(char c) {
3117            // Expand buffer if necessary.
3118            if (dataBufferPos >= dataBuffer.length) {
3119                dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
3120                        dataBufferPos);
3121            }
3122            dataBuffer[dataBufferPos++] = c;
3123        }
3124    
3125        /**
3126         * Add a string to the data buffer.
3127         */
3128        private void dataBufferAppend(String s) {
3129            dataBufferAppend(s.toCharArray(), 0, s.length());
3130        }
3131    
3132        /**
3133         * Append (part of) a character array to the data buffer.
3134         */
3135        private void dataBufferAppend(char[] ch, int start, int length) {
3136            dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
3137                    dataBufferPos + length);
3138    
3139            System.arraycopy(ch, start, dataBuffer, dataBufferPos, length);
3140            dataBufferPos += length;
3141        }
3142    
3143        /**
3144         * Normalise space characters in the data buffer.
3145         */
3146        private void dataBufferNormalize() {
3147            int i = 0;
3148            int j = 0;
3149            int end = dataBufferPos;
3150    
3151            // Skip spaces at the start.
3152            while (j < end && dataBuffer[j] == ' ') {
3153                j++;
3154            }
3155    
3156            // Skip whitespace at the end.
3157            while (end > j && dataBuffer[end - 1] == ' ') {
3158                end--;
3159            }
3160    
3161            // Start copying to the left.
3162            while (j < end) {
3163    
3164                char c = dataBuffer[j++];
3165    
3166                // Normalise all other spaces to
3167                // a single space.
3168                if (c == ' ') {
3169                    while (j < end && dataBuffer[j++] == ' ') {
3170                        continue;
3171                    }
3172                    dataBuffer[i++] = ' ';
3173                    dataBuffer[i++] = dataBuffer[j - 1];
3174                } else {
3175                    dataBuffer[i++] = c;
3176                }
3177            }
3178    
3179            // The new length is <= the old one.
3180            dataBufferPos = i;
3181        }
3182    
3183        /**
3184         * Convert the data buffer to a string.
3185         */
3186        private String dataBufferToString() {
3187            String s = new String(dataBuffer, 0, dataBufferPos);
3188            dataBufferPos = 0;
3189            return s;
3190        }
3191    
3192        /**
3193         * Flush the contents of the data buffer to the handler, as appropriate, and
3194         * reset the buffer for new input.
3195         */
3196        private void dataBufferFlush() throws SAXException {
3197            int saveLine = line;
3198            int saveColumn = column;
3199            line = linePrev;
3200            column = columnPrev;
3201            if (currentElementContent == CONTENT_ELEMENTS && dataBufferPos > 0
3202                    && !inCDATA) {
3203                // We can't just trust the buffer to be whitespace, there
3204                // are (error) cases when it isn't
3205                for (int i = 0; i < dataBufferPos; i++) {
3206                    if (!isWhitespace(dataBuffer[i])) {
3207                        handler.charData(dataBuffer, 0, dataBufferPos);
3208                        dataBufferPos = 0;
3209                    }
3210                }
3211                if (dataBufferPos > 0) {
3212                    handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos);
3213                    dataBufferPos = 0;
3214                }
3215            } else if (dataBufferPos > 0) {
3216                handler.charData(dataBuffer, 0, dataBufferPos);
3217                dataBufferPos = 0;
3218            }
3219            line = saveLine;
3220            column = saveColumn;
3221        }
3222    
3223        /**
3224         * Require a string to appear, or throw an exception.
3225         * <p>
3226         * <em>Precondition:</em> Entity expansion is not required.
3227         * <p>
3228         * <em>Precondition:</em> data buffer has no characters that will get sent
3229         * to the application.
3230         */
3231        private void require(String delim) throws SAXException, IOException {
3232            int length = delim.length();
3233            char[] ch;
3234    
3235            if (length < dataBuffer.length) {
3236                ch = dataBuffer;
3237                delim.getChars(0, length, ch, 0);
3238            } else {
3239                ch = delim.toCharArray();
3240            }
3241    
3242            if (USE_CHEATS && length <= (readBufferLength - readBufferPos)) {
3243                int offset = readBufferPos;
3244    
3245                for (int i = 0; i < length; i++, offset++) {
3246                    if (ch[i] != readBuffer[offset]) {
3247                        fatal("required string", null, delim);
3248                    }
3249                }
3250                readBufferPos = offset;
3251    
3252            } else {
3253                for (int i = 0; i < length; i++) {
3254                    require(ch[i]);
3255                }
3256            }
3257        }
3258    
3259        /**
3260         * Require a character to appear, or throw an exception.
3261         */
3262        private void require(char delim) throws SAXException, IOException {
3263            char c = readCh();
3264    
3265            if (c != delim) {
3266                fatal("required character", c, new Character(delim).toString());
3267            }
3268        }
3269    
3270        /**
3271         * Create an interned string from a character array. &AElig;lfred uses this
3272         * method to create an interned version of all names and name tokens, so
3273         * that it can test equality with <code>==</code> instead of
3274         * <code>String.equals ()</code>.
3275         * 
3276         * <p>
3277         * This is much more efficient than constructing a non-interned string
3278         * first, and then interning it.
3279         * 
3280         * @param ch
3281         *            an array of characters for building the string.
3282         * @param start
3283         *            the starting position in the array.
3284         * @param length
3285         *            the number of characters to place in the string.
3286         * @return an interned string.
3287         * @see #intern (String)
3288         * @see java.lang.String#intern
3289         */
3290        public String intern(char[] ch, int start, int length) {
3291            int index = 0;
3292            int hash = 0;
3293            Object[] bucket;
3294    
3295            // Generate a hash code. This is a widely used string hash,
3296            // often attributed to Brian Kernighan.
3297            for (int i = start; i < start + length; i++) {
3298                hash = 31 * hash + ch[i];
3299            }
3300            hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
3301    
3302            // Get the bucket -- consists of {array,String} pairs
3303            if ((bucket = symbolTable[hash]) == null) {
3304                // first string in this bucket
3305                bucket = new Object[8];
3306    
3307                // Search for a matching tuple, and
3308                // return the string if we find one.
3309            } else {
3310                while (index < bucket.length) {
3311                    char[] chFound = (char[]) bucket[index];
3312    
3313                    // Stop when we hit an empty entry.
3314                    if (chFound == null) {
3315                        break;
3316                    }
3317    
3318                    // If they're the same length, check for a match.
3319                    if (chFound.length == length) {
3320                        for (int i = 0; i < chFound.length; i++) {
3321                            // continue search on failure
3322                            if (ch[start + i] != chFound[i]) {
3323                                break;
3324                            } else if (i == length - 1) {
3325                                // That's it, we have a match!
3326                                return (String) bucket[index + 1];
3327                            }
3328                        }
3329                    }
3330                    index += 2;
3331                }
3332                // Not found -- we'll have to add it.
3333    
3334                // Do we have to grow the bucket?
3335                bucket = (Object[]) extendArray(bucket, bucket.length, index);
3336            }
3337            symbolTable[hash] = bucket;
3338    
3339            // OK, add it to the end of the bucket -- "local" interning.
3340            // Intern "globally" to let applications share interning benefits.
3341            // That is, "!=" and "==" work on our strings, not just equals().
3342            String s = new String(ch, start, length).intern();
3343            bucket[index] = s.toCharArray();
3344            bucket[index + 1] = s;
3345            return s;
3346        }
3347    
3348        /**
3349         * Ensure the capacity of an array, allocating a new one if necessary.
3350         * Usually extends only for name hash collisions.
3351         */
3352        private Object extendArray(Object array, int currentSize, int requiredSize) {
3353            if (requiredSize < currentSize) {
3354                return array;
3355            } else {
3356                System.err.println(requiredSize);
3357                System.err.flush();
3358                Object newArray = null;
3359                int newSize = currentSize * 2;
3360    
3361                if (newSize <= requiredSize) {
3362                    newSize = requiredSize + 1;
3363                }
3364    
3365                if (array instanceof char[]) {
3366                    newArray = new char[newSize];
3367                } else if (array instanceof Object[]) {
3368                    newArray = new Object[newSize];
3369                } else {
3370                    throw new RuntimeException();
3371                }
3372    
3373                System.arraycopy(array, 0, newArray, 0, currentSize);
3374                return newArray;
3375            }
3376        }
3377    
3378        // ////////////////////////////////////////////////////////////////////
3379        // XML query routines.
3380        // ////////////////////////////////////////////////////////////////////
3381    
3382        boolean isStandalone() {
3383            return docIsStandalone;
3384        }
3385    
3386        //
3387        // Elements
3388        //
3389    
3390        private int getContentType(ElementDecl element, int defaultType) {
3391            int retval;
3392    
3393            if (element == null) {
3394                return defaultType;
3395            }
3396            retval = element.contentType;
3397            if (retval == CONTENT_UNDECLARED) {
3398                retval = defaultType;
3399            }
3400            return retval;
3401        }
3402    
3403        /**
3404         * Look up the content type of an element.
3405         * 
3406         * @param name
3407         *            The element type name.
3408         * @return An integer constant representing the content type.
3409         * @see #CONTENT_UNDECLARED
3410         * @see #CONTENT_ANY
3411         * @see #CONTENT_EMPTY
3412         * @see #CONTENT_MIXED
3413         * @see #CONTENT_ELEMENTS
3414         */
3415        public int getElementContentType(String name) {
3416            ElementDecl element = elementInfo.get(name);
3417            return getContentType(element, CONTENT_UNDECLARED);
3418        }
3419    
3420        /**
3421         * Register an element. Array format: [0] element type name [1] content
3422         * model (mixed, elements only) [2] attribute hash table
3423         */
3424        private void setElement(String name, int contentType, String contentModel,
3425                HashMap<String, AttributeDecl> attributes) throws SAXException {
3426            if (skippedPE) {
3427                return;
3428            }
3429    
3430            ElementDecl element = elementInfo.get(name);
3431    
3432            // first <!ELEMENT ...> or <!ATTLIST ...> for this type?
3433            if (element == null) {
3434                element = new ElementDecl();
3435                element.contentType = contentType;
3436                element.contentModel = contentModel;
3437                element.attributes = attributes;
3438                elementInfo.put(name, element);
3439                return;
3440            }
3441    
3442            // <!ELEMENT ...> declaration?
3443            if (contentType != CONTENT_UNDECLARED) {
3444                // ... following an associated <!ATTLIST ...>
3445                if (element.contentType == CONTENT_UNDECLARED) {
3446                    element.contentType = contentType;
3447                    element.contentModel = contentModel;
3448                } else {
3449                    // VC: Unique Element Type Declaration
3450                    handler.verror("multiple declarations for element type: "
3451                            + name);
3452                }
3453            }
3454    
3455            // first <!ATTLIST ...>, before <!ELEMENT ...> ?
3456            else if (attributes != null) {
3457                element.attributes = attributes;
3458            }
3459        }
3460    
3461        /**
3462         * Look up the attribute hash table for an element. The hash table is the
3463         * second item in the element array.
3464         */
3465        private HashMap<String, AttributeDecl> getElementAttributes(String name) {
3466            ElementDecl element = elementInfo.get(name);
3467            return (element == null) ? null : element.attributes;
3468        }
3469    
3470        //
3471        // Attributes
3472        //
3473    
3474        /**
3475         * Get the declared attributes for an element type.
3476         * 
3477         * @param elname
3478         *            The name of the element type.
3479         * @return An iterator over all the attributes declared for a specific
3480         *         element type. The results will be valid only after the DTD (if
3481         *         any) has been parsed.
3482         * @see #getAttributeType
3483         * @see #getAttributeEnumeration
3484         * @see #getAttributeDefaultValueType
3485         * @see #getAttributeDefaultValue
3486         * @see #getAttributeExpandedValue
3487         */
3488        private Iterator<String> declaredAttributes(ElementDecl element) {
3489            HashMap<String, AttributeDecl> attlist;
3490    
3491            if (element == null) {
3492                return null;
3493            }
3494            if ((attlist = element.attributes) == null) {
3495                return null;
3496            }
3497            return attlist.keySet().iterator();
3498        }
3499    
3500        /**
3501         * Get the declared attributes for an element type.
3502         * 
3503         * @param elname
3504         *            The name of the element type.
3505         * @return An iterator over all the attributes declared for a specific
3506         *         element type. The results will be valid only after the DTD (if
3507         *         any) has been parsed.
3508         * @see #getAttributeType
3509         * @see #getAttributeEnumeration
3510         * @see #getAttributeDefaultValueType
3511         * @see #getAttributeDefaultValue
3512         * @see #getAttributeExpandedValue
3513         */
3514        public Iterator<String> declaredAttributes(String elname) {
3515            return declaredAttributes(elementInfo.get(elname));
3516        }
3517    
3518        /**
3519         * Retrieve the declared type of an attribute.
3520         * 
3521         * @param name
3522         *            The name of the associated element.
3523         * @param aname
3524         *            The name of the attribute.
3525         * @return An interend string denoting the type, or null indicating an
3526         *         undeclared attribute.
3527         */
3528        public String getAttributeType(String name, String aname) {
3529            AttributeDecl attribute = getAttribute(name, aname);
3530            return (attribute == null) ? null : attribute.type;
3531        }
3532    
3533        /**
3534         * Retrieve the allowed values for an enumerated attribute type.
3535         * 
3536         * @param name
3537         *            The name of the associated element.
3538         * @param aname
3539         *            The name of the attribute.
3540         * @return A string containing the token list.
3541         */
3542        public String getAttributeEnumeration(String name, String aname) {
3543            AttributeDecl attribute = getAttribute(name, aname);
3544            // assert: attribute.enumeration is "ENUMERATION" or "NOTATION"
3545            return (attribute == null) ? null : attribute.enumeration;
3546        }
3547    
3548        /**
3549         * Retrieve the default value of a declared attribute.
3550         * 
3551         * @param name
3552         *            The name of the associated element.
3553         * @param aname
3554         *            The name of the attribute.
3555         * @return The default value, or null if the attribute was #IMPLIED or
3556         *         simply undeclared and unspecified.
3557         * @see #getAttributeExpandedValue
3558         */
3559        public String getAttributeDefaultValue(String name, String aname) {
3560            AttributeDecl attribute = getAttribute(name, aname);
3561            return (attribute == null) ? null : attribute.value;
3562        }
3563    
3564        /*
     * FIXME: Leaving this in, until W3C finally resolves the confusion
     * between parts of the XML 2nd REC about when entity declarations are
     * guaranteed to be known. Current code matches what section 5.1
     * (conformance) describes, but some readings of the self-contradicting
     * text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that
     * attribute expansion/normalization must be deferred in some cases (just
     * TRY to identify them!).
3572         * 
3573         * Retrieve the expanded value of a declared attribute. <p>General entities
3574         * (and char refs) will be expanded (once). @param name The name of the
3575         * associated element. @param aname The name of the attribute. @return The
3576         * expanded default value, or null if the attribute was #IMPLIED or simply
3577         * undeclared
3578         * 
3579         * @see #getAttributeDefaultValue public String getAttributeExpandedValue
3580         *      (String name, String aname) throws Exception { AttributeDecl
3581         *      attribute = getAttribute (name, aname);
3582         * 
3583         * if (attribute == null) { return null; } else if (attribute.defaultValue ==
3584         * null && attribute.value != null) { // we MUST use the same buf for both
3585         * quotes else the literal // can't be properly terminated char buf [] = new
3586         * char [1]; int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE; String type =
3587         * getAttributeType (name, aname);
3588         * 
3589         * if (type != "CDATA" && type != null) flags |= LIT_NORMALIZE; buf [0] =
3590         * '"'; pushCharArray (null, buf, 0, 1); pushString (null, attribute.value);
3591         * pushCharArray (null, buf, 0, 1); attribute.defaultValue = readLiteral
3592         * (flags); } return attribute.defaultValue; }
3593         */
3594    
3595        /**
3596         * Retrieve the default value mode of a declared attribute.
3597         * 
3598         * @see #ATTRIBUTE_DEFAULT_SPECIFIED
3599         * @see #ATTRIBUTE_DEFAULT_IMPLIED
3600         * @see #ATTRIBUTE_DEFAULT_REQUIRED
3601         * @see #ATTRIBUTE_DEFAULT_FIXED
3602         */
3603        public int getAttributeDefaultValueType(String name, String aname) {
3604            AttributeDecl attribute = getAttribute(name, aname);
3605            return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED
3606                    : attribute.valueType;
3607        }
3608    
3609        /**
3610         * Register an attribute declaration for later retrieval. Format: - String
3611         * type - String default value - int value type - enumeration - processed
3612         * default value
3613         */
3614        private void setAttribute(String elName, String name, String type,
3615                String enumeration, String value, int valueType) throws Exception {
3616            HashMap<String, AttributeDecl> attlist;
3617    
3618            if (skippedPE) {
3619                return;
3620            }
3621    
3622            // Create a new hashtable if necessary.
3623            attlist = getElementAttributes(elName);
3624            if (attlist == null) {
3625                attlist = new HashMap<String, AttributeDecl>();
3626            }
3627    
3628            // ignore multiple attribute declarations!
3629            if (attlist.get(name) != null) {
3630                // warn ...
3631                return;
3632            } else {
3633                AttributeDecl attribute = new AttributeDecl();
3634                attribute.type = type;
3635                attribute.value = value;
3636                attribute.valueType = valueType;
3637                attribute.enumeration = enumeration;
3638                attlist.put(name, attribute);
3639    
3640                // save; but don't overwrite any existing <!ELEMENT ...>
3641                setElement(elName, CONTENT_UNDECLARED, null, attlist);
3642            }
3643        }
3644    
3645        /**
3646         * Retrieve the attribute declaration for the given element name and name.
3647         */
3648        private AttributeDecl getAttribute(String elName, String name) {
3649            HashMap<String, AttributeDecl> attlist = getElementAttributes(elName);
3650            return (attlist == null) ? null : attlist.get(name);
3651        }
3652    
3653        //
3654        // Entities
3655        //
3656    
3657        /**
3658         * Find the type of an entity.
3659         * 
3660         * @returns An integer constant representing the entity type.
3661         * @see #ENTITY_UNDECLARED
3662         * @see #ENTITY_INTERNAL
3663         * @see #ENTITY_NDATA
3664         * @see #ENTITY_TEXT
3665         */
3666        public int getEntityType(String ename) {
3667            EntityInfo entity = entityInfo.get(ename);
3668            return (entity == null) ? ENTITY_UNDECLARED : entity.type;
3669        }
3670    
3671        /**
3672         * Return an external entity's identifiers.
3673         * 
3674         * @param ename
3675         *            The name of the external entity.
3676         * @return The entity's public identifier, system identifier, and base URI.
3677         *         Null if the entity was not declared as an external entity.
3678         * @see #getEntityType
3679         */
3680        public ExternalIdentifiers getEntityIds(String ename) {
3681            EntityInfo entity = entityInfo.get(ename);
3682            return (entity == null) ? null : entity.ids;
3683        }
3684    
3685        /**
3686         * Return an internal entity's replacement text.
3687         * 
3688         * @param ename
3689         *            The name of the internal entity.
3690         * @return The entity's replacement text, or null if the entity was not
3691         *         declared as an internal entity.
3692         * @see #getEntityType
3693         */
3694        public String getEntityValue(String ename) {
3695            EntityInfo entity = entityInfo.get(ename);
3696            return (entity == null) ? null : entity.value;
3697        }
3698    
3699        /**
3700         * Register an entity declaration for later retrieval.
3701         */
3702        private void setInternalEntity(String eName, String value)
3703                throws SAXException {
3704            if (skippedPE) {
3705                return;
3706            }
3707    
3708            if (entityInfo.get(eName) == null) {
3709                EntityInfo entity = new EntityInfo();
3710                entity.type = ENTITY_INTERNAL;
3711                entity.value = value;
3712                entityInfo.put(eName, entity);
3713            }
3714            if (handler.stringInterning) {
3715                if ("lt" == eName || "gt" == eName || "quot" == eName
3716                        || "apos" == eName || "amp" == eName) {
3717                    return;
3718                }
3719            } else {
3720                if ("lt".equals(eName) || "gt".equals(eName)
3721                        || "quot".equals(eName) || "apos".equals(eName)
3722                        || "amp".equals(eName)) {
3723                    return;
3724                }
3725            }
3726            handler.getDeclHandler().internalEntityDecl(eName, value);
3727        }
3728    
3729        /**
3730         * Register an external entity declaration for later retrieval.
3731         */
3732        private void setExternalEntity(String eName, int eClass,
3733                ExternalIdentifiers ids, String nName) {
3734            if (entityInfo.get(eName) == null) {
3735                EntityInfo entity = new EntityInfo();
3736                entity.type = eClass;
3737                entity.ids = ids;
3738                entity.notationName = nName;
3739                entityInfo.put(eName, entity);
3740            }
3741        }
3742    
3743        //
3744        // Notations.
3745        //
3746    
3747        /**
3748         * Report a notation declaration, checking for duplicates.
3749         */
3750        private void setNotation(String nname, ExternalIdentifiers ids)
3751                throws SAXException {
3752            if (skippedPE) {
3753                return;
3754            }
3755    
3756            handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri);
3757            if (notationInfo.get(nname) == null) {
3758                notationInfo.put(nname, nname);
3759            } else {
3760                // VC: Unique Notation Name
3761                handler.verror("Duplicate notation name decl: " + nname);
3762            }
3763        }
3764    
3765        //
3766        // Location.
3767        //
3768    
3769        /**
3770         * Return the current line number.
3771         */
3772        public int getLineNumber() {
3773            if (line > 0) {
3774                return line;            
3775            } else {
3776                return -1;
3777            }
3778        }
3779    
3780        /**
3781         * Return the current column number.
3782         */
3783        public int getColumnNumber() {
3784            if (column > 0) {
3785                return column;
3786            } else {
3787                return -1;
3788            }
3789        }
3790    
3791        // ////////////////////////////////////////////////////////////////////
3792        // High-level I/O.
3793        // ////////////////////////////////////////////////////////////////////
3794    
3795        /**
3796         * Read a single character from the readBuffer.
3797         * <p>
3798         * The readDataChunk () method maintains the buffer.
3799         * <p>
3800         * If we hit the end of an entity, try to pop the stack and keep going.
3801         * <p>
3802         * (This approach doesn't really enforce XML's rules about entity
3803         * boundaries, but this is not currently a validating parser).
3804         * <p>
3805         * This routine also attempts to keep track of the current position in
3806         * external entities, but it's not entirely accurate.
3807         * 
3808         * @return The next available input character.
3809         * @see #unread (char)
3810         * @see #readDataChunk
3811         * @see #readBuffer
3812         * @see #line
3813         * @return The next character from the current input source.
3814         */
3815        private char readCh() throws SAXException, IOException {
3816            // As long as there's nothing in the
3817            // read buffer, try reading more data
3818            // (for an external entity) or popping
3819            // the entity stack (for either).
3820            while (readBufferPos >= readBufferLength) {
3821                switch (sourceType) {
3822                    case INPUT_READER:
3823                        readDataChunk();
3824                        while (readBufferLength < 1) {
3825                            popInput();
3826                            if (readBufferLength < 1) {
3827                                readDataChunk();
3828                            }
3829                        }
3830                        break;
3831    
3832                    default:
3833    
3834                        popInput();
3835                        break;
3836                }
3837            }
3838    
3839            char c = readBuffer[readBufferPos++];
3840            advanceLocation();
3841            // copied from fi.iki.hsivonen.htmlparser
3842            if ((c & 0xFC00) == 0xDC00) {
3843                // Got a low surrogate. See if prev was high surrogate
3844                if ((prev & 0xFC00) == 0xD800) {
3845                    int intVal = (prev << 10) + c + SURROGATE_OFFSET;
3846                    if (isNonCharacter(intVal)) {
3847                        handler.warn("Astral non-character.");
3848                    }
3849                    if (isAstralPrivateUse(intVal)) {
3850                        warnAboutPrivateUseChar();
3851                    }
3852                } else {
3853                    fatal("Unmatched low surrogate.");
3854                }
3855                prev = c;
3856            } else {
3857                // see if there was a lone high surrogate
3858                if ((prev & 0xFC00) == 0xD800) {
3859                    fatal("Unmatched high surrogate.");
3860                }
3861            }
3862    
3863            if (c == '\n') {
3864                nextCharOnNewLine = true;
3865            } else {
3866                if (c == '<') {
3867                    /* the most common return to parseContent () ... NOP */
3868                } else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
3869                        || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) && xmlVersion == XML_11)) {
3870                    fatal("illegal XML character U+" + Integer.toHexString(c));
3871                } else if (c >= '\u007F' && c <= '\u009F') // 2006-04-25 hsivonen
3872                {
3873                    handler.warn("Saw a control character: U+00"
3874                            + Integer.toHexString(c) + ".");
3875                }
3876    
3877                if (isPrivateUse(c)) {
3878                    warnAboutPrivateUseChar();
3879                }
3880                // If we're in the DTD and in a context where PEs get expanded,
3881                // do so ... 1/14/2000 errata identify those contexts. There
3882                // are also spots in the internal subset where PE refs are fatal
3883                // errors, hence yet another flag.
3884                else if (c == '%' && expandPE) {
3885                    if (peIsError) {
3886                        fatal("PE reference within decl in internal subset.");
3887                    }
3888                    parsePEReference();
3889                    return readCh();
3890                }
3891            }
3892    
3893            return c;
3894        }
3895    
3896        /**
3897         * Push a single character back onto the current input stream.
3898         * <p>
3899         * This method usually pushes the character back onto the readBuffer.
3900         * <p>
3901         * I don't think that this would ever be called with readBufferPos = 0,
3902         * because the methods always reads a character before unreading it, but
3903         * just in case, I've added a boundary condition.
3904         * 
3905         * @param c
3906         *            The character to push back.
3907         * @see #readCh
3908         * @see #unread (char[])
3909         * @see #readBuffer
3910         */
3911        private void unread(char c) throws SAXException {
3912            rollbackLocation();
3913            if (readBufferPos > 0) {
3914                readBuffer[--readBufferPos] = c;
3915            } else {
3916                pushString(null, new Character(c).toString());
3917            }
3918        }
3919    
3920        /**
3921         * 
3922         */
3923        private void rollbackLocation() {
3924            assert (column != columnPrev) || (line != linePrev); 
3925            if (column == 1) {
3926                nextCharOnNewLine = true;
3927            }
3928            line = linePrev;
3929            column = columnPrev;
3930        }
3931    
3932        /**
3933         * Push a char array back onto the current input stream.
3934         * <p>
3935         * NOTE: you must <em>never</em> push back characters that you haven't
3936         * actually read: use pushString () instead.
3937         * 
3938         * @see #readCh
3939         * @see #unread (char)
3940         * @see #readBuffer
3941         * @see #pushString
3942         */
3943        private void unread(char[] ch, int length) throws SAXException {
3944            if (length < readBufferPos) {
3945                readBufferPos -= length;
3946            } else {
3947                pushCharArray(null, ch, 0, length);
3948            }
3949        }
3950    
3951        /**
3952         * Push, or skip, a new external input source. The source will be some kind
3953         * of parsed entity, such as a PE (including the external DTD subset) or
3954         * content for the body.
3955         * 
3956         * @param url
3957         *            The java.net.URL object for the entity.
3958         * @see SAXDriver#resolveEntity
3959         * @see #pushString
3960         * @see #sourceType
3961         * @see #pushInput
3962         * @see #detectEncoding
3963         * @see #sourceType
3964         * @see #readBuffer
3965         */
3966        private void pushURL(boolean isPE, String ename, ExternalIdentifiers ids,
3967                Reader aReader, InputStream aStream, String aEncoding,
3968                boolean doResolve) throws SAXException, IOException {
3969            // removed boolean ignoreEncoding -- 2006-02-03 hsivonen
3970            String systemId;
3971            InputSource source;
3972            InputSource scratch = new InputSource();
3973    
3974            if (!isPE) {
3975                dataBufferFlush();
3976            }
3977    
3978            scratch.setPublicId(ids.publicId);
3979            scratch.setSystemId(ids.systemId);
3980    
3981            // See if we should skip or substitute the entity.
3982            // If we're not skipping, resolving reports startEntity()
3983            // and updates the (handler's) stack of URIs.
3984            if (doResolve) {
3985                // assert (stream == null && reader == null && encoding == null)
3986                source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri);
3987                if (source == null) {
3988                    handler.warn("skipping entity: " + ename);
3989                    handler.skippedEntity(ename);
3990                    if (isPE) {
3991                        skippedPE = true;
3992                    }
3993                    return;
3994                }
3995    
3996                // we might be using alternate IDs/encoding
3997                systemId = source.getSystemId();
3998                // The following warning and setting systemId was deleted bcause
3999                // the application has the option of not setting systemId
4000                // provided that it has set the characte/byte stream.
4001                /*
4002                 * if (systemId == null) { handler.warn ("missing system ID, using " +
4003                 * ids.systemId); systemId = ids.systemId; }
4004                 */
4005            } else {
4006                // "[document]", or "[dtd]" via getExternalSubset()
4007                scratch.setCharacterStream(aReader);
4008                scratch.setByteStream(aStream);
4009                scratch.setEncoding(aEncoding);
4010                source = scratch;
4011                systemId = ids.systemId;
4012                if (handler.stringInterning) {
4013                    handler.startExternalEntity(ename, systemId,
4014                            "[document]" == ename);
4015                } else {
4016                    handler.startExternalEntity(ename, systemId,
4017                            "[document]".equals(ename));
4018                }
4019            }
4020    
4021            // Push the existing status.
4022            pushInput(ename);
4023    
4024            // Create a new read buffer.
4025            // (Note the four-character margin)
4026            readBuffer = new char[READ_BUFFER_MAX + 4];
4027            readBufferPos = 0;
4028            readBufferLength = 0;
4029            readBufferOverflow = -1;
4030            is = null;
4031            reader = null;
4032            line = 0;
4033            column = 1;
4034            linePrev = 0;
4035            columnPrev = 1;
4036            nextCharOnNewLine = true;
4037            currentByteCount = 0;
4038    
4039            // If there's an explicit character stream, just
4040            // ignore encoding declarations.
4041            if (source.getCharacterStream() != null) {
4042                sourceType = INPUT_READER;
4043                this.reader = source.getCharacterStream();
4044                // swallow UTF-8 BOM -- 2006-02-03 hsivonen
4045                if ("UTF-8".equalsIgnoreCase(source.getEncoding())) {
4046                    char bom = readCh();
4047                    if (bom != '\uFEFF') {
4048                        unread(bom);
4049                    }
4050                }
4051                tryEncodingDecl(source.getEncoding() == null ? ""
4052                        : source.getEncoding());
4053                return;
4054            }
4055    
4056            // Else we handle the conversion, and need to ensure
4057            // it's done right.
4058            if (source.getByteStream() != null) {
4059                is = source.getByteStream();
4060            } else {
4061                // Stop -- 2006-11-10 hsivonen
4062                fatal("The entity resolver didn't properly resolve the entity.");
4063            }
4064    
4065            // If we get to here, there must be
4066            // an InputStream available.
4067            if (!is.markSupported()) {
4068                is = new BufferedInputStream(is);
4069            }
4070    
4071            // Zapped bogus external encoding label code -- 2006-11-10 hsivonen
4072    
4073            // if we got an external encoding label, use it ...
4074            if (source.getEncoding() != null) {
4075                draconianInputStreamReader(source.getEncoding(), is, false);
4076                if ("UTF-8".equalsIgnoreCase(source.getEncoding())) {
4077                    char bom = readCh();
4078                    if (bom != '\uFEFF') {
4079                        unread(bom);
4080                    }
4081                }
4082                tryEncodingDecl(source.getEncoding());
4083                // ... else autodetect from first bytes.
4084            } else {
4085                detectEncoding();
4086                // Read any XML or text declaration.
4087                String enc = tryEncodingDecl(null);
4088                if (enc == null && "UTF-32" == characterEncoding) {
4089                    fatal("UTF-32 was sniffed from the BOM, but there was no matching encoding declaration. The omission of explicit encoding declaration is only allowed with UTF-8 and UTF-16.");
4090                }
4091            }
4092        }
4093    
4094        /**
4095         * Check for an encoding declaration. This is the second part of the XML
4096         * encoding autodetection algorithm, relying on detectEncoding to get to the
4097         * point that this part can read any encoding declaration in the document
4098         * (using only US-ASCII characters).
4099         * 
4100         * <p>
4101         * Because this part starts to fill parser buffers with this data, it's
4102         * tricky to setup a reader so that Java's built-in decoders can be used for
4103         * the character encodings that aren't built in to this parser (such as
4104         * EUC-JP, KOI8-R, Big5, etc).
4105         * 
4106         * @return any encoding in the declaration, uppercased; or null
4107         * @see detectEncoding
4108         */
4109        private String tryEncodingDecl(String encoding) throws SAXException,
4110                IOException {
4111            // Read the XML/text declaration.
4112            if (tryRead("<?xml")) {
4113                if (tryWhitespace()) {
4114                    if (inputStack.size() > 0) {
4115                        return parseTextDecl(encoding);
4116                    } else {
4117                        return parseXMLDecl(encoding);
4118                    }
4119                } else {
4120                    // <?xml-stylesheet ...?> or similar
4121                    unread('l');
4122                    unread('m');
4123                    unread('x');
4124                    unread('?');
4125                    unread('<');
4126                }
4127            }
4128            // 2006-02-03 hsivonen
4129            warnAboutLackOfEncodingDecl(encoding);
4130            return null;
4131        }
4132    
4133        /**
4134         * @param characterEncoding
4135         * @throws SAXException
4136         */
4137        private void warnAboutLackOfEncodingDecl(String encoding)
4138                throws SAXException {
4139            if (!(encoding == null || "".equals(encoding)
4140                    || "UTF-8".equalsIgnoreCase(encoding) || "UTF-16".equalsIgnoreCase(encoding))) {
4141                handler.warn("External encoding information specified a non-UTF-8/non-UTF-16 encoding ("
4142                        + encoding
4143                        + "), but there was no matching internal encoding declaration. The well-formedness status of this document may change when decoupled from the external encoding information.");
4144            }
4145        }
4146    
4147        /**
4148         * Attempt to detect the encoding of an entity.
4149         * <p>
4150         * The trick here (as suggested in the XML standard) is that any entity not
4151         * in UTF-8, or in UCS-2 with a byte-order mark, <b>must</b> begin with an
4152         * XML declaration or an encoding declaration; we simply have to look for
4153         * "&lt;?xml" in various encodings.
4154         * <p>
4155         * This method has no way to distinguish among 8-bit encodings. Instead, it
4156         * sets up for UTF-8, then (possibly) revises its assumption later in
4157         * setupDecoding (). Any ASCII-derived 8-bit encoding should work, but most
4158         * will be rejected later by setupDecoding ().
4159         * 
4160         * @see #tryEncoding (byte[], byte, byte, byte, byte)
4161         * @see #tryEncoding (byte[], byte, byte)
4162         * @see #setupDecoding
4163         */
4164        private void detectEncoding() throws SAXException, IOException {
4165            byte[] signature = new byte[4];
4166    
4167            // Read the first four bytes for
4168            // autodetection.
4169            is.mark(4);
4170            is.read(signature);
4171            is.reset();
4172    
4173            //
4174            // FIRST: four byte encodings (who uses these?)
4175            //
4176            if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, (byte) 0x00,
4177                    (byte) 0x3c)) {
4178                // UCS-4 must begin with "<?xml"
4179                // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
4180                // "UTF-32BE"
4181                draconianInputStreamReader("UTF-32BE", is, false);
4182            } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4183                    (byte) 0x00, (byte) 0x00)) {
4184                // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
4185                // "UTF-32LE"
4186                draconianInputStreamReader("UTF-32LE", is, false);
4187            } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4188                    (byte) 0x3c, (byte) 0x00)) {
4189                // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
4190                fatal("Unsupported 32-bit encoding. (XML processors are only required to support UTF-8 and UTF-16.)"); // 2006-02-03
4191                                                                                                                        // hsivonen
4192            } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4193                    (byte) 0x00, (byte) 0x00)) {
4194                // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
4195                fatal("Unsupported 32-bit encoding. (XML processors are only required to support UTF-8 and UTF-16.)"); // 2006-02-03
4196                                                                                                                        // hsivonen
4197            } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4198                    (byte) 0xfe, (byte) 0xff)) {
4199                // 00 00 fe ff UCS_4_1234 (with BOM)
4200                is.read();
4201                is.read();
4202                is.read();
4203                is.read();
4204                draconianInputStreamReader("UTF-32BE", is, false, "UTF-32");
4205            } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4206                    (byte) 0x00, (byte) 0x00)) {
4207                // ff fe 00 00 UCS_4_4321 (with BOM)
4208                is.read();
4209                is.read();
4210                is.read();
4211                is.read();
4212                draconianInputStreamReader("UTF-32LE", is, false, "UTF-32");
4213            }
4214            // SECOND: two byte encodings
4215            // note ... with 1/14/2000 errata the XML spec identifies some
4216            // more "broken UTF-16" autodetection cases, with no XML decl,
4217            // which we don't handle here (that's legal too).
4218            //
4219            else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff)) {
4220                // UCS-2 with a byte-order marker. (UTF-16)
4221                // 0xfe 0xff: UCS-2, big-endian (12)
4222                is.read();
4223                is.read();
4224                draconianInputStreamReader("UTF-16BE", is, false, "UTF-16");
4225            } else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe)) {
4226                // UCS-2 with a byte-order marker. (UTF-16)
4227                // 0xff 0xfe: UCS-2, little-endian (21)
4228                is.read();
4229                is.read();
4230                draconianInputStreamReader("UTF-16LE", is, false, "UTF-16");
4231            } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4232                    (byte) 0x00, (byte) 0x3f)) {
4233                // UTF-16BE (otherwise, malformed UTF-16)
4234                // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
4235                fatal("no byte-order mark for UTF-16 entity"); // s/UCS-2/UTF-16/
4236                                                                // -- 2006-02-03
4237                                                                // hsivonen
4238            } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4239                    (byte) 0x3f, (byte) 0x00)) {
4240                // UTF-16LE (otherwise, malformed UTF-16)
4241                // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
4242                fatal("no byte-order mark for UTF-16 entity"); // s/UCS-2/UTF-16/
4243                                                                // -- 2006-02-03
4244                                                                // hsivonen
4245            }
4246            //
4247            // THIRD: EBCDIC
4248            //
4249            else if (tryEncoding(signature, (byte) 0x4c, (byte) 0x6f, (byte) 0xa7,
4250                    (byte) 0x94)) {
4251                // 4c 6f a7 94 ... we don't understand EBCDIC flavors
4252                fatal("Unsupported EBCDIC encoding. (XML processors are only required to support UTF-8 and UTF-16.)");
4253            }
4254            //
4255            // FOURTH: ASCII-derived encodings, fixed and variable lengths
4256            //
4257            else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f, (byte) 0x78,
4258                    (byte) 0x6d)) {
4259                // ASCII derived
4260                // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
4261                characterEncoding = null;
4262                prefetchASCIIEncodingDecl();
4263            } else if (signature[0] == (byte) 0xef && signature[1] == (byte) 0xbb
4264                    && signature[2] == (byte) 0xbf) {
4265                // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text)
4266                // this un-needed notion slipped into XML 2nd ed through a
4267                // "non-normative" erratum; now required by MSFT and UDDI,
4268                // and E22 made it normative.
4269                is.read();
4270                is.read();
4271                is.read();
4272                draconianInputStreamReader("UTF-8", is, false);
4273            } else {
4274                // (default) UTF-8 without encoding/XML declaration
4275                draconianInputStreamReader("UTF-8", is, false);
4276            }
4277        }
4278    
4279        /**
4280         * Check for a four-byte signature.
4281         * <p>
4282         * Utility routine for detectEncoding ().
4283         * <p>
4284         * Always looks for some part of "<?XML" in a specific encoding.
4285         * 
4286         * @param sig
4287         *            The first four bytes read.
4288         * @param b1
4289         *            The first byte of the signature
4290         * @param b2
4291         *            The second byte of the signature
4292         * @param b3
4293         *            The third byte of the signature
4294         * @param b4
4295         *            The fourth byte of the signature
4296         * @see #detectEncoding
4297         */
4298        private static boolean tryEncoding(byte[] sig, byte b1, byte b2, byte b3,
4299                byte b4) {
4300            return (sig[0] == b1 && sig[1] == b2 && sig[2] == b3 && sig[3] == b4);
4301        }
4302    
4303        /**
4304         * Check for a two-byte signature.
4305         * <p>
4306         * Looks for a UCS-2 byte-order mark.
4307         * <p>
4308         * Utility routine for detectEncoding ().
4309         * 
4310         * @param sig
4311         *            The first four bytes read.
4312         * @param b1
4313         *            The first byte of the signature
4314         * @param b2
4315         *            The second byte of the signature
4316         * @see #detectEncoding
4317         */
4318        private static boolean tryEncoding(byte[] sig, byte b1, byte b2) {
4319            return ((sig[0] == b1) && (sig[1] == b2));
4320        }
4321    
4322        /**
4323         * This method pushes a string back onto input.
4324         * <p>
4325         * It is useful either as the expansion of an internal entity, or for
4326         * backtracking during the parse.
4327         * <p>
4328         * Call pushCharArray () to do the actual work.
4329         * 
4330         * @param s
4331         *            The string to push back onto input.
4332         * @see #pushCharArray
4333         */
4334        private void pushString(String ename, String s) throws SAXException {
4335            char[] ch = s.toCharArray();
4336            pushCharArray(ename, ch, 0, ch.length);
4337        }
4338    
4339        /**
4340         * Push a new internal input source.
4341         * <p>
4342         * This method is useful for expanding an internal entity, or for unreading
4343         * a string of characters. It creates a new readBuffer containing the
4344         * characters in the array, instead of characters converted from an input
4345         * byte stream.
4346         * 
4347         * @param ch
4348         *            The char array to push.
4349         * @see #pushString
4350         * @see #pushURL
4351         * @see #readBuffer
4352         * @see #sourceType
4353         * @see #pushInput
4354         */
4355        private void pushCharArray(String ename, char[] ch, int start, int length)
4356                throws SAXException {
4357            // Push the existing status
4358            pushInput(ename);
4359            if (ename != null && doReport) {
4360                dataBufferFlush();
4361                handler.startInternalEntity(ename);
4362            }
4363            sourceType = INPUT_INTERNAL;
4364            readBuffer = ch;
4365            readBufferPos = start;
4366            readBufferLength = length;
4367            readBufferOverflow = -1;
4368        }
4369    
4370        /**
4371         * Save the current input source onto the stack.
4372         * <p>
4373         * This method saves all of the global variables associated with the current
4374         * input source, so that they can be restored when a new input source has
4375         * finished. It also tests for entity recursion.
4376         * <p>
4377         * The method saves the following global variables onto a stack using a
4378         * fixed-length array:
4379         * <ol>
4380         * <li>sourceType
4381         * <li>externalEntity
4382         * <li>readBuffer
4383         * <li>readBufferPos
4384         * <li>readBufferLength
4385         * <li>line
4386         * <li>characterEncoding
4387         * </ol>
4388         * 
4389         * @param ename
4390         *            The name of the entity (if any) causing the new input.
4391         * @see #popInput
4392         * @see #sourceType
4393         * @see #externalEntity
4394         * @see #readBuffer
4395         * @see #readBufferPos
4396         * @see #readBufferLength
4397         * @see #line
4398         * @see #characterEncoding
4399         */
4400        private void pushInput(String ename) throws SAXException {
4401            // Protect against billion laughs -- 2006-12-28 hsivonen
4402            if (entityStack.size() > 16) {
4403                fatal("Entity recursion too deep. Stopping to protect against denial of service attacks.");
4404            }
4405    
4406            // Check for entity recursion.
4407            if (ename != null) {
4408                Iterator<String> entities = entityStack.iterator();
4409                while (entities.hasNext()) {
4410                    String e = entities.next();
4411                    if (e != null && e == ename) {
4412                        fatal("recursive reference to entity", ename, null);
4413                    }
4414                }
4415            }
4416            entityStack.addLast(ename);
4417    
4418            // Don't bother if there is no current input.
4419            if (sourceType == INPUT_NONE) {
4420                return;
4421            }
4422    
4423            // Set up a snapshot of the current
4424            // input source.
4425            Input input = new Input();
4426    
4427            input.sourceType = sourceType;
4428            input.readBuffer = readBuffer;
4429            input.readBufferPos = readBufferPos;
4430            input.readBufferLength = readBufferLength;
4431            input.line = line;
4432            input.linePrev = linePrev;
4433            input.charecterEncoding = characterEncoding;
4434            input.readBufferOverflow = readBufferOverflow;
4435            input.is = is;
4436            input.currentByteCount = currentByteCount;
4437            input.column = column;
4438            input.columnPrev = columnPrev;
4439            input.nextCharOnNewLine = nextCharOnNewLine;
4440            input.reader = reader;
4441            input.prev = prev;
4442            input.normalizationChecker = normalizationChecker;
4443            input.characterHandler = characterHandler;
4444            characterHandler = null;
4445    
4446            // Push it onto the stack.
4447            inputStack.addLast(input);
4448        }
4449    
4450        /**
4451         * Restore a previous input source.
4452         * <p>
4453         * This method restores all of the global variables associated with the
4454         * current input source.
4455         * 
4456         * @exception java.io.EOFException
4457         *                If there are no more entries on the input stack.
4458         * @see #pushInput
4459         * @see #sourceType
4460         * @see #readBuffer
4461         * @see #readBufferPos
4462         * @see #readBufferLength
4463         * @see #line
4464         * @see #characterEncoding
4465         */
4466        private void popInput() throws SAXException, IOException {
4467            String ename = entityStack.removeLast();
4468    
4469            if (ename != null && doReport) {
4470                dataBufferFlush();
4471            }
4472            switch (sourceType) {
4473                case INPUT_READER:
4474                    handler.endExternalEntity(ename);
4475                    reader.close();
4476                    break;
4477                case INPUT_INTERNAL:
4478                    if (ename != null && doReport) {
4479                        handler.endInternalEntity(ename);
4480                    }
4481                    break;
4482            }
4483            if (characterHandler != null) {
4484                characterHandler.end();
4485            }
4486            if (normalizationChecker != null) {
4487                normalizationChecker.end();
4488            }
4489    
4490            // Throw an EOFException if there
4491            // is nothing else to pop.
4492            if (inputStack.isEmpty()) {
4493                throw new EOFException("no more input");
4494            }
4495    
4496            Input input = inputStack.removeLast();
4497    
4498            sourceType = input.sourceType;
4499            readBuffer = input.readBuffer;
4500            readBufferPos = input.readBufferPos;
4501            readBufferLength = input.readBufferLength;
4502            line = input.line;
4503            linePrev = input.linePrev;
4504            characterEncoding = input.charecterEncoding;
4505            readBufferOverflow = input.readBufferOverflow;
4506            is = input.is;
4507            currentByteCount = input.currentByteCount;
4508            column = input.column;
4509            columnPrev = input.columnPrev;
4510            nextCharOnNewLine = input.nextCharOnNewLine;
4511            reader = input.reader;
4512            prev = input.prev;
4513            normalizationChecker = input.normalizationChecker;
4514            characterHandler = input.characterHandler;
4515        }
4516    
4517        /**
4518         * Return true if we can read the expected character.
4519         * <p>
4520         * Note that the character will be removed from the input stream on success,
4521         * but will be put back on failure. Do not attempt to read the character
4522         * again if the method succeeds.
4523         * 
4524         * @param delim
4525         *            The character that should appear next. For a insensitive
4526         *            match, you must supply this in upper-case.
4527         * @return true if the character was successfully read, or false if it was
4528         *         not.
4529         * @see #tryRead (String)
4530         */
4531        private boolean tryRead(char delim) throws SAXException, IOException {
4532            char c;
4533    
4534            // Read the character
4535            c = readCh();
4536    
4537            // Test for a match, and push the character
4538            // back if the match fails.
4539            if (c == delim) {
4540                return true;
4541            } else {
4542                unread(c);
4543                return false;
4544            }
4545        }
4546    
4547        /**
4548         * Return true if we can read the expected string.
4549         * <p>
4550         * This is simply a convenience method.
4551         * <p>
4552         * Note that the string will be removed from the input stream on success,
4553         * but will be put back on failure. Do not attempt to read the string again
4554         * if the method succeeds.
4555         * <p>
4556         * This method will push back a character rather than an array whenever
4557         * possible (probably the majority of cases).
4558         * 
4559         * @param delim
4560         *            The string that should appear next.
4561         * @return true if the string was successfully read, or false if it was not.
4562         * @see #tryRead (char)
4563         */
4564        private boolean tryRead(String delim) throws SAXException, IOException {
4565            return tryRead(delim.toCharArray());
4566        }
4567    
4568        private boolean tryRead(char[] ch) throws SAXException, IOException {
4569            char c;
4570    
4571            // Compare the input, character-
4572            // by character.
4573            int saveLine = line;
4574            int saveColumn = column;
4575            int saveLinePrev = linePrev;
4576            int saveColumnPrev = columnPrev;
4577            boolean saveNextCharOnNewLine = nextCharOnNewLine;
4578            
4579            for (int i = 0; i < ch.length; i++) {
4580                c = readCh();
4581                if (c != ch[i]) {
4582                    unread(c);
4583                    if (i != 0) {
4584                        unread(ch, i);
4585                    }
4586                    line = saveLine;
4587                    column = saveColumn;
4588                    linePrev = saveLinePrev;
4589                    columnPrev = saveColumnPrev;
4590                    nextCharOnNewLine = saveNextCharOnNewLine;
4591                    return false;
4592                }
4593            }
4594            return true;
4595        }
4596    
4597        /**
4598         * Return true if we can read some whitespace.
4599         * <p>
4600         * This is simply a convenience method.
4601         * <p>
4602         * This method will push back a character rather than an array whenever
4603         * possible (probably the majority of cases).
4604         * 
4605         * @return true if whitespace was found.
4606         */
4607        private boolean tryWhitespace() throws SAXException, IOException {
4608            char c;
4609            c = readCh();
4610            if (isWhitespace(c)) {
4611                skipWhitespace();
4612                return true;
4613            } else {
4614                unread(c);
4615                return false;
4616            }
4617        }
4618    
4619        private void parseUntil(char[] delim) throws SAXException, IOException {
4620            char c;
4621            int startLine = line;
4622    
4623            try {
4624                while (!tryRead(delim)) {
4625                    c = readCh();
4626                    dataBufferAppend(c);
4627                }
4628            } catch (EOFException e) {
4629                fatal("end of input while looking for delimiter "
4630                        + "(started on line " + startLine + ')', null, new String(
4631                        delim));
4632            }
4633        }
4634    
4635        // ////////////////////////////////////////////////////////////////////
4636        // Low-level I/O.
4637        // ////////////////////////////////////////////////////////////////////
4638    
4639        /**
4640         * Prefetch US-ASCII XML/text decl from input stream into read buffer.
4641         * Doesn't buffer more than absolutely needed, so that when an encoding decl
4642         * says we need to create an InputStreamReader, we can discard our buffer
4643         * and reset(). Caller knows the first chars of the decl exist in the input
4644         * stream.
4645         */
4646        private void prefetchASCIIEncodingDecl() throws SAXException, IOException {
4647            int ch;
4648            readBufferPos = readBufferLength = 0;
4649    
4650            is.mark(readBuffer.length);
4651            while (true) {
4652                ch = is.read();
4653                readBuffer[readBufferLength++] = (char) ch;
4654                switch (ch) {
4655                    case (int) '>':
4656                        return;
4657                    case -1:
4658                        fatal(
4659                                "file ends before end of XML or encoding declaration.",
4660                                null, "?>");
4661                }
4662                if (readBuffer.length == readBufferLength) {
4663                    fatal("unfinished XML or encoding declaration");
4664                }
4665            }
4666        }
4667    
4668        /**
4669         * Read a chunk of data from an external input source.
4670         * <p>This is simply a front-end that fills the rawReadBuffer
4671         * with bytes, then calls the appropriate encoding handler.
4672         * @see #characterEncoding
4673         * @see #rawReadBuffer
4674         * @see #readBuffer
4675         * @see #filterCR
4676         * @see #copyUtf8ReadBuffer
4677         * @see #copyIso8859_1ReadBuffer
4678         * @see #copyUcs_2ReadBuffer
4679         * @see #copyUcs_4ReadBuffer
4680         */
4681        private void readDataChunk() throws SAXException, IOException {
4682            int count;
4683    
4684            // See if we have any overflow (filterCR sets for CR at end)
4685            if (readBufferOverflow > -1) {
4686                readBuffer[0] = (char) readBufferOverflow;
4687                readBufferOverflow = -1;
4688                readBufferPos = 1;
4689                sawCR = true;
4690            } else {
4691                readBufferPos = 0;
4692                sawCR = false;
4693            }
4694    
4695            try {
4696                count = reader.read(readBuffer, readBufferPos, READ_BUFFER_MAX
4697                        - readBufferPos);
4698            } catch (CharacterCodingException cce) {
4699                // 2006-04-25 hsivonen
4700                fatal("Input data does not conform to the input encoding. The input encoding was "
4701                        + characterEncoding + ".");
4702                return; // never happens
4703            }
4704            if (characterHandler != null && count > 0) {
4705                characterHandler.characters(readBuffer, readBufferPos, count);
4706            }
4707            if (normalizationChecker != null && count > 0) {
4708                normalizationChecker.characters(readBuffer, readBufferPos, count);
4709            }
4710            if (count < 0) {
4711                readBufferLength = readBufferPos;
4712            } else {
4713                readBufferLength = readBufferPos + count;
4714            }
4715            if (readBufferLength > 0) {
4716                filterCR(count >= 0);
4717            }
4718            sawCR = false;
4719        }
4720    
4721        /**
4722         * Filter carriage returns in the read buffer.
4723         * CRLF becomes LF; CR becomes LF.
4724         * @param moreData true iff more data might come from the same source
4725         * @see #readDataChunk
4726         * @see #readBuffer
4727         * @see #readBufferOverflow
4728         */
4729        private void filterCR(boolean moreData) {
4730            int i, j;
4731    
4732            readBufferOverflow = -1;
4733    
4734            loop: for (i = j = readBufferPos; j < readBufferLength; i++, j++) {
4735                switch (readBuffer[j]) {
4736                    case '\r':
4737                        if (j == readBufferLength - 1) {
4738                            if (moreData) {
4739                                readBufferOverflow = '\r';
4740                                readBufferLength--;
4741                            } else // CR at end of buffer
4742                            {
4743                                readBuffer[i++] = '\n';
4744                            }
4745                            break loop;
4746                        } else if (readBuffer[j + 1] == '\n') {
4747                            j++;
4748                        }
4749                        readBuffer[i] = '\n';
4750                        break;
4751    
4752                    case '\n':
4753                    default:
4754                        readBuffer[i] = readBuffer[j];
4755                        break;
4756                }
4757            }
4758            readBufferLength = i;
4759        }
4760    
4761        private void warnAboutPrivateUseChar() throws SAXException {
4762            if (!alreadyWarnedAboutPrivateUseCharacters) {
4763                handler.warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. (Charmod C073)");
4764                alreadyWarnedAboutPrivateUseCharacters = true;
4765            }
4766        }
4767    
4768        // copied from fi.iki.hsivonen.htmlparser
4769    
4770        private boolean isPrivateUse(char c) {
4771            return c >= '\uE000' && c <= '\uF8FF';
4772        }
4773    
4774        private boolean isPrivateUse(int c) {
4775            return (c >= 0xE000 && c <= 0xF8FF) || (c >= 0xF0000 && c <= 0xFFFFD)
4776                    || (c >= 0x100000 && c <= 0x10FFFD);
4777        }
4778    
4779        private boolean isAstralPrivateUse(int c) {
4780            return (c >= 0xF0000 && c <= 0xFFFFD)
4781                    || (c >= 0x100000 && c <= 0x10FFFD);
4782        }
4783    
4784        private boolean isNonCharacter(int c) {
4785            return (c & 0xFFFE) == 0xFFFE;
4786        }
4787    
4788        //////////////////////////////////////////////////////////////////////
4789        // Local Variables.
4790        //////////////////////////////////////////////////////////////////////
4791    
4792        /**
4793         * Re-initialize the variables for each parse.
4794         * @throws SAXException 
4795         */
4796        private void initializeVariables() throws SAXException {
4797            prev = '\u0000';
4798            // First line
4799            line = 0;
4800            column = 1;
4801            linePrev = 0;
4802            columnPrev = 1;
4803            nextCharOnNewLine = true;
4804    
4805            // Set up the buffers for data and names
4806            dataBufferPos = 0;
4807            dataBuffer = new char[DATA_BUFFER_INITIAL];
4808            nameBufferPos = 0;
4809            nameBuffer = new char[NAME_BUFFER_INITIAL];
4810    
4811            // Set up the DTD hash tables
4812            elementInfo = new HashMap<String, ElementDecl>();
4813            entityInfo = new HashMap<String, EntityInfo>();
4814            notationInfo = new HashMap<String, String>();
4815            skippedPE = false;
4816    
4817            // Set up the variables for the current
4818            // element context.
4819            currentElement = null;
4820            currentElementContent = CONTENT_UNDECLARED;
4821    
4822            // Set up the input variables
4823            sourceType = INPUT_NONE;
4824            inputStack = new LinkedList<Input>();
4825            entityStack = new LinkedList<String>();
4826            tagAttributePos = 0;
4827            tagAttributes = new String[100];
4828            rawReadBuffer = new byte[READ_BUFFER_MAX];
4829            readBufferOverflow = -1;
4830    
4831            inLiteral = false;
4832            expandPE = false;
4833            peIsError = false;
4834    
4835            doReport = false;
4836    
4837            inCDATA = false;
4838    
4839            symbolTable = new Object[SYMBOL_TABLE_LENGTH][];
4840    
4841            if (handler.checkNormalization) {
4842                normalizationChecker = new NormalizationChecker(handler);
4843                normalizationChecker.setErrorHandler(handler.getErrorHandler());
4844                normalizationChecker.start();
4845            } else {
4846                normalizationChecker = null;
4847            }
4848            if (handler.characterHandler != null) {
4849                characterHandler = handler.characterHandler;
4850                handler.characterHandler = null;
4851                characterHandler.start();
4852            } else {
4853                characterHandler = null;
4854            }
4855        }
4856    
4857        static class ExternalIdentifiers {
4858    
4859            String publicId;
4860    
4861            String systemId;
4862    
4863            String baseUri;
4864    
4865            ExternalIdentifiers() {
4866            }
4867    
4868            ExternalIdentifiers(String publicId, String systemId, String baseUri) {
4869                this.publicId = publicId;
4870                this.systemId = systemId;
4871                this.baseUri = baseUri;
4872            }
4873    
4874        }
4875    
4876        static class EntityInfo {
4877    
4878            int type;
4879    
4880            ExternalIdentifiers ids;
4881    
4882            String value;
4883    
4884            String notationName;
4885    
4886        }
4887    
4888        static class AttributeDecl {
4889    
4890            String type;
4891    
4892            String value;
4893    
4894            int valueType;
4895    
4896            String enumeration;
4897    
4898            String defaultValue;
4899    
4900        }
4901    
4902        static class ElementDecl {
4903    
4904            int contentType;
4905    
4906            String contentModel;
4907    
4908            HashMap<String, AttributeDecl> attributes;
4909    
4910        }
4911    
4912        static class Input {
4913            CharacterHandler characterHandler;
4914    
4915            boolean nextCharOnNewLine;
4916    
4917            int columnPrev;
4918    
4919            int linePrev;
4920    
4921            char prev;
4922    
4923            int sourceType;
4924    
4925            char[] readBuffer;
4926    
4927            int readBufferPos;
4928    
4929            int readBufferLength;
4930    
4931            int line;
4932    
4933            String charecterEncoding;
4934    
4935            int readBufferOverflow;
4936    
4937            InputStream is;
4938    
4939            int currentByteCount;
4940    
4941            int column;
4942    
4943            Reader reader;
4944    
4945            NormalizationChecker normalizationChecker;
4946        }
4947    
4948    }