001    /*
002     * Copyright (c) 2005, 2006, 2007 Henri Sivonen
003     * Copyright (c) 2007 Mozilla Foundation
004     * Portions of comments Copyright 2004-2007 Apple Computer, Inc., Mozilla 
005     * Foundation, and Opera Software ASA.
006     *
007     * Permission is hereby granted, free of charge, to any person obtaining a 
008     * copy of this software and associated documentation files (the "Software"), 
009     * to deal in the Software without restriction, including without limitation 
010     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
011     * and/or sell copies of the Software, and to permit persons to whom the 
012     * Software is furnished to do so, subject to the following conditions:
013     *
014     * The above copyright notice and this permission notice shall be included in 
015     * all copies or substantial portions of the Software.
016     *
017     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
018     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
019     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
020     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
021     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
022     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
023     * DEALINGS IN THE SOFTWARE.
024     */
025    
026    /*
027     * The comments following this one that use the same comment syntax as this 
028     * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 
029     * amended as of June 23 2007.
030     * That document came with this statement:
031     * "© Copyright 2004-2007 Apple Computer, Inc., Mozilla Foundation, and 
032     * Opera Software ASA. You are granted a license to use, reproduce and 
033     * create derivative works of this document."
034     */
035    
036    package nu.validator.htmlparser.impl;
037    
038    import java.io.IOException;
039    import java.io.InputStream;
040    import java.io.Reader;
041    import java.nio.charset.Charset;
042    import java.nio.charset.CharsetDecoder;
043    import java.nio.charset.IllegalCharsetNameException;
044    import java.nio.charset.UnsupportedCharsetException;
045    import java.util.Arrays;
046    import java.util.regex.Matcher;
047    import java.util.regex.Pattern;
048    
049    import nu.validator.htmlparser.common.XmlViolationPolicy;
050    
051    import org.xml.sax.Attributes;
052    import org.xml.sax.ErrorHandler;
053    import org.xml.sax.InputSource;
054    import org.xml.sax.Locator;
055    import org.xml.sax.SAXException;
056    import org.xml.sax.SAXParseException;
057    
058    /**
059     * An implementation of
060     * http://www.whatwg.org/specs/web-apps/current-work/multipage/section-tokenisation.html
061     * 
062     * This class implements the <code>Locator</code> interface. This is not an
063     * incidental implementation detail: Users of this class are encouraged to make
064     * use of the <code>Locator</code> nature.
065     * 
066     * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
067     * can be configured to treat these conditions as fatal or to coerce the infoset
068     * to something that XML 1.0 allows.
069     * 
070     * @version $Id: Tokenizer.java 166 2007-10-14 19:42:57Z hsivonen $
071     * @author hsivonen
072     */
073    public final class Tokenizer implements Locator {
074    
075        private static final Pattern NCNAME_PATTERN = Pattern.compile("(?:[\\u0041-\\u005A]|[\\u0061-\\u007A]|[\\u00C0-\\u00D6]|[\\u00D8-\\u00F6]|[\\u00F8-\\u00FF]|[\\u0100-\\u0131]|[\\u0134-\\u013E]|[\\u0141-\\u0148]|[\\u014A-\\u017E]|[\\u0180-\\u01C3]|[\\u01CD-\\u01F0]|[\\u01F4-\\u01F5]|[\\u01FA-\\u0217]|[\\u0250-\\u02A8]|[\\u02BB-\\u02C1]|\\u0386|[\\u0388-\\u038A]|\\u038C|[\\u038E-\\u03A1]|[\\u03A3-\\u03CE]|[\\u03D0-\\u03D6]|\\u03DA|\\u03DC|\\u03DE|\\u03E0|[\\u03E2-\\u03F3]|[\\u0401-\\u040C]|[\\u040E-\\u044F]|[\\u0451-\\u045C]|[\\u045E-\\u0481]|[\\u0490-\\u04C4]|[\\u04C7-\\u04C8]|[\\u04CB-\\u04CC]|[\\u04D0-\\u04EB]|[\\u04EE-\\u04F5]|[\\u04F8-\\u04F9]|[\\u0531-\\u0556]|\\u0559|[\\u0561-\\u0586]|[\\u05D0-\\u05EA]|[\\u05F0-\\u05F2]|[\\u0621-\\u063A]|[\\u0641-\\u064A]|[\\u0671-\\u06B7]|[\\u06BA-\\u06BE]|[\\u06C0-\\u06CE]|[\\u06D0-\\u06D3]|\\u06D5|[\\u06E5-\\u06E6]|[\\u0905-\\u0939]|\\u093D|[\\u0958-\\u0961]|[\\u0985-\\u098C]|[\\u098F-\\u0990]|[\\u0993-\\u09A8]|[\\u09AA-\\u09B0]|\\u09B2|[\\u09B6-\\u09B9]|[\\u09DC-\\u09DD]|[\\u09DF-\\u09E1]|[\\u09F0-\\u09F1]|[\\u0A05-\\u0A0A]|[\\u0A0F-\\u0A10]|[\\u0A13-\\u0A28]|[\\u0A2A-\\u0A30]|[\\u0A32-\\u0A33]|[\\u0A35-\\u0A36]|[\\u0A38-\\u0A39]|[\\u0A59-\\u0A5C]|\\u0A5E|[\\u0A72-\\u0A74]|[\\u0A85-\\u0A8B]|\\u0A8D|[\\u0A8F-\\u0A91]|[\\u0A93-\\u0AA8]|[\\u0AAA-\\u0AB0]|[\\u0AB2-\\u0AB3]|[\\u0AB5-\\u0AB9]|\\u0ABD|\\u0AE0|[\\u0B05-\\u0B0C]|[\\u0B0F-\\u0B10]|[\\u0B13-\\u0B28]|[\\u0B2A-\\u0B30]|[\\u0B32-\\u0B33]|[\\u0B36-\\u0B39]|\\u0B3D|[\\u0B5C-\\u0B5D]|[\\u0B5F-\\u0B61]|[\\u0B85-\\u0B8A]|[\\u0B8E-\\u0B90]|[\\u0B92-\\u0B95]|[\\u0B99-\\u0B9A]|\\u0B9C|[\\u0B9E-\\u0B9F]|[\\u0BA3-\\u0BA4]|[\\u0BA8-\\u0BAA]|[\\u0BAE-\\u0BB5]|[\\u0BB7-\\u0BB9]|[\\u0C05-\\u0C0C]|[\\u0C0E-\\u0C10]|[\\u0C12-\\u0C28]|[\\u0C2A-\\u0C33]|[\\u0C35-\\u0C39]|[\\u0C60-\\u0C61]|[\\u0C85-\\u0C8C]|[\\u0C8E-\\u0C90]|[\\u0C92-\\u0CA8]|[\\u0CAA-\\u0CB3]|[\\u0CB5-\\u0CB9]|\\u0CDE|[\\u0CE0-\\u0CE1]|[\\u0D05-\\u0D0C]|[\\u0D0E-\\u0D10]|[\\u0D12-\\u0D28]|[\\u0D2A-\\u0D39]|[\\u0
D60-\\u0D61]|[\\u0E01-\\u0E2E]|\\u0E30|[\\u0E32-\\u0E33]|[\\u0E40-\\u0E45]|[\\u0E81-\\u0E82]|\\u0E84|[\\u0E87-\\u0E88]|\\u0E8A|\\u0E8D|[\\u0E94-\\u0E97]|[\\u0E99-\\u0E9F]|[\\u0EA1-\\u0EA3]|\\u0EA5|\\u0EA7|[\\u0EAA-\\u0EAB]|[\\u0EAD-\\u0EAE]|\\u0EB0|[\\u0EB2-\\u0EB3]|\\u0EBD|[\\u0EC0-\\u0EC4]|[\\u0F40-\\u0F47]|[\\u0F49-\\u0F69]|[\\u10A0-\\u10C5]|[\\u10D0-\\u10F6]|\\u1100|[\\u1102-\\u1103]|[\\u1105-\\u1107]|\\u1109|[\\u110B-\\u110C]|[\\u110E-\\u1112]|\\u113C|\\u113E|\\u1140|\\u114C|\\u114E|\\u1150|[\\u1154-\\u1155]|\\u1159|[\\u115F-\\u1161]|\\u1163|\\u1165|\\u1167|\\u1169|[\\u116D-\\u116E]|[\\u1172-\\u1173]|\\u1175|\\u119E|\\u11A8|\\u11AB|[\\u11AE-\\u11AF]|[\\u11B7-\\u11B8]|\\u11BA|[\\u11BC-\\u11C2]|\\u11EB|\\u11F0|\\u11F9|[\\u1E00-\\u1E9B]|[\\u1EA0-\\u1EF9]|[\\u1F00-\\u1F15]|[\\u1F18-\\u1F1D]|[\\u1F20-\\u1F45]|[\\u1F48-\\u1F4D]|[\\u1F50-\\u1F57]|\\u1F59|\\u1F5B|\\u1F5D|[\\u1F5F-\\u1F7D]|[\\u1F80-\\u1FB4]|[\\u1FB6-\\u1FBC]|\\u1FBE|[\\u1FC2-\\u1FC4]|[\\u1FC6-\\u1FCC]|[\\u1FD0-\\u1FD3]|[\\u1FD6-\\u1FDB]|[\\u1FE0-\\u1FEC]|[\\u1FF2-\\u1FF4]|[\\u1FF6-\\u1FFC]|\\u2126|[\\u212A-\\u212B]|\\u212E|[\\u2180-\\u2182]|[\\u3041-\\u3094]|[\\u30A1-\\u30FA]|[\\u3105-\\u312C]|[\\uAC00-\\uD7A3]|[\\u4E00-\\u9FA5]|\\u3007|[\\u3021-\\u3029]|_)(?:[\\u0030-\\u0039]|[\\u0660-\\u0669]|[\\u06F0-\\u06F9]|[\\u0966-\\u096F]|[\\u09E6-\\u09EF]|[\\u0A66-\\u0A6F]|[\\u0AE6-\\u0AEF]|[\\u0B66-\\u0B6F]|[\\u0BE7-\\u0BEF]|[\\u0C66-\\u0C6F]|[\\u0CE6-\\u0CEF]|[\\u0D66-\\u0D6F]|[\\u0E50-\\u0E59]|[\\u0ED0-\\u0ED9]|[\\u0F20-\\u0F29]|[\\u0041-\\u005A]|[\\u0061-\\u007A]|[\\u00C0-\\u00D6]|[\\u00D8-\\u00F6]|[\\u00F8-\\u00FF]|[\\u0100-\\u0131]|[\\u0134-\\u013E]|[\\u0141-\\u0148]|[\\u014A-\\u017E]|[\\u0180-\\u01C3]|[\\u01CD-\\u01F0]|[\\u01F4-\\u01F5]|[\\u01FA-\\u0217]|[\\u0250-\\u02A8]|[\\u02BB-\\u02C1]|\\u0386|[\\u0388-\\u038A]|\\u038C|[\\u038E-\\u03A1]|[\\u03A3-\\u03CE]|[\\u03D0-\\u03D6]|\\u03DA|\\u03DC|\\u03DE|\\u03E0|[\\u03E2-\\u03F3]|[\\u0401-\\u040C]|[\\u040E-\\u044F]|[\\u0451-\\u045C]|[\\u045E-\\u0481]|[\\u0490
-\\u04C4]|[\\u04C7-\\u04C8]|[\\u04CB-\\u04CC]|[\\u04D0-\\u04EB]|[\\u04EE-\\u04F5]|[\\u04F8-\\u04F9]|[\\u0531-\\u0556]|\\u0559|[\\u0561-\\u0586]|[\\u05D0-\\u05EA]|[\\u05F0-\\u05F2]|[\\u0621-\\u063A]|[\\u0641-\\u064A]|[\\u0671-\\u06B7]|[\\u06BA-\\u06BE]|[\\u06C0-\\u06CE]|[\\u06D0-\\u06D3]|\\u06D5|[\\u06E5-\\u06E6]|[\\u0905-\\u0939]|\\u093D|[\\u0958-\\u0961]|[\\u0985-\\u098C]|[\\u098F-\\u0990]|[\\u0993-\\u09A8]|[\\u09AA-\\u09B0]|\\u09B2|[\\u09B6-\\u09B9]|[\\u09DC-\\u09DD]|[\\u09DF-\\u09E1]|[\\u09F0-\\u09F1]|[\\u0A05-\\u0A0A]|[\\u0A0F-\\u0A10]|[\\u0A13-\\u0A28]|[\\u0A2A-\\u0A30]|[\\u0A32-\\u0A33]|[\\u0A35-\\u0A36]|[\\u0A38-\\u0A39]|[\\u0A59-\\u0A5C]|\\u0A5E|[\\u0A72-\\u0A74]|[\\u0A85-\\u0A8B]|\\u0A8D|[\\u0A8F-\\u0A91]|[\\u0A93-\\u0AA8]|[\\u0AAA-\\u0AB0]|[\\u0AB2-\\u0AB3]|[\\u0AB5-\\u0AB9]|\\u0ABD|\\u0AE0|[\\u0B05-\\u0B0C]|[\\u0B0F-\\u0B10]|[\\u0B13-\\u0B28]|[\\u0B2A-\\u0B30]|[\\u0B32-\\u0B33]|[\\u0B36-\\u0B39]|\\u0B3D|[\\u0B5C-\\u0B5D]|[\\u0B5F-\\u0B61]|[\\u0B85-\\u0B8A]|[\\u0B8E-\\u0B90]|[\\u0B92-\\u0B95]|[\\u0B99-\\u0B9A]|\\u0B9C|[\\u0B9E-\\u0B9F]|[\\u0BA3-\\u0BA4]|[\\u0BA8-\\u0BAA]|[\\u0BAE-\\u0BB5]|[\\u0BB7-\\u0BB9]|[\\u0C05-\\u0C0C]|[\\u0C0E-\\u0C10]|[\\u0C12-\\u0C28]|[\\u0C2A-\\u0C33]|[\\u0C35-\\u0C39]|[\\u0C60-\\u0C61]|[\\u0C85-\\u0C8C]|[\\u0C8E-\\u0C90]|[\\u0C92-\\u0CA8]|[\\u0CAA-\\u0CB3]|[\\u0CB5-\\u0CB9]|\\u0CDE|[\\u0CE0-\\u0CE1]|[\\u0D05-\\u0D0C]|[\\u0D0E-\\u0D10]|[\\u0D12-\\u0D28]|[\\u0D2A-\\u0D39]|[\\u0D60-\\u0D61]|[\\u0E01-\\u0E2E]|\\u0E30|[\\u0E32-\\u0E33]|[\\u0E40-\\u0E45]|[\\u0E81-\\u0E82]|\\u0E84|[\\u0E87-\\u0E88]|\\u0E8A|\\u0E8D|[\\u0E94-\\u0E97]|[\\u0E99-\\u0E9F]|[\\u0EA1-\\u0EA3]|\\u0EA5|\\u0EA7|[\\u0EAA-\\u0EAB]|[\\u0EAD-\\u0EAE]|\\u0EB0|[\\u0EB2-\\u0EB3]|\\u0EBD|[\\u0EC0-\\u0EC4]|[\\u0F40-\\u0F47]|[\\u0F49-\\u0F69]|[\\u10A0-\\u10C5]|[\\u10D0-\\u10F6]|\\u1100|[\\u1102-\\u1103]|[\\u1105-\\u1107]|\\u1109|[\\u110B-\\u110C]|[\\u110E-\\u1112]|\\u113C|\\u113E|\\u1140|\\u114C|\\u114E|\\u1150|[\\u1154-\\u1155]|\\u1159|[\\u115F-\\u1161]|\\u1163|\\u1165|\\u1
167|\\u1169|[\\u116D-\\u116E]|[\\u1172-\\u1173]|\\u1175|\\u119E|\\u11A8|\\u11AB|[\\u11AE-\\u11AF]|[\\u11B7-\\u11B8]|\\u11BA|[\\u11BC-\\u11C2]|\\u11EB|\\u11F0|\\u11F9|[\\u1E00-\\u1E9B]|[\\u1EA0-\\u1EF9]|[\\u1F00-\\u1F15]|[\\u1F18-\\u1F1D]|[\\u1F20-\\u1F45]|[\\u1F48-\\u1F4D]|[\\u1F50-\\u1F57]|\\u1F59|\\u1F5B|\\u1F5D|[\\u1F5F-\\u1F7D]|[\\u1F80-\\u1FB4]|[\\u1FB6-\\u1FBC]|\\u1FBE|[\\u1FC2-\\u1FC4]|[\\u1FC6-\\u1FCC]|[\\u1FD0-\\u1FD3]|[\\u1FD6-\\u1FDB]|[\\u1FE0-\\u1FEC]|[\\u1FF2-\\u1FF4]|[\\u1FF6-\\u1FFC]|\\u2126|[\\u212A-\\u212B]|\\u212E|[\\u2180-\\u2182]|[\\u3041-\\u3094]|[\\u30A1-\\u30FA]|[\\u3105-\\u312C]|[\\uAC00-\\uD7A3]|[\\u4E00-\\u9FA5]|\\u3007|[\\u3021-\\u3029]|_|\\.|-|[\\u0300-\\u0345]|[\\u0360-\\u0361]|[\\u0483-\\u0486]|[\\u0591-\\u05A1]|[\\u05A3-\\u05B9]|[\\u05BB-\\u05BD]|\\u05BF|[\\u05C1-\\u05C2]|\\u05C4|[\\u064B-\\u0652]|\\u0670|[\\u06D6-\\u06DC]|[\\u06DD-\\u06DF]|[\\u06E0-\\u06E4]|[\\u06E7-\\u06E8]|[\\u06EA-\\u06ED]|[\\u0901-\\u0903]|\\u093C|[\\u093E-\\u094C]|\\u094D|[\\u0951-\\u0954]|[\\u0962-\\u0963]|[\\u0981-\\u0983]|\\u09BC|\\u09BE|\\u09BF|[\\u09C0-\\u09C4]|[\\u09C7-\\u09C8]|[\\u09CB-\\u09CD]|\\u09D7|[\\u09E2-\\u09E3]|\\u0A02|\\u0A3C|\\u0A3E|\\u0A3F|[\\u0A40-\\u0A42]|[\\u0A47-\\u0A48]|[\\u0A4B-\\u0A4D]|[\\u0A70-\\u0A71]|[\\u0A81-\\u0A83]|\\u0ABC|[\\u0ABE-\\u0AC5]|[\\u0AC7-\\u0AC9]|[\\u0ACB-\\u0ACD]|[\\u0B01-\\u0B03]|\\u0B3C|[\\u0B3E-\\u0B43]|[\\u0B47-\\u0B48]|[\\u0B4B-\\u0B4D]|[\\u0B56-\\u0B57]|[\\u0B82-\\u0B83]|[\\u0BBE-\\u0BC2]|[\\u0BC6-\\u0BC8]|[\\u0BCA-\\u0BCD]|\\u0BD7|[\\u0C01-\\u0C03]|[\\u0C3E-\\u0C44]|[\\u0C46-\\u0C48]|[\\u0C4A-\\u0C4D]|[\\u0C55-\\u0C56]|[\\u0C82-\\u0C83]|[\\u0CBE-\\u0CC4]|[\\u0CC6-\\u0CC8]|[\\u0CCA-\\u0CCD]|[\\u0CD5-\\u0CD6]|[\\u0D02-\\u0D03]|[\\u0D3E-\\u0D43]|[\\u0D46-\\u0D48]|[\\u0D4A-\\u0D4D]|\\u0D57|\\u0E31|[\\u0E34-\\u0E3A]|[\\u0E47-\\u0E4E]|\\u0EB1|[\\u0EB4-\\u0EB9]|[\\u0EBB-\\u0EBC]|[\\u0EC8-\\u0ECD]|[\\u0F18-\\u0F19]|\\u0F35|\\u0F37|\\u0F39|\\u0F3E|\\u0F3F|[\\u0F71-\\u0F84]|[\\u0F86-\\u0F8B]|[\\u0F90-\\u0F95]|\\u0F97|[\\u
0F99-\\u0FAD]|[\\u0FB1-\\u0FB7]|\\u0FB9|[\\u20D0-\\u20DC]|\\u20E1|[\\u302A-\\u302F]|\\u3099|\\u309A|\\u00B7|\\u02D0|\\u02D1|\\u0387|\\u0640|\\u0E46|\\u0EC6|\\u3005|[\\u3031-\\u3035]|[\\u309D-\\u309E]|[\\u30FC-\\u30FE])*");
076    
077        /**
078         * Magic value for UTF-16 operations. Evaluates to <code>0xD7C0</code>.
            * NOTE(review): presumably the lead surrogate of an astral code point is
            * <code>LEAD_OFFSET + (codePoint &gt;&gt; 10)</code> -- confirm at the use
            * site.
079         */
080        private static final int LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
081    
082        /**
083         * Magic value for UTF-16 operations. The value is negative.
            * NOTE(review): presumably a surrogate pair is combined as
            * <code>(lead &lt;&lt; 10) + trail + SURROGATE_OFFSET</code> -- confirm
            * at the use site.
084         */
085        private static final int SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;
086    
087        /**
088         * UTF-16 code unit array containing less than and greater than for emitting
089         * those characters on certain parse errors.
090         */
091        private static final char[] LT_GT = { '<', '>' };
092    
093        /**
094         * UTF-16 code unit array containing less than and solidus for emitting
095         * those characters on certain parse errors.
096         */
097        private static final char[] LT_SOLIDUS = { '<', '/' };
098    
099        /**
100         * Array version of U+FFFD (REPLACEMENT CHARACTER).
101         */
102        private static final char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
103    
104        /**
105         * Array version of space (U+0020).
106         */
107        private static final char[] SPACE = { ' ' };
108    
109        /**
110         * Array version of line feed (U+000A).
111         */
112        private static final char[] LF = { '\n' };
113    
114        /**
115         * Buffer growth parameter, in <code>char</code>s.
            * NOTE(review): which buffers grow by this step is decided outside the
            * declarations shown here -- confirm at the use site.
116         */
117        private static final int BUFFER_GROW_BY = 1024;
118    
119        /**
120         * Lexically sorted void element names.
            * NOTE(review): the sorted order is presumably relied upon by a binary
            * search -- confirm before adding or reordering entries.
121         */
122        private static final String[] VOID_ELEMENTS = { "area", "base", "br",
123                "col", "embed", "hr", "img", "input", "link", "meta", "param" };
124    
125        /**
126         * "octype" as <code>char[]</code>, i.e. "doctype" without its first
            * letter.
127         */
128        private static final char[] OCTYPE = "octype".toCharArray();
129    
130        /**
131         * "ublic" as <code>char[]</code>, i.e. "public" without its first letter.
132         */
133        private static final char[] UBLIC = "ublic".toCharArray();
134    
135        /**
136         * "ystem" as <code>char[]</code>, i.e. "system" without its first letter.
137         */
138        private static final char[] YSTEM = "ystem".toCharArray();
139    
140        /**
141         * The token handler. Assigned once by the constructor.
142         */
143        private final TokenHandler tokenHandler;
144    
145        /**
146         * The error handler. Stays <code>null</code> until
            * <code>setErrorHandler</code> is called.
147         */
148        private ErrorHandler errorHandler;
149    
150        /**
151         * The input UTF-16 code unit stream. If a byte stream was given, this
152         * object is an instance of <code>HtmlInputStreamReader</code>.
            * (Re)assigned on every call to <code>tokenize</code>.
153         */
154        private Reader reader;
155    
156        /**
157         * The main input buffer that the tokenizer reads from. Filled from
158         * <code>reader</code>. Initially holds 2048 <code>char</code>s.
159         */
160        private char[] buf = new char[2048];
161    
162        /**
163         * The index of the last <code>char</code> read from <code>buf</code>.
            * Reset to <code>-1</code> when <code>tokenize</code> starts.
164         */
165        private int pos;
166    
167        /**
168         * The index of the first <code>char</code> in <code>buf</code> that is
169         * part of a coalesced run of character tokens or <code>-1</code> if there
170         * is not a current run being coalesced.
171         */
172        private int cstart;
173    
174        /**
175         * The number of <code>char</code>s in <code>buf</code> that have
176         * meaning. (The rest of the array is garbage and should not be examined.)
            * Reset to <code>0</code> when <code>tokenize</code> starts.
177         */
178        private int bufLen;
179    
180        /**
181         * The previous <code>char</code> read from the buffer with infoset
182         * alteration applied except for CR. Used for CRLF normalization and
183         * surrogate pair checking. Reset to U+0000 when <code>tokenize</code>
            * starts.
184         */
185        private char prev;
186    
187        /**
188         * Lookbehind buffer for magic RCDATA/CDATA escaping. Holds four code
            * units.
189         */
190        private final char[] prevFour = new char[4];
191    
192        /**
193         * Points to the last <code>char</code> written to <code>prevFour</code>.
194         */
195        private int prevFourPtr = 0;
196    
197        /**
198         * Single code unit buffer for reconsuming an input character. If
199         * <code>-1</code> the next <code>read()</code> returns from the real
200         * buffer, otherwise from here.
201         */
202        private int unreadBuffer = -1;
203    
204        /**
205         * The current line number in the current resource being parsed. (First line
206         * is 1.) Passed on as locator data. <code>tokenize</code> resets this to
            * <code>0</code>, and <code>getLineNumber()</code> reports any
            * non-positive value as <code>-1</code>.
207         */
208        private int line;
209    
            /**
             * Companion of <code>line</code>; always reset together with it.
             * NOTE(review): presumably the line number before the most recent read,
             * kept so that a position can be restored -- confirm at the use site.
             */
210        private int linePrev;
211        
212        /**
213         * The current column number in the current resource being tokenized. (First
214         * column is 1, counted by UTF-16 code units.) Passed on as locator data.
215         */
216        private int col;
217        
            /**
             * Companion of <code>col</code>; always reset together with it.
             * NOTE(review): presumably the column number before the most recent
             * read -- confirm at the use site.
             */
218        private int colPrev;
219        
            /**
             * Set to <code>true</code> when <code>tokenize</code> starts and again
             * after a leading U+FEFF has been swallowed.
             * NOTE(review): presumably tells the reading code that the next
             * <code>char</code> begins a new line -- confirm at the use site.
             */
220        private boolean nextCharOnNewLine;
221        
222        /**
223         * The SAX public id for the resource being tokenized. (Only passed back
224         * as part of locator data.)
225         */
226        private String publicId;
227    
228        /**
229         * The SAX system id for the resource being tokenized. (Only passed back
230         * as part of locator data.)
231         */
232        private String systemId;
233    
234        /**
235         * Buffer for short identifiers. Initially holds 64 <code>char</code>s.
236         */
237        private char[] strBuf = new char[64];
238    
239        /**
240         * Number of significant <code>char</code>s in <code>strBuf</code>.
241         */
242        private int strBufLen = 0;
243    
244        /**
245         * Buffer for long strings. Initially holds 1024 <code>char</code>s.
246         */
247        private char[] longStrBuf = new char[1024];
248    
249        /**
250         * Number of significant <code>char</code>s in <code>longStrBuf</code>.
251         */
252        private int longStrBufLen = 0;
253    
254        /**
255         * If not U+0000, a pending code unit to be appended to
256         * <code>longStrBuf</code>.
257         */
258        private char longStrBufPending = '\u0000';
259    
260        /**
261         * The attribute holder.
262         */
263        private AttributesImpl attributes;
264    
265        /**
266         * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
            * One code unit long.
267         */
268        private final char[] bmpChar = new char[1];
269    
270        /**
271         * Buffer for expanding astral NCRs. Two code units long.
272         */
273        private final char[] astralChar = new char[2];
274    
275        /**
276         * Keeps track of PUA warnings. Reset to <code>false</code> when
            * <code>tokenize</code> starts.
277         */
278        private boolean alreadyWarnedAboutPrivateUseCharacters;
279    
280        /**
281         * http://www.whatwg.org/specs/web-apps/current-work/#content2
            * 
            * Reset to <code>PCDATA</code> when <code>tokenize</code> starts; changed
            * through <code>setContentModelFlag</code>.
282         */
283        private ContentModelFlag contentModelFlag = ContentModelFlag.PCDATA;
284    
285        /**
286         * http://www.whatwg.org/specs/web-apps/current-work/#escape
            * 
            * Reset to <code>false</code> when <code>tokenize</code> starts.
287         */
288        private boolean escapeFlag = false;
289    
290        /**
291         * The element whose end tag closes the current CDATA or RCDATA element.
            * Assigned by <code>setContentModelFlag</code>.
292         */
293        private String contentModelElement = "";
294    
295        /**
296         * <code>true</code> if tokenizing an end tag
297         */
298        private boolean endTag;
299    
300        /**
301         * The current tag token name. Cleared to <code>null</code> when
            * <code>tokenize</code> finishes.
302         */
303        private String tagName = null;
304    
305        /**
306         * The current attribute name. Cleared to <code>null</code> when
            * <code>tokenize</code> finishes.
307         */
308        private String attributeName = null;
309    
310        /**
311         * Whether comment tokens are emitted. Taken from
            * <code>tokenHandler.wantsComments()</code> at the start of
            * <code>tokenize</code>.
312         */
313        private boolean wantsComments = false;
314    
315        /**
316         * If <code>false</code>, <code>addAttribute*()</code> are no-ops.
317         */
318        private boolean shouldAddAttributes;
319    
320        /**
321         * <code>true</code> when in text content or in attribute value.
            * Set to <code>true</code> when <code>tokenize</code> starts.
322         */
323        private boolean inContent;
324    
325        /**
326         * <code>true</code> when HTML4-specific additional errors are requested.
            * Reset to <code>false</code> when <code>tokenize</code> starts; turned
            * on by <code>turnOnAdditionalHtml4Errors()</code>.
327         */
328        private boolean html4;
329    
330        /**
331         * Whether non-ASCII causes an error. Reset to <code>false</code> when
            * <code>tokenize</code> starts.
332         */
333        private boolean nonAsciiProhibited;
334    
335        /**
336         * Used together with <code>nonAsciiProhibited</code>.
337         */
338        private boolean alreadyComplainedAboutNonAscii;
339    
340        /**
341         * Whether the stream is past the first 512 bytes. Set by
            * <code>notifyAboutMetaBoundary()</code>.
342         */
343        private boolean metaBoundaryPassed;
344    
345        /**
346         * The name of the current doctype token.
347         */
348        private String doctypeName;
349    
350        /**
351         * The public id of the current doctype token.
352         */
353        private String publicIdentifier;
354    
355        /**
356         * The system id of the current doctype token.
357         */
358        private String systemIdentifier;
359    
360        /**
361         * Used for NFC checking if non-<code>null</code>, source code capture,
362         * etc.
363         */
364        private CharacterHandler[] characterHandlers = new CharacterHandler[0];
365    
366        /**
367         * The policy for vertical tab and form feed.
368         */
369        private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALLOW;
370    
371        /**
372         * The policy for non-space non-XML characters.
373         */
374        private XmlViolationPolicy contentNonXmlCharPolicy = XmlViolationPolicy.ALLOW;
375    
376        /**
377         * The policy for comments.
378         */
379        private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALLOW;
380    
            /**
             * Policy assigned through <code>setXmlnsPolicy</code>, which refuses
             * <code>FATAL</code>.
             */
381        private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALLOW;
382    
            /**
             * Policy assigned through <code>setNamePolicy</code>.
             */
383        private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALLOW;
384    
            /**
             * Consulted by <code>tokenize</code> to decide whether a leading U+FEFF
             * is dropped; <code>tokenize</code> sets it to <code>true</code> before
             * reading.
             */
385        private boolean swallowBom;
386    
            /**
             * Assigned through
             * <code>setHtml4ModeCompatibleWithXhtml1Schemata</code>.
             * NOTE(review): the effect is implemented outside the part of the class
             * shown here.
             */
387        private boolean html4ModeCompatibleWithXhtml1Schemata;
388    
            /**
             * NOTE(review): presumably whether <code>lang</code> is also reported as
             * <code>xml:lang</code> -- no setter or use is visible here; confirm.
             */
389        private boolean mappingLangToXmlLang;
390    
            /**
             * Policy assigned through <code>setBogusXmlnsPolicy</code>. Unlike the
             * other policies it has no initializer, so it is <code>null</code>
             * until that setter is called.
             */
391        private XmlViolationPolicy bogusXmlnsPolicy;
392    
393        // start public API
394    
395        /**
396         * Creates a tokenizer bound to the given token handler.
397         * 
398         * @param tokenHandler
399         *            the handler for receiving tokens
400         */
401        public Tokenizer(final TokenHandler tokenHandler) {
402            this.tokenHandler = tokenHandler;
403        }
404    
405        /**
406         * Turns NFC checking on or off by adding or removing a
407         * <code>NormalizationChecker</code>; no-op if already in that state.
408         * 
409         * @param enable <code>true</code> to check, <code>false</code> to stop
410         */
411        public void setCheckingNormalization(boolean enable) {
412            if (enable == isCheckingNormalization()) {
413                // Already in the requested state.
414                return;
415            }
416            if (enable) {
417                NormalizationChecker normalizationChecker = new NormalizationChecker(
418                        this);
419                normalizationChecker.setErrorHandler(errorHandler);
420                // Register the checker; without this, enabling had no effect.
421                addCharacterHandler(normalizationChecker);
422                return;
423            }
424            // Disabling: drop only the first checker so the remaining handlers
425            // exactly fill the one-shorter array (no trailing null slot).
426            CharacterHandler[] newHandlers = new CharacterHandler[characterHandlers.length - 1];
427            boolean skipped = false;
428            int j = 0;
429            for (int i = 0; i < characterHandlers.length; i++) {
430                CharacterHandler ch = characterHandlers[i];
431                if (!skipped && (ch instanceof NormalizationChecker)) {
432                    skipped = true;
433                } else {
434                    newHandlers[j++] = ch;
435                }
436            }
437            characterHandlers = newHandlers;
438        }
439    
440        public void addCharacterHandler(CharacterHandler characterHandler) {
441            if (characterHandler == null) {
442                throw new IllegalArgumentException("Null argument.");
443            }
               // Grow by one; existing handlers keep their order, the new one is last.
444            final int oldCount = characterHandlers.length;
445            final CharacterHandler[] grown = new CharacterHandler[oldCount + 1];
446            System.arraycopy(characterHandlers, 0, grown, 0, oldCount);
447            grown[oldCount] = characterHandler;
448            characterHandlers = grown;
449        }
450    
451        /**
452         * Reports whether a <code>NormalizationChecker</code> is registered.
453         * 
454         * @return <code>true</code> if at least one character handler is one
455         */
456        public boolean isCheckingNormalization() {
457            boolean found = false;
458            int i = 0;
459            while (!found && i < characterHandlers.length) {
460                found = characterHandlers[i] instanceof NormalizationChecker;
461                i++;
462            }
463            return found;
464        }
465    
466        /**
467         * Sets the error handler, forwarding it to normalization checkers.
468         * 
469         * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
470         */
471        public void setErrorHandler(ErrorHandler eh) {
472            this.errorHandler = eh;
473            final CharacterHandler[] handlers = characterHandlers;
474            for (int i = 0; i < handlers.length; i++) {
475                if (!(handlers[i] instanceof NormalizationChecker)) {
476                    continue;
477                }
478                ((NormalizationChecker) handlers[i]).setErrorHandler(eh);
479            }
480        }
481    
482        /**
483         * Returns the policy applied to comments.
484         * 
485         * @return the current comment policy
486         */
487        public XmlViolationPolicy getCommentPolicy() {
488            return this.commentPolicy;
489        }
490    
491        /**
492         * Sets the policy applied to comments.
493         * 
494         * @param commentPolicy
495         *            the new comment policy
496         */
497        public void setCommentPolicy(final XmlViolationPolicy commentPolicy) {
498            this.commentPolicy = commentPolicy;
499        }
500    
501        /**
502         * Returns the policy applied to non-space non-XML characters.
503         * 
504         * @return the current contentNonXmlCharPolicy
505         */
506        public XmlViolationPolicy getContentNonXmlCharPolicy() {
507            return this.contentNonXmlCharPolicy;
508        }
509    
510        /**
511         * Sets the policy applied to non-space non-XML characters.
512         * 
513         * @param contentNonXmlCharPolicy
514         *            the new contentNonXmlCharPolicy
515         */
516        public void setContentNonXmlCharPolicy(
517                final XmlViolationPolicy contentNonXmlCharPolicy) {
518            this.contentNonXmlCharPolicy = contentNonXmlCharPolicy;
519        }
520    
521        /**
522         * Returns the policy applied to vertical tab and form feed.
523         * 
524         * @return the current contentSpacePolicy
525         */
526        public XmlViolationPolicy getContentSpacePolicy() {
527            return this.contentSpacePolicy;
528        }
529    
530        /**
531         * Sets the policy applied to vertical tab and form feed.
532         * 
533         * @param contentSpacePolicy
534         *            the new contentSpacePolicy
535         */
536        public void setContentSpacePolicy(final XmlViolationPolicy contentSpacePolicy) {
537            this.contentSpacePolicy = contentSpacePolicy;
538        }
539    
540        /**
541         * Sets the xmlnsPolicy. <code>FATAL</code> is not accepted here.
542         * 
543         * @param xmlnsPolicy
544         *            the xmlnsPolicy to set; must not be <code>FATAL</code>
            * @throws IllegalArgumentException
            *             if <code>xmlnsPolicy</code> is <code>FATAL</code>
545         */
546        public void setXmlnsPolicy(final XmlViolationPolicy xmlnsPolicy) {
547            if (XmlViolationPolicy.FATAL == xmlnsPolicy) {
548                throw new IllegalArgumentException("Can't use FATAL here.");
549            }
550            this.xmlnsPolicy = xmlnsPolicy;
551        }
552    
553        public void setNamePolicy(final XmlViolationPolicy namePolicy) {
               // Stored as given; this setter rejects no value.
554            this.namePolicy = namePolicy;
555        }
556    
557        /**
558         * Replaces the bogusXmlnsPolicy; the value is stored without checks.
559         * 
560         * @param bogusXmlnsPolicy
561         *            the new bogusXmlnsPolicy
562         */
563        public void setBogusXmlnsPolicy(final XmlViolationPolicy bogusXmlnsPolicy) {
564            this.bogusXmlnsPolicy = bogusXmlnsPolicy;
565        }
566    
567        /**
568         * Replaces the html4ModeCompatibleWithXhtml1Schemata flag.
569         * 
570         * @param html4ModeCompatibleWithXhtml1Schemata
571         *            the new value of the flag
572         */
573        public void setHtml4ModeCompatibleWithXhtml1Schemata(
574                final boolean html4ModeCompatibleWithXhtml1Schemata) {
575            this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
576        }
577    
578        /**
579         * Runs the tokenization. This is the main entry point.
            * 
            * Takes the character stream of the input source or, if there is none,
            * wraps its byte stream in an <code>HtmlInputStreamReader</code>. Then
            * resets the per-run state, notifies the token handler and the
            * character handlers, drops a leading U+FEFF and enters the data state.
            * The reader is closed when the run ends, even if the end-of-run
            * notifications throw.
580         * 
581         * @param is
582         *            the input source; must not be <code>null</code>
583         * @throws SAXException
584         *             on fatal error (if configured to treat XML violations as
585         *             fatal), if the token handler threw, or if the input
            *             source has neither a character nor a byte stream
586         * @throws IOException
587         *             if the stream threw
            * @throws IllegalArgumentException
            *             if <code>is</code> is <code>null</code>
588         */
589        public void tokenize(InputSource is) throws SAXException, IOException {
590            if (is == null) {
591                throw new IllegalArgumentException("InputSource was null.");
592            }
593            swallowBom = true;
594            this.systemId = is.getSystemId();
595            this.publicId = is.getPublicId();
596            this.reader = is.getCharacterStream();
597            CharsetDecoder decoder = decoderFromExternalDeclaration(is.getEncoding());
598            if (this.reader == null) {
599                InputStream inputStream = is.getByteStream();
600                if (inputStream == null) {
601                    throw new SAXException("Both streams in InputSource were null.");
602                }
603                if (decoder == null) {
604                    this.reader = new HtmlInputStreamReader(inputStream,
605                            errorHandler, this, this);
606                } else {
607                    this.reader = new HtmlInputStreamReader(inputStream,
608                            errorHandler, this, this, decoder);
609                }
610            }
611            contentModelFlag = ContentModelFlag.PCDATA;
612            escapeFlag = false;
613            inContent = true;
614            pos = -1;
615            cstart = -1;
616            line = linePrev = 0;
617            col = colPrev = 1;
618            nextCharOnNewLine = true;
619            prev = '\u0000';
620            bufLen = 0;
621            nonAsciiProhibited = false;
622            alreadyComplainedAboutNonAscii = false;
623            html4 = false;
624            alreadyWarnedAboutPrivateUseCharacters = false;
625            metaBoundaryPassed = false;
626            tokenHandler.start(this);
627            for (int i = 0; i < characterHandlers.length; i++) {
628                characterHandlers[i].start();
629            }
630            wantsComments = tokenHandler.wantsComments();
631            try {
632                if (swallowBom) {
                       // Swallow the BOM; any other first char is pushed back.
633                    char c = read();
634                    if (c == '\uFEFF') {
635                        line = linePrev = 0;
636                        col = colPrev = 1;
637                        nextCharOnNewLine = true;
638                    } else {
639                        unread(c);
640                    }
641                }
642                dataState();
643            } finally {
644                try {
645                    systemIdentifier = null;
646                    publicIdentifier = null;
647                    doctypeName = null;
648                    tagName = null;
649                    attributeName = null;
650                    tokenHandler.eof();
651                    for (int i = 0; i < characterHandlers.length; i++) {
652                        characterHandlers[i].end();
653                    }
654                } finally {
                       // Close the reader even when eof()/end() above throws.
655                    reader.close();
656                }
657            }
658        }
659    
660        // For the token handler to call
661        /**
662         * Sets the content model flag and the associated element name.
663         * 
664         * @param contentModelFlag
665         *            the flag
666         * @param contentModelElement
667         *            the element causing the flag to be set
668         */
669        public void setContentModelFlag(ContentModelFlag contentModelFlag,
670                String contentModelElement) {
671            this.contentModelFlag = contentModelFlag;
672            this.contentModelElement = contentModelElement;
673        }
674    
675        // start Locator impl
676    
677        /**
678         * @see org.xml.sax.Locator#getPublicId()
679         */
680        public String getPublicId() {
681            return publicId;
682        }
683    
684        /**
685         * @see org.xml.sax.Locator#getSystemId()
686         */
687        public String getSystemId() {
688            return systemId;
689        }
690    
691        /**
692         * @see org.xml.sax.Locator#getLineNumber()
693         */
694        public int getLineNumber() {
695            if (line > 0) {
696                return line;
697            } else {
698                return -1;
699            }
700        }
701    
702        /**
703         * @see org.xml.sax.Locator#getColumnNumber()
704         */
705        public int getColumnNumber() {
706            if (col > 0) {
707                return col;
708            } else {
709                return -1;
710            }
711        }
712    
713        // end Locator impl
714    
715        // end public API
716    
717        void notifyAboutMetaBoundary() {
718            metaBoundaryPassed = true;
719        }
720    
721        void turnOnAdditionalHtml4Errors() {
722            html4 = true;
723        }
724    
725        void dontSwallowBom() {
726            swallowBom = false;
727        }
728    
729        void noEncodingDeclared() {
730            nonAsciiProhibited = true;
731        }
732    
733        AttributesImpl newAttributes() {
734            if (mappingLangToXmlLang) {
735                return new XmlLangAttributesImpl();
736            } else {
737                return new AttributesImpl();
738            }
739        }
740    
741        /**
742         * Clears the smaller buffer.
743         */
744        private void clearStrBuf() {
745            strBufLen = 0;
746        }
747    
748        /**
749         * Appends to the smaller buffer.
750         * 
751         * @param c
752         *            the UTF-16 code unit to append
753         */
754        private void appendStrBuf(char c) {
755            if (strBufLen == strBuf.length) {
756                char[] newBuf = new char[strBuf.length + BUFFER_GROW_BY];
757                System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
758                strBuf = newBuf;
759            }
760            strBuf[strBufLen++] = c;
761        }
762    
763        /**
764         * The smaller buffer as a string.
765         * 
766         * @return the smaller buffer as a string
767         */
768        private String strBufToString() {
769            return new String(strBuf, 0, strBufLen);
770        }
771    
772        /**
773         * Emits the smaller buffer as character tokens.
774         * 
775         * @throws SAXException
776         *             if the token handler threw
777         */
778        private void emitStrBuf() throws SAXException {
779            if (strBufLen > 0) {
780                tokenHandler.characters(strBuf, 0, strBufLen);
781            }
782        }
783    
784        private boolean isNcname(String str) {
785            Matcher m = NCNAME_PATTERN.matcher(str);
786            return m.matches();
787        }
788    
789        /**
790         * Clears the larger buffer.
791         */
792        private void clearLongStrBuf() {
793            longStrBufLen = 0;
794            longStrBufPending = '\u0000';
795        }
796    
797        /**
798         * Appends to the larger buffer.
799         * 
800         * @param c
801         *            the UTF-16 code unit to append
802         */
803        private void appendLongStrBuf(char c) {
804            if (longStrBufLen == longStrBuf.length) {
805                char[] newBuf = new char[longStrBuf.length + BUFFER_GROW_BY];
806                System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
807                longStrBuf = newBuf;
808            }
809            longStrBuf[longStrBufLen++] = c;
810        }
811    
812        /**
813         * Appends to the larger buffer when it is used to buffer a comment. Checks
814         * for two consecutive hyphens.
815         * 
816         * @param c
817         *            the UTF-16 code unit to append
818         * @throws SAXException
819         */
820        private void appendToComment(char c) throws SAXException {
821            if (longStrBufPending == '-' && c == '-') {
822                if (commentPolicy == XmlViolationPolicy.FATAL) {
823                    fatal("This document is not mappable to XML 1.0 without data loss to \u201C--\u201D in a comment.");
824                } else {
825                    warn("This document is not mappable to XML 1.0 without data loss to \u201C--\u201D in a comment.");
826                    if (wantsComments) {
827                        if (commentPolicy == XmlViolationPolicy.ALLOW) {
828                            appendLongStrBuf('-');
829                        } else {
830                            appendLongStrBuf('-');
831                            appendLongStrBuf(' ');
832                        }
833                    }
834                    longStrBufPending = '-';
835                }
836            } else {
837                if (longStrBufPending != '\u0000') {
838                    if (wantsComments) {
839                        appendLongStrBuf(longStrBufPending);
840                    }
841                    longStrBufPending = '\u0000';
842                }
843                if (c == '-') {
844                    longStrBufPending = '-';
845                } else {
846                    if (wantsComments) {
847                        appendLongStrBuf(c);
848                    }
849                }
850            }
851        }
852    
853        /**
854         * Appends to the larger buffer.
855         * 
856         * @param arr
857         *            the UTF-16 code units to append
858         */
859        private void appendLongStrBuf(char[] arr) {
860            for (int i = 0; i < arr.length; i++) {
861                appendLongStrBuf(arr[i]);
862            }
863        }
864    
865        /**
866         * Append the contents of the smaller buffer to the larger one.
867         */
868        private void appendStrBufToLongStrBuf() {
869            for (int i = 0; i < strBufLen; i++) {
870                appendLongStrBuf(strBuf[i]);
871            }
872        }
873    
874        /**
875         * The larger buffer as a string.
876         * 
877         * @return the larger buffer as a string
878         */
879        private String longStrBufToString() {
880            if (longStrBufPending != '\u0000') {
881                appendLongStrBuf(longStrBufPending);
882            }
883            return new String(longStrBuf, 0, longStrBufLen);
884        }
885    
886        /**
887         * Emits the current comment token.
888         * 
889         * @throws SAXException
890         */
891        private void emitComment() throws SAXException {
892            if (wantsComments) {
893                if (longStrBufPending != '\u0000') {
894                    appendLongStrBuf(longStrBufPending);
895                }
896            }
897            tokenHandler.comment(longStrBuf, longStrBufLen);
898        }
899    
900        /**
901         * Unreads a code unit so that it is returned the next time
902         * <code>read()</code> is called.
903         * 
904         * @param c
905         *            the code unit to unread
906         */
907        private void unread(char c) {
908            unreadBuffer = c;
909        }
910    
911        /**
912         * Reads the next UTF-16 code unit.
913         * 
914         * @return the next code unit
915         * @throws SAXException
916         * @throws IOException
917         */
918        private char read() throws SAXException, IOException {
919            for (;;) { // the loop is here for the CRLF case
920                if (unreadBuffer != -1) {
921                    char c = (char) unreadBuffer;
922                    unreadBuffer = -1;
923                    return c;
924                }
925                assert (bufLen > -1);
926                pos++;
927                assert pos <= bufLen;
928                linePrev = line;
929                colPrev = col;
930                if (nextCharOnNewLine) {
931                    line++;
932                    col = 1;
933                    nextCharOnNewLine = false;
934                } else {
935                    col++;
936                }
937                if (pos == bufLen) {
938                    boolean charDataContinuation = false;
939                    if (cstart > -1) {
940                        flushChars();
941                        charDataContinuation = true;
942                    }
943                    bufLen = reader.read(buf);
944                    assert bufLen <= buf.length;
945                    if (bufLen == -1) {
946                        return '\u0000';
947                    } else {
948                        for (int i = 0; i < characterHandlers.length; i++) {
949                            CharacterHandler ch = characterHandlers[i];
950                            ch.characters(buf, 0, bufLen);
951                        }
952                    }
953                    if (charDataContinuation) {
954                        cstart = 0;
955                    }
956                    pos = 0;
957                }
958                char c = buf[pos];
959                if (c > '\u007F' && nonAsciiProhibited
960                        && !alreadyComplainedAboutNonAscii) {
961                    err("The character encoding of the document was not explicit but the document contains non-ASCII.");
962                }
963                switch (c) {
964                    case '\n':
965                        /*
966                         * U+000D CARRIAGE RETURN (CR) characters, and U+000A LINE
967                         * FEED (LF) characters, are treated specially. Any CR
968                         * characters that are followed by LF characters must be
969                         * removed, and any CR characters not followed by LF
970                         * characters must be converted to LF characters.
971                         */
972                        if (prev == '\r') {
973                            // swallow the LF
974                            if (cstart != -1) {
975                                flushChars();
976                                cstart = pos + 1;
977                            }
978                            col = colPrev;
979                            line = linePrev;
980                            nextCharOnNewLine = true;
981                            prev = c;
982                            continue;
983                        } else {
984                            nextCharOnNewLine = true;
985                        }
986                        break;
987                    case '\r':
988                        c = buf[pos] = '\n';
989                        nextCharOnNewLine = true;
990                        prev = '\r';
991                        if (contentModelFlag != ContentModelFlag.PCDATA) {
992                            prevFourPtr++;
993                            prevFourPtr %= 4;
994                            prevFour[prevFourPtr] = c;
995                        }
996                        return c;
997                    case '\u0000':
998                        /*
999                         * All U+0000 NULL characters in the input must be replaced
1000                         * by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
1001                         * characters is a parse error.
1002                         */
1003                        err("Found U+0000 in the character stream.");
1004                        c = buf[pos] = '\uFFFD';
1005                        break;
1006                    case '\u000B':
1007                    case '\u000C':
1008                        if (inContent) {
1009                            if (contentNonXmlCharPolicy == XmlViolationPolicy.FATAL) {
1010                                fatal("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character.");
1011                            } else {
1012                                if (contentNonXmlCharPolicy == XmlViolationPolicy.ALTER_INFOSET) {
1013                                    c = buf[pos] = ' ';
1014                                }
1015                                warn("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character.");
1016                            }
1017                        }
1018                        break;
1019                    default:
1020                        if ((c & 0xFC00) == 0xDC00) {
1021                            // Got a low surrogate. See if prev was high surrogate
1022                            if ((prev & 0xFC00) == 0xD800) {
1023                                int intVal = (prev << 10) + c + SURROGATE_OFFSET;
1024                                if (isNonCharacter(intVal)) {
1025                                    warn("Astral non-character.");
1026                                }
1027                                if (isAstralPrivateUse(intVal)) {
1028                                    warnAboutPrivateUseChar();
1029                                }
1030                            } else {
1031                                // XXX figure out what to do about lone high
1032                                // surrogates
1033                                err("Found low surrogate without high surrogate.");
1034                                c = buf[pos] = '\uFFFD';
1035                            }
1036                        } else if (inContent && (c < ' ' || isNonCharacter(c))
1037                                && (c != '\t')) {
1038                            if (contentNonXmlCharPolicy == XmlViolationPolicy.FATAL) {
1039                                fatal("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character.");
1040                            } else {
1041                                if (contentNonXmlCharPolicy == XmlViolationPolicy.ALTER_INFOSET) {
1042                                    c = buf[pos] = '\uFFFD';
1043                                }
1044                                warn("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character.");
1045                            }
1046                        } else if (isPrivateUse(c)) {
1047                            warnAboutPrivateUseChar();
1048                        }
1049                }
1050                prev = c;
1051                if (contentModelFlag != ContentModelFlag.PCDATA) {
1052                    prevFourPtr++;
1053                    prevFourPtr %= 4;
1054                    prevFour[prevFourPtr] = c;
1055                }
1056                return c;
1057            }
1058        }
1059    
1060        /**
1061         * Emits a warning about private use characters if the warning has not been
1062         * emitted yet.
1063         * 
1064         * @throws SAXException
1065         */
1066        private void warnAboutPrivateUseChar() throws SAXException {
1067            if (!alreadyWarnedAboutPrivateUseCharacters) {
1068                warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. (Charmod C073)");
1069                alreadyWarnedAboutPrivateUseCharacters = true;
1070            }
1071        }
1072    
1073        /**
1074         * Tells if the argument is a BMP PUA character.
1075         * 
1076         * @param c
1077         *            the UTF-16 code unit to check
1078         * @return <code>true</code> if PUA character
1079         */
1080        private boolean isPrivateUse(char c) {
1081            return c >= '\uE000' && c <= '\uF8FF';
1082        }
1083    
1084        /**
1085         * Tells if the argument is an astral PUA character.
1086         * 
1087         * @param c
1088         *            the code point to check
1089         * @return <code>true</code> if astral private use
1090         */
1091        private boolean isAstralPrivateUse(int c) {
1092            return (c >= 0xF0000 && c <= 0xFFFFD)
1093                    || (c >= 0x100000 && c <= 0x10FFFD);
1094        }
1095    
1096        /**
1097         * Tells if the argument is a non-character (works for BMP and astral).
1098         * 
1099         * @param c
1100         *            the code point to check
1101         * @return <code>true</code> if non-character
1102         */
1103        private boolean isNonCharacter(int c) {
1104            return (c & 0xFFFE) == 0xFFFE;
1105        }
1106    
1107        /**
1108         * Flushes coalesced character tokens.
1109         * 
1110         * @throws SAXException
1111         */
1112        private void flushChars() throws SAXException, IOException {
1113            if (cstart != -1) {
1114                if (pos > cstart) {
1115                    int currLine = line;
1116                    int currCol = col;
1117                    line = linePrev;
1118                    col = colPrev;
1119                    try {
1120                        tokenHandler.characters(buf, cstart, pos - cstart);
1121                    } finally {
1122                        line = currLine;
1123                        col = currCol;
1124                    }
1125                }
1126            }
1127            cstart = -1;
1128        }
1129    
1130        /**
1131         * Reports an condition that would make the infoset incompatible with XML
1132         * 1.0 as fatal.
1133         * 
1134         * @param message
1135         *            the message
1136         * @throws SAXException
1137         * @throws SAXParseException
1138         */
1139        private void fatal(String message) throws SAXException {
1140            SAXParseException spe = new SAXParseException(message, this);
1141            if (errorHandler != null) {
1142                errorHandler.fatalError(spe);
1143            }
1144            throw spe;
1145        }
1146    
1147        /**
1148         * Reports a Parse Error.
1149         * 
1150         * @param message
1151         *            the message
1152         * @throws SAXException
1153         */
1154        private void err(String message) throws SAXException {
1155            if (errorHandler == null) {
1156                return;
1157            }
1158            SAXParseException spe = new SAXParseException(message, this);
1159            errorHandler.error(spe);
1160        }
1161    
1162        /**
1163         * Reports a warning
1164         * 
1165         * @param message
1166         *            the message
1167         * @throws SAXException
1168         */
1169        private void warn(String message) throws SAXException {
1170            if (errorHandler == null) {
1171                return;
1172            }
1173            SAXParseException spe = new SAXParseException(message, this);
1174            errorHandler.warning(spe);
1175        }
1176    
1177        /**
1178         * Initializes a decoder from external decl.
1179         */
1180        private CharsetDecoder decoderFromExternalDeclaration(String encoding)
1181                throws SAXException {
1182            if (encoding == null) {
1183                return null;
1184            }
1185            encoding = encoding.toUpperCase();
1186            if ("ISO-8859-1".equals(encoding)) {
1187                encoding = "Windows-1252";
1188            }
1189            if ("UTF-16".equals(encoding) || "UTF-32".equals(encoding)) {
1190                swallowBom = false;
1191            }
1192            try {
1193                Charset cs = Charset.forName(encoding);
1194                String canonName = cs.name();
1195                if (canonName.startsWith("X-") || canonName.startsWith("x-")
1196                        || canonName.startsWith("Mac")) {
1197                    if (encoding.startsWith("X-")) {
1198                        err("The encoding \u201C"
1199                                + encoding
1200                                + "\u201D is not an IANA-registered encoding. (Charmod C022)");
1201                    } else {
1202                        err("The encoding \u201C"
1203                                + encoding
1204                                + "\u201D is not an IANA-registered encoding and did\u2019t start with \u201CX-\u201D. (Charmod C023)");
1205                    }
1206                } else if (!canonName.equalsIgnoreCase(encoding)) {
1207                    err("The encoding \u201C"
1208                            + encoding
1209                            + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
1210                            + canonName + "\u201D. (Charmod C024)");
1211                }
1212                if (EncodingInfo.isObscure(canonName)) {
1213                    warn("The character encoding \u201C"
1214                            + encoding
1215                            + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
1216                }
1217                return cs.newDecoder();
1218            } catch (IllegalCharsetNameException e) {
1219                err("Illegal character encoding name: \u201C" + encoding
1220                        + "\u201D. Will sniff.");
1221            } catch (UnsupportedCharsetException e) {
1222                err("Unsupported character encoding name: \u201C" + encoding
1223                        + "\u201D. Will sniff.");
1224                swallowBom = true;
1225            }
1226            return null; // keep the compiler happy
1227        }
1228    
1229        private boolean currentIsVoid() {
1230            return Arrays.binarySearch(VOID_ELEMENTS, tagName) > -1;
1231        }
1232    
1233        /**
1234         * Data state
1235         * 
1236         * @throws IOException
1237         * @throws SAXException
1238         * 
1239         */
1240        private void dataState() throws SAXException, IOException {
1241            char c = '\u0000';
1242            for (;;) {
1243                c = read();
1244                if (c == '&'
1245                        && (contentModelFlag == ContentModelFlag.PCDATA || (contentModelFlag == ContentModelFlag.RCDATA)
1246                                && !escapeFlag)) {
1247                    /*
1248                     * U+0026 AMPERSAND (&) When the content model flag is set to
1249                     * one of the PCDATA or RCDATA states: switch to the entity data
1250                     * state. Otherwise: treat it as per the "anything else" entry
1251                     * below.
1252                     */
1253                    flushChars();
1254                    entityDataState();
1255                    continue;
1256                } else if (c == '<'
1257                        && ((contentModelFlag == ContentModelFlag.PCDATA) || (escapeFlag == false && (contentModelFlag == ContentModelFlag.CDATA || contentModelFlag == ContentModelFlag.RCDATA)))) {
1258                    /*
1259                     * U+003C LESS-THAN SIGN (<) When the content model flag is set
1260                     * to the PCDATA state: switch to the tag open state. When the
1261                     * content model flag is set to either the RCDATA state or the
1262                     * CDATA state and the escape flag is false: switch to the tag
1263                     * open state. Otherwise: treat it as per the "anything else"
1264                     * entry below.
1265                     */
1266                    flushChars();
1267                    resetAttributes();
1268                    inContent = false;
1269                    tagOpenState();
1270                    inContent = true;
1271                    continue;
1272                } else if (c == '\u0000') {
1273                    /*
1274                     * EOF Emit an end-of-file token.
1275                     */
1276                    flushChars();
1277                    return; // eof() called in parent finally block
1278                } else {
1279                    if (c == '-'
1280                            && (escapeFlag == false)
1281                            && (contentModelFlag == ContentModelFlag.RCDATA || contentModelFlag == ContentModelFlag.CDATA)
1282                            && lastLtExclHyph()) {
1283                        /*
1284                         * U+002D HYPHEN-MINUS (-) If the content model flag is set
1285                         * to either the RCDATA state or the CDATA state, and the
1286                         * escape flag is false, and there are at least three
1287                         * characters before this one in the input stream, and the
1288                         * last four characters in the input stream, including this
1289                         * one, are U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK,
1290                         * U+002D HYPHEN-MINUS, and U+002D HYPHEN-MINUS ("<!--"),
1291                         * then set the escape flag to true.
1292                         * 
1293                         * In any case, emit the input character as a character
1294                         * token. Stay in the data state.
1295                         */
1296                        escapeFlag = true;
1297                    } else if (c == '>' && escapeFlag && lastHyphHyph()) {
1298                        /*
1299                         * U+003E GREATER-THAN SIGN (>) If the content model flag is
1300                         * set to either the RCDATA state or the CDATA state, and
1301                         * the escape flag is true, and the last three characters in
1302                         * the input stream including this one are U+002D
1303                         * HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN
1304                         * SIGN ("-->"), set the escape flag to false.
1305                         * 
1306                         * In any case, emit the input character as a character
1307                         * token. Stay in the data state.
1308                         */
1309                        escapeFlag = false;
1310                    }
1311                    /*
1312                     * Anything else Emit the input character as a character token.
1313                     */
1314                    if (cstart == -1) {
1315                        // start coalescing character tokens
1316                        cstart = pos;
1317                    }
1318                    /*
1319                     * Stay in the data state.
1320                     */
1321                    continue;
1322                }
1323            }
1324        }
1325    
1326        private boolean lastHyphHyph() {
1327            return prevFour[(prevFourPtr - 1 + 4) % 4] == '-'
1328                    && prevFour[(prevFourPtr - 2 + 4) % 4] == '-';
1329        }
1330    
1331        private boolean lastLtExclHyph() {
1332            return prevFour[(prevFourPtr - 1 + 4) % 4] == '-'
1333                    && prevFour[(prevFourPtr - 2 + 4) % 4] == '!'
1334                    && prevFour[(prevFourPtr - 3 + 4) % 4] == '<';
1335        }
1336    
1337        /**
1338         * 
1339         * Entity data state
1340         * 
1341         * @throws IOException
1342         * @throws SAXException
1343         */
1344        private void entityDataState() throws SAXException, IOException {
1345            /*
1346             * (This cannot happen if the content model flag is set to the CDATA
1347             * state.)
1348             * 
1349             * Attempt to consume an entity.
1350             */
1351            consumeEntity(false);
1352            /*
1353             * If nothing is returned, emit a U+0026 AMPERSAND character token.
1354             * 
1355             * Otherwise, emit the character token that was returned.
1356             */
1357            // Handled by consumeEntity()
1358            /*
1359             * Finally, switch to the data state.
1360             */
1361            return;
1362        }
1363    
1364        /**
1365         * Tag open state
1366         * 
1367         * @throws IOException
1368         * @throws SAXException
1369         */
1370        private void tagOpenState() throws SAXException, IOException {
1371            /*
1372             * The behaviour of this state depends on the content model flag.
1373             */
1374            // this can't happen in PLAINTEXT, so using not PCDATA as the condition
1375            if (contentModelFlag != ContentModelFlag.PCDATA) {
1376                /*
1377                 * If the content model flag is set to the RCDATA or CDATA states
1378                 * Consume the next input character.
1379                 */
1380                char c = read();
1381                if (c == '/') {
1382                    /*
1383                     * If it is a U+002F SOLIDUS (/) character, switch to the close
1384                     * tag open state.
1385                     */
1386                    closeTagOpenState();
1387                    return;
1388                } else {
1389                    /*
1390                     * Otherwise, emit a U+003C LESS-THAN SIGN character token
1391                     */
1392                    tokenHandler.characters(LT_GT, 0, 1);
1393                    /*
1394                     * and reconsume the current input character in the data state.
1395                     */
1396                    unread(c);
1397                    return;
1398                }
1399            } else {
1400                /*
1401                 * If the content model flag is set to the PCDATA state Consume the
1402                 * next input character:
1403                 */
1404                char c = read();
1405                if (c == '!') {
1406                    /*
1407                     * U+0021 EXCLAMATION MARK (!) Switch to the markup declaration
1408                     * open state.
1409                     */
1410                    markupDeclarationOpenState();
1411                    return;
1412                } else if (c == '/') {
1413                    /* U+002F SOLIDUS (/) Switch to the close tag open state. */
1414                    closeTagOpenState();
1415                    return;
1416                } else if (c >= 'A' && c <= 'Z') {
1417                    /*
1418                     * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL
1419                     * LETTER Z Create a new start tag token,
1420                     */
1421                    endTag = false;
1422                    /*
1423                     * set its tag name to the lowercase version of the input
1424                     * character (add 0x0020 to the character's code point),
1425                     */
1426                    clearStrBuf();
1427                    appendStrBuf((char) (c + 0x20));
1428                    /* then switch to the tag name state. */
1429                    tagNameState();
1430                    /*
1431                     * (Don't emit the token yet; further details will be filled in
1432                     * before it is emitted.)
1433                     */
1434                    return;
1435                } else if (c >= 'a' && c <= 'z') {
1436                    /*
1437                     * U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL
1438                     * LETTER Z Create a new start tag token,
1439                     */
1440                    endTag = false;
1441                    /*
1442                     * set its tag name to the input character,
1443                     */
1444                    clearStrBuf();
1445                    appendStrBuf(c);
1446                    /* then switch to the tag name state. */
1447                    tagNameState();
1448                    /*
1449                     * (Don't emit the token yet; further details will be filled in
1450                     * before it is emitted.)
1451                     */
1452                    return;
1453                } else if (c == '>') {
1454                    /*
1455                     * U+003E GREATER-THAN SIGN (>) Parse error.
1456                     */
1457                    err("Bad character \u201C>\u201D in the tag open state.");
1458                    /*
1459                     * Emit a U+003C LESS-THAN SIGN character token and a U+003E
1460                     * GREATER-THAN SIGN character token.
1461                     */
1462                    tokenHandler.characters(LT_GT, 0, 2);
1463                    /* Switch to the data state. */
1464                    return;
1465                } else if (c == '?') {
1466                    /*
1467                     * U+003F QUESTION MARK (?) Parse error.
1468                     */
1469                    err("Bad character \u201C?\u201D in the tag open state.");
1470                    /*
1471                     * Switch to the bogus comment state.
1472                     */
1473                    clearLongStrBuf();
1474                    appendLongStrBuf(c);
1475                    bogusCommentState();
1476                    return;
1477                } else {
1478                    /*
1479                     * Anything else Parse error.
1480                     */
1481                    err("Bad character \u201C" + c
1482                            + "\u201D in the tag open state.");
1483                    /*
1484                     * Emit a U+003C LESS-THAN SIGN character token
1485                     */
1486                    tokenHandler.characters(LT_GT, 0, 1);
1487                    /*
1488                     * and reconsume the current input character in the data state.
1489                     */
1490                    unread(c);
1491                    return;
1492                }
1493            }
1494        }
1495    
1496        /**
1497         * Close tag open state
1498         * 
1499         * @throws IOException
1500         * @throws SAXException
1501         */
1502        private void closeTagOpenState() throws SAXException, IOException {
1503            // this can't happen in PLAINTEXT, so using not PCDATA as the condition
1504            if (contentModelFlag != ContentModelFlag.PCDATA
1505                    && contentModelElement != null) {
1506                /*
1507                 * If the content model flag is set to the RCDATA or CDATA states
1508                 * but no start tag token has ever been emitted by this instance of
1509                 * the tokeniser (fragment case), or, if the content model flag is
1510                 * set to the RCDATA or CDATA states and the next few characters do
1511                 * not match the tag name of the last start tag token emitted (case
1512                 * insensitively), or if they do but they are not immediately
1513                 * followed by one of the following characters: + U+0009 CHARACTER
1514                 * TABULATION + U+000A LINE FEED (LF) + U+000B LINE TABULATION +
1515                 * U+000C FORM FEED (FF) + U+0020 SPACE + U+003E GREATER-THAN SIGN
1516                 * (>) + U+002F SOLIDUS (/) + EOF
1517                 * 
1518                 * ...then emit a U+003C LESS-THAN SIGN character token, a U+002F
1519                 * SOLIDUS character token, and switch to the data state to process
1520                 * the next input character.
1521                 */
1522                // Let's implement the above without lookahead. strBuf holds
1523                // characters that need to be emitted if looking for an end tag
1524                // fails.
1525                // Duplicating the relevant part of tag name state here as well.
1526                clearStrBuf();
1527                for (int i = 0; i < contentModelElement.length(); i++) {
1528                    char e = contentModelElement.charAt(i);
1529                    char c = read();
1530                    char folded = c;
1531                    if (c >= 'A' && c <= 'Z') {
1532                        folded += 0x20;
1533                    }
1534                    if (folded != e) {
1535                        if (i > 0 || (folded >= 'a' && folded <= 'z')) {
1536                            if (html4) {
1537                                if (!"iframe".equals(contentModelElement)) {
1538                                    err((contentModelFlag == ContentModelFlag.CDATA ? "CDATA"
1539                                            : "RCDATA")
1540                                            + " element \u201C"
1541                                            + contentModelElement
1542                                            + "\u201D contained the string \u201C</\u201D, but it was not the start of the end tag. (HTML4-only error)");
1543                                }
1544                            } else {
1545                                warn((contentModelFlag == ContentModelFlag.CDATA ? "CDATA"
1546                                        : "RCDATA")
1547                                        + " element \u201C"
1548                                        + contentModelElement
1549                                        + "\u201D contained the string \u201C</\u201D, but this did not close the element.");
1550                            }
1551                        }
1552                        tokenHandler.characters(LT_SOLIDUS, 0, 2);
1553                        emitStrBuf();
1554                        unread(c);
1555                        return;
1556                    }
1557                    appendStrBuf(c);
1558                }
1559                endTag = true;
1560                tagName = contentModelElement;
1561                char c = read();
1562                switch (c) {
1563                    case ' ':
1564                    case '\t':
1565                    case '\n':
1566                    case '\u000B':
1567                    case '\u000C':
1568                        /*
1569                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
1570                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch
1571                         * to the before attribute name state.
1572                         */
1573                        beforeAttributeNameState();
1574                        return;
1575                    case '>':
1576                        /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
1577                        emitCurrentTagToken();
1578                        /*
1579                         * Switch to the data state.
1580                         */
1581                        return;
1582                    case '\u0000':
1583                        /*
1584                         * EOF Parse error.
1585                         */
1586                        err("Expected \u201C>\u201D but saw end of file instead.");
1587                        /*
1588                         * Emit the current tag token.
1589                         */
1590                        emitCurrentTagToken();
1591                        /* Reconsume the character in the data state. */
1592                        unread(c);
1593                        return;
1594                    case '/':
1595                        /*
1596                         * U+002F SOLIDUS (/) Parse error unless this is a permitted
1597                         * slash.
1598                         */
1599                        // never permitted here
1600                        err("Stray \u201C/\u201D in end tag.");
1601                        /* Switch to the before attribute name state. */
1602                        beforeAttributeNameState();
1603                        return;
1604                    default:
1605                        if (html4) {
1606                            err((contentModelFlag == ContentModelFlag.CDATA ? "CDATA"
1607                                    : "RCDATA")
1608                                    + " element \u201C"
1609                                    + contentModelElement
1610                                    + "\u201D contained the string \u201C</\u201D, but it was not the start of the end tag. (HTML4-only error)");
1611                        } else {
1612                            warn((contentModelFlag == ContentModelFlag.CDATA ? "CDATA"
1613                                    : "RCDATA")
1614                                    + " element \u201C"
1615                                    + contentModelElement
1616                                    + "\u201D contained the string \u201C</\u201D, but this did not close the element.");
1617                        }
1618                        tokenHandler.characters(LT_SOLIDUS, 0, 2);
1619                        emitStrBuf();
1620                        cstart = pos; // don't drop the character
1621                        return;
1622                }
1623            } else {
1624                /*
1625                 * Otherwise, if the content model flag is set to the PCDATA state,
1626                 * or if the next few characters do match that tag name, consume the
1627                 * next input character:
1628                 */
1629                char c = read();
1630                if (c >= 'A' && c <= 'Z') {
1631                    /*
1632                     * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL
1633                     * LETTER Z Create a new end tag token,
1634                     */
1635                    endTag = true;
1636                    clearStrBuf();
1637                    /*
1638                     * set its tag name to the lowercase version of the input
1639                     * character (add 0x0020 to the character's code point),
1640                     */
1641                    appendStrBuf((char) (c + 0x20));
1642                    /*
1643                     * then switch to the tag name state. (Don't emit the token yet;
1644                     * further details will be filled in before it is emitted.)
1645                     */
1646                    tagNameState();
1647                    return;
1648                } else if (c >= 'a' && c <= 'z') {
1649                    /*
1650                     * U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL
1651                     * LETTER Z Create a new end tag token,
1652                     */
1653                    endTag = true;
1654                    clearStrBuf();
1655                    /*
1656                     * set its tag name to the input character,
1657                     */
1658                    appendStrBuf(c);
1659                    /*
1660                     * then switch to the tag name state. (Don't emit the token yet;
1661                     * further details will be filled in before it is emitted.)
1662                     */
1663                    tagNameState();
1664                    return;
1665                } else if (c == '>') {
1666                    /* U+003E GREATER-THAN SIGN (>) Parse error. */
1667                    err("Saw \u201C</>\u201D.");
1668                    /*
1669                     * Switch to the data state.
1670                     */
1671                    return;
1672                } else if (c == '\u0000') {
1673                    /* EOF Parse error. */
1674                    err("Saw \u201C</\u201D immediately before end of file.");
1675                    /*
1676                     * Emit a U+003C LESS-THAN SIGN character token and a U+002F
1677                     * SOLIDUS character token.
1678                     */
1679                    tokenHandler.characters(LT_SOLIDUS, 0, 2);
1680                    /*
1681                     * Reconsume the EOF character in the data state.
1682                     */
1683                    unread(c);
1684                    return;
1685                } else {
1686                    /* Anything else Parse error. */
1687                    err("Garbage after \u201C</\u201D.");
1688                    /*
1689                     * Switch to the bogus comment state.
1690                     */
1691                    clearLongStrBuf();
1692                    appendToComment(c);
1693                    bogusCommentState();
1694                    return;
1695                }
1696            }
1697        }
1698    
1699        /**
1700         * Tag name state
1701         * 
1702         * @throws IOException
1703         * @throws SAXException
1704         */
1705        private void tagNameState() throws SAXException, IOException {
1706            for (;;) {
1707                /*
1708                 * Consume the next input character:
1709                 */
1710                char c = read();
1711                switch (c) {
1712                    case ' ':
1713                    case '\t':
1714                    case '\n':
1715                    case '\u000B':
1716                    case '\u000C':
1717                        /*
1718                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
1719                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch
1720                         * to the before attribute name state.
1721                         */
1722                        tagName = strBufToElementNameString();
1723                        beforeAttributeNameState();
1724                        return;
1725                    case '>':
1726                        /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
1727                        tagName = strBufToElementNameString();
1728                        emitCurrentTagToken();
1729                        /*
1730                         * Switch to the data state.
1731                         */
1732                        return;
1733                    case '\u0000':
1734                        /*
1735                         * EOF Parse error.
1736                         */
1737                        err("End of file seen when looking for tag name");
1738                        /*
1739                         * Emit the current tag token.
1740                         */
1741                        tagName = strBufToElementNameString();
1742                        emitCurrentTagToken();
1743                        /*
1744                         * Reconsume the EOF character in the data state.
1745                         */
1746                        unread(c);
1747                        return;
1748                    case '/':
1749                        /*
1750                         * U+002F SOLIDUS (/) Parse error unless this is a permitted
1751                         * slash.
1752                         */
1753                        tagName = strBufToElementNameString();
1754                        parseErrorUnlessPermittedSlash();
1755                        /*
1756                         * Switch to the before attribute name state.
1757                         */
1758                        beforeAttributeNameState();
1759                        return;
1760                    default:
1761                        if (c >= 'A' && c <= 'Z') {
1762                            /*
1763                             * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN
1764                             * CAPITAL LETTER Z Append the lowercase version of the
1765                             * current input character (add 0x0020 to the
1766                             * character's code point) to the current tag token's
1767                             * tag name.
1768                             */
1769                            appendStrBuf((char) (c + 0x20));
1770                        } else {
1771                            /*
1772                             * Anything else Append the current input character to
1773                             * the current tag token's tag name.
1774                             */
1775                            appendStrBuf(c);
1776                        }
1777                        /*
1778                         * Stay in the tag name state.
1779                         */
1780                        continue;
1781                }
1782            }
1783        }
1784    
1785        private String strBufToElementNameString() {
1786            // TODO Generate a better interning function
1787            return strBufToString().intern();
1788        }
1789    
1790        /**
1791         * This method implements a wrapper loop for the attribute-related states to
1792         * avoid recursion to an arbitrary depth.
1793         * 
1794         * @throws IOException
1795         * @throws SAXException
1796         */
1797        private void beforeAttributeNameState() throws SAXException, IOException {
1798            while (beforeAttributeNameStateImpl()) {
1799                // Spin.
1800            }
1801        }
1802    
1803        /**
1804         * 
1805         */
1806        private void resetAttributes() {
1807            attributes = null; // XXX figure out reuse
1808        }
1809    
1810        /**
1811         * Before attribute name state
1812         * 
1813         * @throws IOException
1814         * @throws SAXException
1815         */
1816        private boolean beforeAttributeNameStateImpl() throws SAXException,
1817                IOException {
1818            /*
1819             * Consume the next input character:
1820             */
1821            for (;;) {
1822                char c = read();
1823                switch (c) {
1824                    case ' ':
1825                    case '\t':
1826                    case '\n':
1827                    case '\u000B':
1828                    case '\u000C':
1829                        /*
1830                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
1831                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
1832                         * in the before attribute name state.
1833                         */
1834                        continue;
1835                    case '>':
1836                        /*
1837                         * U+003E GREATER-THAN SIGN (>) Emit the current tag token.
1838                         */
1839                        emitCurrentTagToken();
1840                        /*
1841                         * Switch to the data state.
1842                         */
1843                        return false;
1844                    case '/':
1845                        /*
1846                         * U+002F SOLIDUS (/) Parse error unless this is a permitted
1847                         * slash.
1848                         */
1849                        parseErrorUnlessPermittedSlash();
1850                        /*
1851                         * Stay in the before attribute name state.
1852                         */
1853                        continue;
1854                    case '\u0000':
1855                        /* EOF Parse error. */
1856                        err("Saw end of file without the previous tag ending with \u201C>\u201C.");
1857                        /*
1858                         * Emit the current tag token.
1859                         */
1860                        emitCurrentTagToken();
1861                        /*
1862                         * Reconsume the EOF character in the data state.
1863                         */
1864                        unread(c);
1865                        return false;
1866                    default:
1867                        /*
1868                         * Anything else Start a new attribute in the current tag
1869                         * token.
1870                         */
1871                        clearStrBuf();
1872    
1873                        if (c >= 'A' && c <= 'Z') {
1874                            /*
1875                             * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN
1876                             * CAPITAL LETTER Z Set that attribute's name to the
1877                             * lowercase version of the current input character (add
1878                             * 0x0020 to the character's code point)
1879                             */
1880                            appendStrBuf((char) (c + 0x20));
1881                        } else {
1882                            /*
1883                             * Set that attribute's name to the current input
1884                             * character,
1885                             */
1886                            appendStrBuf(c);
1887                        }
1888                        /*
1889                         * and its value to the empty string.
1890                         */
1891                        // Will do later.
1892                        /*
1893                         * Switch to the attribute name state.
1894                         */
1895                        return attributeNameState();
1896                }
1897            }
1898        }
1899    
1900        private void parseErrorUnlessPermittedSlash() throws SAXException,
1901                IOException {
1902            /*
1903             * A permitted slash is a U+002F SOLIDUS character that is immediately
1904             * followed by a U+003E GREATER-THAN SIGN, if, and only if, the current
1905             * token being processed is a start tag token whose tag name is one of
1906             * the following: base, link, meta, hr, br, img, embed, param, area,
1907             * col, input
1908             */
1909            if (endTag) {
1910                err("Stray \u201C/\u201D in an end tag.");
1911                return;
1912            }
1913            char c = read();
1914            int saveLine = line;
1915            int saveCol = col;
1916            line = linePrev;
1917            col = colPrev;
1918            if (c == '>') {
1919                if (!currentIsVoid() && !html4) {
1920                    if (html4) {
1921                        err("Stray \u201C/\u201D in tag. The \u201C/>\u201D syntax is not permitted in HTML4.");
1922                    } else {
1923                        err("Stray \u201C/\u201D in tag. The \u201C/>\u201D syntax is only permitted on void elements.");
1924                    }
1925                } else if (html4) {
1926                    err("Stray \u201C/\u201D in tag. The \u201C/>\u201D syntax is not permitted in HTML4. (HTML4-only error)");
1927                }
1928            } else {
1929                err("Stray \u201C/\u201D in tag.");
1930            }
1931            line = saveLine;
1932            col = saveCol;
1933            unread(c);
1934        }
1935    
1936        private void emitCurrentTagToken() throws SAXException {
1937            if (namePolicy != XmlViolationPolicy.ALLOW) {
1938                if (!isNcname(tagName)) {
1939                    if (namePolicy == XmlViolationPolicy.FATAL) {
1940                        fatal((endTag ? "End" : "Start") + " tag \u201C" + tagName
1941                                + "\u201D has a non-NCName name.");
1942                    } else {
1943                        warn((endTag ? "End" : "Start") + " tag \u201C" + tagName
1944                                + "\u201D has a non-NCName name. Ignoring token.");
1945                        return;
1946                    }
1947                }
1948            }
1949            Attributes attrs = (attributes == null ? EmptyAttributes.EMPTY_ATTRIBUTES
1950                    : attributes);
1951            if (endTag) {
1952                /*
1953                 * When an end tag token is emitted, the content model flag must be
1954                 * switched to the PCDATA state.
1955                 */
1956                escapeFlag = false;
1957                contentModelFlag = ContentModelFlag.PCDATA;
1958                if (attrs.getLength() != 0) {
1959                    /*
1960                     * When an end tag token is emitted with attributes, that is a
1961                     * parse error.
1962                     */
1963                    err("End tag had attributes.");
1964                }
1965                tokenHandler.endTag(tagName, attrs);
1966            } else {
1967                tokenHandler.startTag(tagName, attrs);
1968            }
1969        }
1970    
1971        /**
1972         * Attribute name state
1973         * 
1974         * @throws IOException
1975         * @throws SAXException
1976         */
1977        private boolean attributeNameState() throws SAXException, IOException {
1978            for (;;) {
1979                /*
1980                 * Consume the next input character:
1981                 */
1982                char c = read();
1983                switch (c) {
1984                    case ' ':
1985                    case '\t':
1986                    case '\n':
1987                    case '\u000B':
1988                    case '\u000C':
1989                        /*
1990                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
1991                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch
1992                         * to the after attribute name state.
1993                         */
1994                        attributeNameComplete();
1995                        return afterAttributeNameState();
1996                    case '=':
1997                        /*
1998                         * U+003D EQUALS SIGN (=) Switch to the before attribute
1999                         * value state.
2000                         */
2001                        attributeNameComplete();
2002                        return beforeAttributeValueState();
2003                    case '>':
2004                        /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
2005                        attributeNameComplete();
2006                        addAttributeWithoutValue();
2007                        emitCurrentTagToken();
2008                        /*
2009                         * Switch to the data state.
2010                         */
2011                        return false;
2012                    case '/':
2013                        /*
2014                         * U+002F SOLIDUS (/) Parse error unless this is a permitted
2015                         * slash.
2016                         */
2017                        parseErrorUnlessPermittedSlash();
2018                        /* Switch to the before attribute name state. */
2019                        attributeNameComplete();
2020                        addAttributeWithoutValue();
2021                        return true;
2022                    case '\u0000':
2023                        /*
2024                         * EOF Parse error.
2025                         */
2026                        err("End of file occurred in an attribute name.");
2027                        /*
2028                         * Emit the current tag token.
2029                         */
2030                        attributeNameComplete();
2031                        addAttributeWithoutValue();
2032                        emitCurrentTagToken();
2033                        /* Reconsume the EOF character in the data state. */
2034                        unread(c);
2035                        return false;
2036                    default:
2037                        if (c >= 'A' && c <= 'Z') {
2038                            /*
2039                             * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN
2040                             * CAPITAL LETTER Z Append the lowercase version of the
2041                             * current input character (add 0x0020 to the
2042                             * character's code point) to the current attribute's
2043                             * name.
2044                             */
2045                            appendStrBuf((char) (c + 0x20));
2046                        } else {
2047                            /*
2048                             * Anything else Append the current input character to
2049                             * the current attribute's name.
2050                             */
2051                            appendStrBuf(c);
2052                        }
2053                }
2054                /*
2055                 * Stay in the attribute name state.
2056                 */
2057                continue;
2058            }
2059        }
2060    
2061        private void attributeNameComplete() throws SAXException {
2062            attributeName = strBufToString();
2063            if (attributes == null) {
2064                attributes = newAttributes();
2065            }
2066            /*
2067             * When the user agent leaves the attribute name state (and before
2068             * emitting the tag token, if appropriate), the complete attribute's
2069             * name must be compared to the other attributes on the same token; if
2070             * there is already an attribute on the token with the exact same name,
2071             * then this is a parse error and the new attribute must be dropped,
2072             * along with the value that gets associated with it (if any).
2073             */
2074            if (attributes.getIndex(attributeName) == -1) {
2075                if (namePolicy == XmlViolationPolicy.ALLOW) {
2076                    shouldAddAttributes = true;
2077                } else {
2078                    if (isNcname(attributeName)) {
2079                        shouldAddAttributes = true;
2080                    } else {
2081                        if (namePolicy == XmlViolationPolicy.FATAL) {
2082                            fatal("Attribute name \u201C" + attributeName
2083                                    + "\u201D is not an NCName.");
2084                        } else {
2085                            shouldAddAttributes = false;
2086                            warn("Attribute name \u201C"
2087                                    + attributeName
2088                                    + "\u201D is not an NCName. Ignoring the attribute.");
2089                        }
2090                    }
2091                }
2092            } else {
2093                shouldAddAttributes = false;
2094                err("Duplicate attribute \u201C" + attributeName + "\u201D.");
2095            }
2096        }
2097    
2098        private void addAttributeWithoutValue() throws SAXException {
2099            if (metaBoundaryPassed && "charset".equals(attributeName)
2100                    && "meta".equals(tagName)) {
2101                err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
2102            }
2103            if (shouldAddAttributes) {
2104                if (html4) {
2105                    if (AttributeInfo.isBoolean(attributeName)) {
2106                        if (html4ModeCompatibleWithXhtml1Schemata) {
2107                            attributes.addAttribute(attributeName, attributeName);
2108                        } else {
2109                            attributes.addAttribute(attributeName, "");
2110                        }
2111                    } else {
2112                        err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
2113                        attributes.addAttribute(attributeName, "");
2114                    }
2115                } else {
2116                    if ("src".equals(attributeName) || "href".equals(attributeName)) {
2117                        warn("Attribute \u201C"
2118                                + attributeName
2119                                + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
2120                    }
2121                    attributes.addAttribute(attributeName, "");
2122                }
2123            }
2124        }
2125    
2126        private void addAttributeWithValue() throws SAXException {
2127            if (metaBoundaryPassed && "meta" == tagName
2128                    && "charset".equals(attributeName)) {
2129                err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
2130            }
2131            if (shouldAddAttributes) {
2132                String value = longStrBufToString();
2133                if (!endTag) {
2134                    if ("xmlns".equals(attributeName)) {
2135                        if ("html" == tagName
2136                                && "http://www.w3.org/1999/xhtml".equals(value)) {
2137                            if (xmlnsPolicy == XmlViolationPolicy.ALTER_INFOSET) {
2138                                return;
2139                            }
2140                        } else {
2141                            if (bogusXmlnsPolicy == XmlViolationPolicy.FATAL) {
2142                                fatal("Forbidden attribute \u201C"
2143                                        + attributeName
2144                                        + "\u201D is not mappable to namespace-aware XML 1.0.");
2145                            } else {
2146                                warn("Forbidden attribute \u201C"
2147                                        + attributeName
2148                                        + "\u201D is not mappable to namespace-aware XML 1.0.");
2149                                if (bogusXmlnsPolicy == XmlViolationPolicy.ALTER_INFOSET) {
2150                                    return;
2151                                }
2152                            }
2153                        }
2154                    } else if (attributeName.startsWith("xmlns:")) {
2155                        if (bogusXmlnsPolicy == XmlViolationPolicy.FATAL) {
2156                            fatal("Forbidden attribute \u201C"
2157                                    + attributeName
2158                                    + "\u201D is not mappable to namespace-aware XML 1.0.");
2159                        } else {
2160                            warn("Forbidden attribute \u201C"
2161                                    + attributeName
2162                                    + "\u201D is not mappable to namespace-aware XML 1.0.");
2163                            if (bogusXmlnsPolicy == XmlViolationPolicy.ALTER_INFOSET) {
2164                                return;
2165                            }
2166                        }
2167                    } else if (html4 && html4ModeCompatibleWithXhtml1Schemata && AttributeInfo.isCaseFolded(attributeName)) {
2168                        value = toAsciiLowerCase(value);
2169                    }
2170                }
2171                attributes.addAttribute(attributeName, value);
2172            }
2173        }
2174        
2175        private String toAsciiLowerCase(String str) {
2176            if (str == null) {
2177                return null;
2178            }
2179            char[] b = new char[str.length()];
2180            for (int i = 0; i < str.length(); i++) {
2181                char c = str.charAt(i);
2182                if (c >= 'A' && c <= 'Z') {
2183                    c += 0x20;
2184                }
2185                b[i] = c;
2186            }
2187            return new String(b);
2188        }
2189    
2190        /**
2191         * After attribute name state
2192         * 
2193         * @throws IOException
2194         * @throws SAXException
2195         */
2196        private boolean afterAttributeNameState() throws SAXException, IOException {
2197            for (;;) {
2198                /*
2199                 * Consume the next input character:
2200                 */
2201                char c = read();
2202                switch (c) {
2203                    case ' ':
2204                    case '\t':
2205                    case '\n':
2206                    case '\u000B':
2207                    case '\u000C':
2208                        /*
2209                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
2210                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
2211                         * in the after attribute name state.
2212                         */
2213                        continue;
2214                    case '=':
2215                        /*
2216                         * U+003D EQUALS SIGN (=) Switch to the before attribute
2217                         * value state.
2218                         */
2219                        return beforeAttributeValueState();
2220                    case '>':
2221                        /*
2222                         * U+003E GREATER-THAN SIGN (>) Emit the current tag token.
2223                         */
2224                        addAttributeWithoutValue();
2225                        emitCurrentTagToken();
2226                        /*
2227                         * Switch to the data state.
2228                         */
2229                        return false;
2230                    case '/':
2231                        /*
2232                         * U+002F SOLIDUS (/) Parse error unless this is a permitted
2233                         * slash.
2234                         */
2235                        addAttributeWithoutValue();
2236                        parseErrorUnlessPermittedSlash();
2237                        /* Switch to the before attribute name state. */
2238                        return true;
2239                    case '\u0000':
2240                        /* EOF Parse error. */
2241                        err("Saw end of file without the previous tag ending with \u201C>\u201C.");
2242                        /*
2243                         * Emit the current tag token.
2244                         */
2245                        addAttributeWithoutValue();
2246                        emitCurrentTagToken();
2247                        /*
2248                         * Reconsume the character in the data state.
2249                         */
2250                        unread(c);
2251                        return false;
2252                    default:
2253                        /*
2254                         * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN
2255                         * CAPITAL LETTER Z Start a new attribute in the current tag
2256                         * token. Set that attribute's name to the lowercase version
2257                         * of the current input character (add 0x0020 to the
2258                         * character's code point), and its value to the empty
2259                         * string. Switch to the attribute name state.
2260                         * 
2261                         * Anything else Start a new attribute in the current tag
2262                         * token. Set that attribute's name to the current input
2263                         * character, and its value to the empty string. Switch to
2264                         * the attribute name state.
2265                         */
2266                        // let's do this by respinning through the attribute loop
2267                        addAttributeWithoutValue();
2268                        unread(c);
2269                        return true;
2270                }
2271            }
2272        }
2273    
2274        /**
2275         * Before attribute value state
2276         * 
2277         * @throws IOException
2278         * @throws SAXException
2279         */
2280        private boolean beforeAttributeValueState() throws SAXException,
2281                IOException {
2282            clearLongStrBuf();
2283            for (;;) {
2284                /*
2285                 * Consume the next input character:
2286                 */
2287                char c = read();
2288                switch (c) {
2289                    case ' ':
2290                    case '\t':
2291                    case '\n':
2292                    case '\u000B':
2293                    case '\u000C':
2294                        /*
2295                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
2296                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
2297                         * in the before attribute value state.
2298                         */
2299                        continue;
2300                    case '"':
2301                        /*
2302                         * U+0022 QUOTATION MARK (") Switch to the attribute value
2303                         * (double-quoted) state.
2304                         */
2305                        return attributeValueDoubleQuotedState();
2306                    case '&':
2307                        /*
2308                         * U+0026 AMPERSAND (&) Switch to the attribute value
2309                         * (unquoted) state and reconsume this input character.
2310                         */
2311                        unread(c);
2312                        return attributeValueUnquotedState();
2313                    case '\'':
2314                        /*
2315                         * U+0027 APOSTROPHE (') Switch to the attribute value
2316                         * (single-quoted) state.
2317                         */
2318                        return attributeValueSingleQuotedState();
2319                    case '>':
2320                        /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
2321                        addAttributeWithoutValue();
2322                        emitCurrentTagToken();
2323                        /*
2324                         * Switch to the data state.
2325                         */
2326                        return false;
2327                    case '\u0000':
2328                        /* EOF Parse error. */
2329                        err("Saw end of file without the previous tag ending with \u201C>\u201C.");
2330                        /*
2331                         * Emit the current tag token.
2332                         */
2333                        addAttributeWithoutValue();
2334                        emitCurrentTagToken();
2335                        /*
2336                         * Reconsume the character in the data state.
2337                         */
2338                        unread(c);
2339                        return false;
2340                    default:
2341                        if (html4
2342                                && !((c >= 'a' && c <= 'z')
2343                                        || (c >= 'A' && c <= 'Z')
2344                                        || (c >= '0' && c <= '9') || c == '.'
2345                                        || c == '-' || c == '_' || c == ':')) {
2346                            err("Non-name character in an unquoted attribute value. (This is an HTML4-only error.)");
2347                        }
2348                        /*
2349                         * Anything else Append the current input character to the
2350                         * current attribute's value.
2351                         */
2352                        appendLongStrBuf(c);
2353                        /*
2354                         * Switch to the attribute value (unquoted) state.
2355                         */
2356                        return attributeValueUnquotedState();
2357                }
2358            }
2359        }
2360    
2361        /**
2362         * Attribute value (double-quoted) state
2363         * 
2364         * @throws IOException
2365         * @throws SAXException
2366         */
2367        private boolean attributeValueDoubleQuotedState() throws SAXException,
2368                IOException {
2369            inContent = true;
2370            for (;;) {
2371                /*
2372                 * Consume the next input character:
2373                 */
2374                char c = read();
2375                switch (c) {
2376                    case '"':
2377                        /*
2378                         * U+0022 QUOTATION MARK (") Switch to the before attribute
2379                         * name state.
2380                         */
2381                        addAttributeWithValue();
2382                        inContent = false;
2383                        return true;
2384                    case '&':
2385                        /*
2386                         * U+0026 AMPERSAND (&) Switch to the entity in attribute
2387                         * value state.
2388                         */
2389                        entityInAttributeValueState();
2390                        continue;
2391                    case '\u0000':
2392                        /* EOF Parse error. */
2393                        err("End of file reached when inside a quoted attribute value.");
2394                        /* Emit the current tag token. */
2395                        addAttributeWithValue();
2396                        emitCurrentTagToken();
2397                        /*
2398                         * Reconsume the character in the data state.
2399                         */
2400                        unread(c);
2401                        inContent = false;
2402                        return false;
2403                    default:
2404                        /*
2405                         * Anything else Append the current input character to the
2406                         * current attribute's value.
2407                         */
2408                        appendLongStrBuf(c);
2409                        /*
2410                         * Stay in the attribute value (double-quoted) state.
2411                         */
2412                        continue;
2413                }
2414            }
2415        }
2416    
2417        /**
2418         * Attribute value (single-quoted) state
2419         * 
2420         * @throws SAXException
2421         * @throws IOException
2422         */
2423        private boolean attributeValueSingleQuotedState() throws SAXException,
2424                IOException {
2425            inContent = true;
2426            for (;;) {
2427                /*
2428                 * Consume the next input character:
2429                 */
2430                char c = read();
2431                switch (c) {
2432                    case '\'':
2433                        /*
2434                         * U+0027 APOSTROPHE (') Switch to the before attribute name
2435                         * state.
2436                         */
2437                        addAttributeWithValue();
2438                        inContent = false;
2439                        return true;
2440                    case '&':
2441                        /*
2442                         * U+0026 AMPERSAND (&) Switch to the entity in attribute
2443                         * value state.
2444                         */
2445                        entityInAttributeValueState();
2446                        continue;
2447                    case '\u0000':
2448                        /* EOF Parse error. */
2449                        err("End of file reached when inside a quoted attribute value.");
2450                        /* Emit the current tag token. */
2451                        addAttributeWithValue();
2452                        emitCurrentTagToken();
2453                        /*
2454                         * Reconsume the character in the data state.
2455                         */
2456                        unread(c);
2457                        inContent = false;
2458                        return false;
2459                    default:
2460                        /*
2461                         * Anything else Append the current input character to the
2462                         * current attribute's value.
2463                         */
2464                        appendLongStrBuf(c);
2465                        /*
2466                         * Stay in the attribute value (double-quoted) state.
2467                         */
2468                        continue;
2469                }
2470            }
2471        }
2472    
2473        /**
2474         * Attribute value (unquoted) state
2475         * 
2476         * @throws IOException
2477         * @throws SAXException
2478         */
2479        private boolean attributeValueUnquotedState() throws SAXException,
2480                IOException {
2481            inContent = true;
2482            for (;;) {
2483                /*
2484                 * Consume the next input character:
2485                 */
2486                char c = read();
2487                switch (c) {
2488                    case ' ':
2489                    case '\t':
2490                    case '\n':
2491                    case '\u000B':
2492                    case '\u000C':
2493                        /*
2494                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
2495                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch
2496                         * to the before attribute name state.
2497                         */
2498                        addAttributeWithValue();
2499                        inContent = false;
2500                        return true;
2501                    case '&':
2502                        /*
2503                         * U+0026 AMPERSAND (&) Switch to the entity in attribute
2504                         * value state.
2505                         */
2506                        entityInAttributeValueState();
2507                        continue;
2508                    case '>':
2509                        /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
2510                        addAttributeWithValue();
2511                        emitCurrentTagToken();
2512                        /*
2513                         * Switch to the data state.
2514                         */
2515                        inContent = false;
2516                        return false;
2517                    case '\u0000':
2518                        /* EOF Parse error. */
2519                        err("Saw end of file without the previous tag ending with \u201C>\u201C.");
2520                        /*
2521                         * Emit the current tag token.
2522                         */
2523                        addAttributeWithValue();
2524                        emitCurrentTagToken();
2525                        /*
2526                         * Reconsume the character in the data state.
2527                         */
2528                        unread(c);
2529                        inContent = false;
2530                        return false;
2531                    case '<':
2532                        warn("\u201C<\u201D in an unquoted attribute value. This does not end the tag.");
2533                        // fall through
2534                    default:
2535                        if (html4
2536                                && !((c >= 'a' && c <= 'z')
2537                                        || (c >= 'A' && c <= 'Z')
2538                                        || (c >= '0' && c <= '9') || c == '.'
2539                                        || c == '-' || c == '_' || c == ':')) {
2540                            err("Non-name character in an unquoted attribute value. (This is an HTML4-only error.)");
2541                        }
2542                        /*
2543                         * Anything else Append the current input character to the
2544                         * current attribute's value.
2545                         */
2546                        appendLongStrBuf(c);
2547                        /*
2548                         * Stay in the attribute value (unquoted) state.
2549                         */
2550                        continue;
2551                }
2552            }
2553        }
2554    
2555        /**
2556         * Entity in attribute value state
2557         * 
2558         * @throws IOException
2559         * @throws SAXException
2560         */
2561        private void entityInAttributeValueState() throws SAXException, IOException {
2562            /*
2563             * Attempt to consume an entity.
2564             */
2565            consumeEntity(true);
2566            /*
2567             * If nothing is returned, append a U+0026 AMPERSAND character to the
2568             * current attribute's value.
2569             * 
2570             * Otherwise, append the returned character token to the current
2571             * attribute's value.
2572             */
2573            // handled in consumeEntity();
2574            /*
2575             * Finally, switch back to the attribute value state that you were in
2576             * when were switched into this state.
2577             */
2578            return;
2579        }
2580    
2581        /**
2582         * Bogus comment state
2583         * 
2584         * @throws IOException
2585         * @throws SAXException
2586         */
2587        private void bogusCommentState() throws SAXException, IOException {
2588            /*
2589             * (This can only happen if the content model flag is set to the PCDATA
2590             * state.)
2591             * 
2592             * Consume every character up to the first U+003E GREATER-THAN SIGN
2593             * character (>) or the end of the file (EOF), whichever comes first.
2594             * Emit a comment token whose data is the concatenation of all the
2595             * characters starting from and including the character that caused the
2596             * state machine to switch into the bogus comment state, up to and
2597             * including the last consumed character before the U+003E character, if
2598             * any, or up to the end of the file otherwise. (If the comment was
2599             * started by the end of the file (EOF), the token is empty.)
2600             * 
2601             * Switch to the data state.
2602             * 
2603             * If the end of the file was reached, reconsume the EOF character.
2604             */
2605            for (;;) {
2606                char c = read();
2607                switch (c) {
2608                    case '>':
2609                        emitComment();
2610                        return;
2611                    case '\u0000':
2612                        emitComment();
2613                        unread(c);
2614                        return;
2615                    default:
2616                        appendToComment(c);
2617                }
2618            }
2619        }
2620    
2621        /**
2622         * Markup declaration open state
2623         * 
2624         * @throws IOException
2625         * @throws SAXException
2626         */
2627        private void markupDeclarationOpenState() throws SAXException, IOException {
2628            /*
2629             * (This can only happen if the content model flag is set to the PCDATA
2630             * state.)
2631             */
2632            clearLongStrBuf();
2633            /*
2634             * If the next two characters are both U+002D HYPHEN-MINUS (-)
2635             * characters, consume those two characters, create a comment token
2636             * whose data is the empty string, and switch to the comment start
2637             * state.
2638             * 
2639             * Otherwise if the next seven characters are a case-insensitive match
2640             * for the word "DOCTYPE", then consume those characters and switch to
2641             * the DOCTYPE state.
2642             * 
2643             * Otherwise, is is a parse error. Switch to the bogus comment state.
2644             * The next character that is consumed, if any, is the first character
2645             * that will be in the comment.
2646             */
2647            char c = read();
2648            switch (c) {
2649                case '-':
2650                    c = read();
2651                    if (c == '-') {
2652                        commentStates();
2653                        return;
2654                    } else {
2655                        err("Bogus comment.");
2656                        appendToComment('-');
2657                        unread(c);
2658                        bogusCommentState();
2659                        return;
2660                    }
2661                case 'd':
2662                case 'D':
2663                    appendToComment(c);
2664                    for (int i = 0; i < OCTYPE.length; i++) {
2665                        c = read();
2666                        char folded = c;
2667                        if (c >= 'A' && c <= 'Z') {
2668                            folded += 0x20;
2669                        }
2670                        if (folded == OCTYPE[i]) {
2671                            appendToComment(c);
2672                        } else {
2673                            err("Bogus comment.");
2674                            unread(c);
2675                            bogusCommentState();
2676                            return;
2677                        }
2678                    }
2679                    doctypeState();
2680                    return;
2681                default:
2682                    err("Bogus comment.");
2683                    unread(c);
2684                    bogusCommentState();
2685                    return;
2686            }
2687        }
2688    
2689        private enum CommentState {
2690            COMMENT_START_STATE, COMMENT_START_DASH_STATE, COMMENT_STATE, COMMENT_END_DASH_STATE, COMMENT_END_STATE
2691        }
2692    
2693        /**
2694         * Comment start state, Comment start dash state, Comment state, Comment end
2695         * dash state and Comment end state
2696         * 
2697         * @throws IOException
2698         * @throws SAXException
2699         */
2700        private void commentStates() throws SAXException, IOException {
2701            CommentState state = CommentState.COMMENT_START_STATE;
2702            for (;;) {
2703                char c = read();
2704                switch (state) {
2705                    case COMMENT_START_STATE:
2706                        /*
2707                         * Comment start state
2708                         * 
2709                         * 
2710                         * Consume the next input character:
2711                         */
2712                        switch (c) {
2713                            case '-':
2714                                /*
2715                                 * U+002D HYPHEN-MINUS (-) Switch to the comment
2716                                 * start dash state.
2717                                 */
2718                                state = CommentState.COMMENT_START_DASH_STATE;
2719                                continue;
2720                            case '>':
2721                                /*
2722                                 * U+003E GREATER-THAN SIGN (>) Parse error.
2723                                 */
2724                                err("Premature end of comment.");
2725                                /* Emit the comment token. */
2726                                emitComment();
2727                                /*
2728                                 * Switch to the data state.
2729                                 */
2730                                return;
2731                            case '\u0000':
2732                                /*
2733                                 * EOF Parse error.
2734                                 */
2735                                err("End of file inside comment.");
2736                                /* Emit the comment token. */
2737                                emitComment();
2738                                /*
2739                                 * Reconsume the EOF character in the data state.
2740                                 */
2741                                unread(c);
2742                                return;
2743                            default:
2744                                /*
2745                                 * Anything else Append the input character to the
2746                                 * comment token's data.
2747                                 */
2748                                appendToComment(c);
2749                                /*
2750                                 * Switch to the comment state.
2751                                 */
2752                                state = CommentState.COMMENT_STATE;
2753                                continue;
2754                        }
2755                    case COMMENT_START_DASH_STATE:
2756                        /*
2757                         * Comment start dash state
2758                         * 
2759                         * Consume the next input character:
2760                         */
2761                        switch (c) {
2762                            case '-':
2763                                /*
2764                                 * U+002D HYPHEN-MINUS (-) Switch to the comment end
2765                                 * state
2766                                 */
2767                                state = CommentState.COMMENT_END_STATE;
2768                                continue;
2769                            case '>':
2770                                /*
2771                                 * U+003E GREATER-THAN SIGN (>) Parse error.
2772                                 */
2773                                err("Premature end of comment.");
2774                                /* Emit the comment token. */
2775                                emitComment();
2776                                /*
2777                                 * Switch to the data state.
2778                                 */
2779                                return;
2780                            case '\u0000':
2781                                /*
2782                                 * EOF Parse error.
2783                                 */
2784                                err("End of file inside comment.");
2785                                /* Emit the comment token. */
2786                                emitComment();
2787                                /*
2788                                 * Reconsume the EOF character in the data state.
2789                                 */
2790                                unread(c);
2791                                return;
2792                            default:
2793                                /*
2794                                 * Anything else Append a U+002D HYPHEN-MINUS (-)
2795                                 * character and the input character to the comment
2796                                 * token's data.
2797                                 */
2798                                appendToComment('-');
2799                                appendToComment(c);
2800                                /*
2801                                 * Switch to the comment state.
2802                                 */
2803                                state = CommentState.COMMENT_STATE;
2804                                continue;
2805                        }
2806                    case COMMENT_STATE:
2807                        /*
2808                         * Comment state Consume the next input character:
2809                         */
2810                        switch (c) {
2811                            case '-':
2812                                /*
2813                                 * U+002D HYPHEN-MINUS (-) Switch to the comment end
2814                                 * dash state
2815                                 */
2816                                state = CommentState.COMMENT_END_DASH_STATE;
2817                                continue;
2818                            case '\u0000':
2819                                /*
2820                                 * EOF Parse error.
2821                                 */
2822                                err("End of file inside comment.");
2823                                /* Emit the comment token. */
2824                                emitComment();
2825                                /*
2826                                 * Reconsume the EOF character in the data state.
2827                                 */
2828                                unread(c);
2829                                return;
2830                            default:
2831                                /*
2832                                 * Anything else Append the input character to the
2833                                 * comment token's data.
2834                                 */
2835                                appendToComment(c);
2836                                /*
2837                                 * Stay in the comment state.
2838                                 */
2839                                continue;
2840                        }
2841                    case COMMENT_END_DASH_STATE:
2842                        /*
2843                         * Comment end dash state Consume the next input character:
2844                         */
2845                        switch (c) {
2846                            case '-':
2847                                /*
2848                                 * U+002D HYPHEN-MINUS (-) Switch to the comment end
2849                                 * state
2850                                 */
2851                                state = CommentState.COMMENT_END_STATE;
2852                                continue;
2853                            case '\u0000':
2854                                /*
2855                                 * EOF Parse error.
2856                                 */
2857                                err("End of file inside comment.");
2858                                /* Emit the comment token. */
2859                                emitComment();
2860                                /*
2861                                 * Reconsume the EOF character in the data state.
2862                                 */
2863                                unread(c);
2864                                return;
2865                            default:
2866                                /*
2867                                 * Anything else Append a U+002D HYPHEN-MINUS (-)
2868                                 * character and the input character to the comment
2869                                 * token's data.
2870                                 */
2871                                appendToComment('-');
2872                                appendToComment(c);
2873                                /*
2874                                 * Switch to the comment state.
2875                                 */
2876                                state = CommentState.COMMENT_STATE;
2877                                continue;
2878                        }
2879                    case COMMENT_END_STATE:
2880                        /*
2881                         * Comment end state Consume the next input character:
2882                         */
2883                        switch (c) {
2884                            case '>':
2885                                /*
2886                                 * U+003E GREATER-THAN SIGN (>) Emit the comment
2887                                 * token.
2888                                 */
2889                                emitComment();
2890                                /*
2891                                 * Switch to the data state.
2892                                 */
2893                                return;
2894                            case '-':
2895                                /* U+002D HYPHEN-MINUS (-) Parse error. */
2896                                err("Consecutive hyphens did not terminate a comment.");
2897                                /*
2898                                 * Append a U+002D HYPHEN-MINUS (-) character to the
2899                                 * comment token's data.
2900                                 */
2901                                appendToComment('-');
2902                                /*
2903                                 * Stay in the comment end state.
2904                                 */
2905                                continue;
2906                            case '\u0000':
2907                                /*
2908                                 * EOF Parse error.
2909                                 */
2910                                err("End of file inside comment.");
2911                                /* Emit the comment token. */
2912                                emitComment();
2913                                /*
2914                                 * Reconsume the EOF character in the data state.
2915                                 */
2916                                unread(c);
2917                                return;
2918                            default:
2919                                /*
2920                                 * Anything else Parse error.
2921                                 */
2922                                err("Consecutive hyphens did not terminate a comment.");
2923                                /*
2924                                 * Append two U+002D HYPHEN-MINUS (-) characters and
2925                                 * the input character to the comment token's data.
2926                                 */
2927                                appendToComment('-');
2928                                appendToComment('-');
2929                                appendToComment(c);
2930                                /*
2931                                 * Switch to the comment state.
2932                                 */
2933                                state = CommentState.COMMENT_STATE;
2934                                continue;
2935                        }
2936                }
2937            }
2938        }
2939    
2940        /**
2941         * DOCTYPE state
2942         * 
             * Begins a DOCTYPE: sets the recorded DOCTYPE name, public identifier
             * and system identifier fields to null, consumes one character and
             * continues in {@link #beforeDoctypeNameState()}. If that character is
             * not whitespace, an error is reported and the character is pushed back
             * first so that the next state reads it again.
             * 
2943         * @throws IOException
             *             propagated from the calls made here (presumably raised
             *             while reading input; confirm against read())
2944         * @throws SAXException
             *             propagated from the calls made here (presumably raised by
             *             error reporting or a later state; confirm)
2945         */
2946        private void doctypeState() throws SAXException, IOException {
                // Start with no name and no identifiers recorded.
2947            systemIdentifier = null;
2948            publicIdentifier = null;
2949            doctypeName = null;
2950            /*
2951             * Consume the next input character:
2952             */
2953            char c = read();
2954            switch (c) {
2955                case ' ':
2956                case '\t':
2957                case '\n':
2958                case '\u000B':
2959                case '\u000C':
2960                    /*
2961                     * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B LINE
2962                     * TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch to the
2963                     * before DOCTYPE name state.
2964                     */
                        // The state switch is a plain method call followed by return.
2965                    beforeDoctypeNameState();
2966                    return;
2967                default:
2968                    /*
2969                     * Anything else Parse error.
2970                     */
2971                    err("Missing space before doctype name.");
2972                    /*
2973                     * Reconsume the current character in the before DOCTYPE name
2974                     * state.
2975                     */
                        // Push c back so that the state called next reads it again.
2976                    unread(c);
2977                    beforeDoctypeNameState();
2978                    return;
2979            }
2980        }
2981    
2982        /**
2983         * Before DOCTYPE name state
2984         * 
             * Skips whitespace in front of the DOCTYPE name. On '&gt;' or end of
             * file a DOCTYPE with an empty name, null identifiers and the flag
             * false is passed to the token handler. Any other character becomes
             * the first character of the name buffer and processing continues in
             * {@link #doctypeNameState()}.
             * 
2985         * @throws IOException
             *             propagated from the calls made here (presumably raised
             *             while reading input; confirm against read())
2986         * @throws SAXException
             *             propagated from the calls made here (presumably raised by
             *             error reporting or the token handler; confirm)
2987         */
2988        private void beforeDoctypeNameState() throws SAXException, IOException {
2989            for (;;) {
2990                /*
2991                 * Consume the next input character:
2992                 */
2993                char c = read();
2994                switch (c) {
2995                    case ' ':
2996                    case '\t':
2997                    case '\n':
2998                    case '\u000B':
2999                    case '\u000C':
3000                        /*
3001                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3002                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
3003                         * in the before DOCTYPE name state.
3004                         */
3005                        continue;
3006                    case '>':
3007                        /*
3008                         * U+003E GREATER-THAN SIGN (>) Parse error.
3009                         */
3010                        err("Nameless doctype.");
3011                        /*
3012                         * Create a new DOCTYPE token. Set its correctness flag to
3013                         * incorrect. Emit the token.
3014                         */
                            // Empty name, null identifiers, flag false ("incorrect").
3015                        tokenHandler.doctype("", null, null, false);
3016                        /*
3017                         * Switch to the data state.
3018                         */
                            // Returning ends DOCTYPE handling here; resuming the data
                            // state is presumably left to the caller (confirm).
3019                        return;
3020                    case '\u0000':
                            // A U+0000 result from read() is handled as end of file.
3021                        /* EOF Parse error. */
3022                        err("End of file inside doctype.");
3023                        /*
3024                         * Create a new DOCTYPE token. Set its correctness flag to
3025                         * incorrect. Emit the token.
3026                         */
3027                        tokenHandler.doctype("", null, null, false);
3028                        /*
3029                         * Reconsume the EOF character in the data state.
3030                         */
3031                        unread(c);
3032                        return;
3033                    default:
3034                        /* Anything else Create a new DOCTYPE token. */
                            // The name is accumulated in the string buffer,
                            // starting with c.
3035                        clearStrBuf();
3036                        /*
3037                         * Set the token's name name to the current input character.
3038                         */
3039                        appendStrBuf(c);
3040                        /*
3041                         * Switch to the DOCTYPE name state.
3042                         */
3043                        doctypeNameState();
3044                        return;
3045                }
3046            }
3047        }
3048    
3049        /**
3050         * DOCTYPE name state
3051         * 
             * Appends characters to the name buffer until whitespace, '&gt;' or end
             * of file is read. Whitespace stores the buffer contents in
             * doctypeName and continues in {@link #afterDoctypeNameState()}.
             * '&gt;' and end of file pass the buffer contents straight to the token
             * handler with null identifiers (flag true for '&gt;', false for end of
             * file).
             * 
3052         * @throws IOException
             *             propagated from the calls made here (presumably raised
             *             while reading input; confirm against read())
3053         * @throws SAXException
             *             propagated from the calls made here (presumably raised by
             *             error reporting or the token handler; confirm)
3054         */
3055        private void doctypeNameState() throws SAXException, IOException {
3056            for (;;) {
3057                /*
3058                 * First, consume the next input character:
3059                 */
3060                char c = read();
3061                switch (c) {
3062                    case ' ':
3063                    case '\t':
3064                    case '\n':
3065                    case '\u000B':
3066                    case '\u000C':
3067                        /*
3068                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3069                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch
3070                         * to the after DOCTYPE name state.
3071                         */
                            // Keep the finished name in a field; the states that
                            // follow pass doctypeName to the token handler.
3072                        doctypeName = strBufToString();
3073                        afterDoctypeNameState();
3074                        return;
3075                    case '>':
3076                        /*
3077                         * U+003E GREATER-THAN SIGN (>) Emit the current DOCTYPE
3078                         * token.
3079                         */
                            // The name is taken directly from the buffer; the
                            // doctypeName field is not assigned on this path.
3080                        tokenHandler.doctype(strBufToString(), null, null, true);
3081                        /*
3082                         * Switch to the data state.
3083                         */
3084                        return;
3085                    case '\u0000':
                            // A U+0000 result from read() is handled as end of file.
3086                        /* EOF Parse error. */
3087                        err("End of file inside doctype.");
3088                        /*
3089                         * Set the DOCTYPE token's correctness flag to incorrect.
3090                         * Emit that DOCTYPE token.
3091                         */
3092                        tokenHandler.doctype(strBufToString(), null, null, false);
3093                        /*
3094                         * Reconsume the EOF character in the data state.
3095                         */
3096                        unread(c);
3097                        return;
3098                    default:
3099                        /*
3100                         * Anything else Append the current input character to the
3101                         * current DOCTYPE token's name.
3102                         */
3103                        appendStrBuf(c);
3104                        /*
3105                         * Stay in the DOCTYPE name state.
3106                         */
3107                        continue;
3108                }
3109            }
3110        }
3111    
3112        /**
3113         * After DOCTYPE name state
3114         * 
             * Skips whitespace after the DOCTYPE name. '&gt;' passes the stored
             * name to the token handler with null identifiers and the flag true;
             * end of file does the same with the flag false. A 'p'/'P' or 's'/'S'
             * starts a case-insensitive match of the remaining keyword letters
             * against UBLIC or YSTEM; a full match continues in
             * {@link #beforeDoctypePublicIdentifierState()} or
             * beforeDoctypeSystemIdentifierState() respectively. Every other
             * outcome reports "Bogus doctype." and continues in
             * bogusDoctypeState().
             * 
3115         * @throws IOException
             *             propagated from the calls made here (presumably raised
             *             while reading input; confirm against read())
3116         * @throws SAXException
             *             propagated from the calls made here (presumably raised by
             *             error reporting or the token handler; confirm)
3117         */
3118        private void afterDoctypeNameState() throws SAXException, IOException {
3119            for (;;) {
3120                /*
3121                 * Consume the next input character:
3122                 */
3123                char c = read();
3124                switch (c) {
3125                    case ' ':
3126                    case '\t':
3127                    case '\n':
3128                    case '\u000B':
3129                    case '\u000C':
3130                        /*
3131                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3132                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
3133                         * in the after DOCTYPE name state.
3134                         */
3135                        continue;
3136                    case '>':
3137                        /*
3138                         * U+003E GREATER-THAN SIGN (>) Emit the current DOCTYPE
3139                         * token.
3140                         */
3141                        tokenHandler.doctype(doctypeName, null, null, true);
3142                        /*
3143                         * Switch to the data state.
3144                         */
3145                        return;
3146                    case '\u0000':
                            // A U+0000 result from read() is handled as end of file.
3147                        /* EOF Parse error. */
3148                        err("End of file inside doctype.");
3149                        /*
3150                         * Set the DOCTYPE token's correctness flag to incorrect.
3151                         * Emit that DOCTYPE token.
3152                         */
3153                        tokenHandler.doctype(doctypeName, null, null, false);
3154                        /*
3155                         * Reconsume the EOF character in the data state.
3156                         */
3157                        unread(c);
3158                        return;
3159                    case 'p':
3160                    case 'P':
3161                        /*
3162                         * If the next six characters are a case-insensitive match
3163                         * for the word "PUBLIC", then consume those characters and
3164                         * switch to the before DOCTYPE public identifier state.
3165                         */
                            // The switch has already consumed the leading 'p'/'P', so
                            // only the remaining letters are matched. UBLIC is declared
                            // elsewhere; presumably it holds lower-case "ublic"
                            // (confirm), since input is folded to lower case first.
3166                        for (int i = 0; i < UBLIC.length; i++) {
3167                            c = read();
3168                            char folded = c;
3169                            if (c >= 'A' && c <= 'Z') {
                                    // ASCII upper case to lower case.
3170                                folded += 0x20;
3171                            }
3172                            if (folded != UBLIC[i]) {
3173                                err("Bogus doctype.");
                                    // Only the mismatching character is pushed back;
                                    // the letters matched so far stay consumed.
3174                                unread(c);
3175                                bogusDoctypeState();
3176                                return;
3177                            }
3178                        }
3179                        beforeDoctypePublicIdentifierState();
3180                        return;
3181                    case 's':
3182                    case 'S':
3183                        /*
3184                         * Otherwise, if the next six characters are a
3185                         * case-insensitive match for the word "SYSTEM", then
3186                         * consume those characters and switch to the before DOCTYPE
3187                         * system identifier state.
3188                         */
                            // Same scheme as for PUBLIC: the leading 's'/'S' is already
                            // consumed and YSTEM (declared elsewhere; presumably
                            // lower-case "ystem", confirm) supplies the rest.
3189                        for (int i = 0; i < YSTEM.length; i++) {
3190                            c = read();
3191                            char folded = c;
3192                            if (c >= 'A' && c <= 'Z') {
                                    // ASCII upper case to lower case.
3193                                folded += 0x20;
3194                            }
3195                            if (folded != YSTEM[i]) {
3196                                err("Bogus doctype.");
                                    // Only the mismatching character is pushed back;
                                    // the letters matched so far stay consumed.
3197                                unread(c);
3198                                bogusDoctypeState();
3199                                return;
3200                            }
3201                        }
3202                        beforeDoctypeSystemIdentifierState();
3203                        return;
3204                    default:
3205                        /*
3206                         * Otherwise, this is the parse error.
3207                         */
3208                        err("Bogus doctype.");
3209                        /*
3210                         * Switch to the bogus DOCTYPE state.
3211                         */
                            // Unlike the keyword mismatch paths above, c is not
                            // pushed back here.
3212                        bogusDoctypeState();
3213                        return;
3214                }
3215            }
3216        }
3217    
3218        /**
3219         * Before DOCTYPE public identifier state
3220         * 
             * Skips whitespace in front of the public identifier. A double or
             * single quote clears the long buffer and continues in the matching
             * quoted public identifier state. '&gt;' and end of file report an
             * error and pass the stored name to the token handler with null
             * identifiers and the flag false. Any other character reports
             * "Bogus doctype." and continues in bogusDoctypeState().
             * 
3221         * @throws IOException
             *             propagated from the calls made here (presumably raised
             *             while reading input; confirm against read())
3222         * @throws SAXException
             *             propagated from the calls made here (presumably raised by
             *             error reporting or the token handler; confirm)
3223         */
3224        private void beforeDoctypePublicIdentifierState() throws SAXException,
3225                IOException {
3226            for (;;) {
3227                /*
3228                 * Consume the next input character:
3229                 */
3230                char c = read();
3231                switch (c) {
3232                    case ' ':
3233                    case '\t':
3234                    case '\n':
3235                    case '\u000B':
3236                    case '\u000C':
3237                        /*
3238                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3239                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
3240                         * in the before DOCTYPE public identifier state.
3241                         */
3242                        continue;
3243                    case '"':
3244                        /*
3245                         * U+0022 QUOTATION MARK (") Set the DOCTYPE token's public
3246                         * identifier to the empty string,
3247                         */
                            // The public identifier is collected in the long buffer
                            // by the quoted state called below.
3248                        clearLongStrBuf();
3249                        /*
3250                         * then switch to the DOCTYPE public identifier
3251                         * (double-quoted) state.
3252                         */
3253                        doctypePublicIdentifierDoubleQuotedState();
3254                        return;
3255                    case '\'':
3256                        /*
3257                         * U+0027 APOSTROPHE (') Set the DOCTYPE token's public
3258                         * identifier to the empty string,
3259                         */
3260                        clearLongStrBuf();
3261                        /*
3262                         * then switch to the DOCTYPE public identifier
3263                         * (single-quoted) state.
3264                         */
3265                        doctypePublicIdentifierSingleQuotedState();
3266                        return;
3267                    case '>':
3268                        /* U+003E GREATER-THAN SIGN (>) Parse error. */
3269                        err("Expected a public identifier but the doctype ended.");
3270                        /*
3271                         * Set the DOCTYPE token's correctness flag to incorrect.
3272                         * Emit that DOCTYPE token.
3273                         */
3274                        tokenHandler.doctype(doctypeName, null, null, false);
3275                        /*
3276                         * Switch to the data state.
3277                         */
3278                        return;
3279                    case '\u0000':
                            // A U+0000 result from read() is handled as end of file.
3280                        /* EOF Parse error. */
                            // NOTE(review): this wording differs from the
                            // "End of file inside doctype." message used by the
                            // neighbouring states; confirm whether that is intended.
3281                        err("End of file inside a doctype.");
3282                        /*
3283                         * Set the DOCTYPE token's correctness flag to incorrect.
3284                         * Emit that DOCTYPE token.
3285                         */
3286                        tokenHandler.doctype(doctypeName, null, null, false);
3287                        /*
3288                         * Reconsume the EOF character in the data state.
3289                         */
3290                        unread(c);
3291                        return;
3292                    default:
3293                        /* Anything else Parse error. */
3294                        err("Bogus doctype.");
3295                        /*
3296                         * Switch to the bogus DOCTYPE state.
3297                         */
                            // c is not pushed back before the bogus state runs.
3298                        bogusDoctypeState();
3299                        return;
3300                }
3301            }
3302        }
3303    
3304        /**
3305         * DOCTYPE public identifier (double-quoted) state
3306         * 
             * Appends every character to the long buffer until a closing double
             * quote or end of file is read; no other character, '&gt;' included, is
             * treated specially. The closing quote stores the buffer contents in
             * publicIdentifier and continues in
             * {@link #afterDoctypePublicIdentifierState()}. End of file reports an
             * error and passes the name and the partial identifier to the token
             * handler with the flag false.
             * 
3307         * @throws IOException
             *             propagated from the calls made here (presumably raised
             *             while reading input; confirm against read())
3308         * @throws SAXException
             *             propagated from the calls made here (presumably raised by
             *             error reporting or the token handler; confirm)
3309         */
3310        private void doctypePublicIdentifierDoubleQuotedState()
3311                throws SAXException, IOException {
3312            for (;;) {
3313                /*
3314                 * Consume the next input character:
3315                 */
3316                char c = read();
3317                switch (c) {
3318                    case '"':
3319                        /*
3320                         * U+0022 QUOTATION MARK (") Switch to the after DOCTYPE
3321                         * public identifier state.
3322                         */
                            // Capture the identifier in a field before moving on.
3323                        publicIdentifier = longStrBufToString();
3324                        afterDoctypePublicIdentifierState();
3325                        return;
3326                    case '\u0000':
                            // A U+0000 result from read() is handled as end of file.
3327                        /* EOF Parse error. */
3328                        err("End of file inside public identifier.");
3329                        /*
3330                         * Set the DOCTYPE token's correctness flag to incorrect.
3331                         * Emit that DOCTYPE token.
3332                         */
                            // The unterminated identifier is taken directly from the
                            // buffer; the publicIdentifier field is not assigned.
3333                        tokenHandler.doctype(doctypeName, longStrBufToString(),
3334                                null, false);
3335                        /*
3336                         * Reconsume the EOF character in the data state.
3337                         */
3338                        unread(c);
3339                        return;
3340                    default:
3341                        /*
3342                         * Anything else Append the current input character to the
3343                         * current DOCTYPE token's public identifier.
3344                         */
3345                        appendLongStrBuf(c);
3346                        /*
3347                         * Stay in the DOCTYPE public identifier (double-quoted)
3348                         * state.
3349                         */
3350                        continue;
3351                }
3352            }
3353        }
3354    
3355        /**
3356         * DOCTYPE public identifier (single-quoted) state
3357         * 
             * Appends every character to the long buffer until a closing
             * apostrophe or end of file is read; no other character, '&gt;'
             * included, is treated specially. The closing apostrophe stores the
             * buffer contents in publicIdentifier and continues in
             * {@link #afterDoctypePublicIdentifierState()}. End of file reports an
             * error and passes the name and the partial identifier to the token
             * handler with the flag false.
             * 
3358         * @throws IOException
             *             propagated from the calls made here (presumably raised
             *             while reading input; confirm against read())
3359         * @throws SAXException
             *             propagated from the calls made here (presumably raised by
             *             error reporting or the token handler; confirm)
3360         */
3361        private void doctypePublicIdentifierSingleQuotedState()
3362                throws SAXException, IOException {
3363            for (;;) {
3364                /*
3365                 * Consume the next input character:
3366                 */
3367                char c = read();
3368                switch (c) {
3369                    case '\'':
3370                        /*
3371                         * U+0027 APOSTROPHE (') Switch to the after DOCTYPE public
3372                         * identifier state.
3373                         */
                            // Capture the identifier in a field before moving on.
3374                        publicIdentifier = longStrBufToString();
3375                        afterDoctypePublicIdentifierState();
3376                        return;
3377                    case '\u0000':
                            // A U+0000 result from read() is handled as end of file.
3378                        /* EOF Parse error. */
3379                        err("End of file inside public identifier.");
3380                        /*
3381                         * Set the DOCTYPE token's correctness flag to incorrect.
3382                         * Emit that DOCTYPE token.
3383                         */
                            // The unterminated identifier is taken directly from the
                            // buffer; the publicIdentifier field is not assigned.
3384                        tokenHandler.doctype(doctypeName, longStrBufToString(),
3385                                null, false);
3386                        /*
3387                         * Reconsume the EOF character in the data state.
3388                         */
3389                        unread(c);
3390                        return;
3391                    default:
3392                        /*
3393                         * Anything else Append the current input character to the
3394                         * current DOCTYPE token's public identifier.
3395                         */
3396                        appendLongStrBuf(c);
3397                        /*
3398                         * Stay in the DOCTYPE public identifier (single-quoted)
3399                         * state.
3400                         */
3401                        continue;
3402                }
3403            }
3404        }
3405    
3406        /**
3407         * After DOCTYPE public identifier state
3408         * 
             * Skips whitespace after the public identifier. A double or single
             * quote clears the long buffer and continues in the matching quoted
             * system identifier state. '&gt;' passes the stored name and public
             * identifier to the token handler with a null third argument and the
             * flag true; end of file does the same with the flag false after
             * reporting an error. Any other character reports "Bogus doctype."
             * and continues in bogusDoctypeState().
             * 
3409         * @throws IOException
             *             propagated from the calls made here (presumably raised
             *             while reading input; confirm against read())
3410         * @throws SAXException
             *             propagated from the calls made here (presumably raised by
             *             error reporting or the token handler; confirm)
3411         * 
3412         */
3413        private void afterDoctypePublicIdentifierState() throws SAXException,
3414                IOException {
3415            for (;;) {
3416                /*
3417                 * Consume the next input character:
3418                 */
3419                char c = read();
3420                switch (c) {
3421                    case ' ':
3422                    case '\t':
3423                    case '\n':
3424                    case '\u000B':
3425                    case '\u000C':
3426                        /*
3427                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3428                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
3429                         * in the after DOCTYPE public identifier state.
3430                         */
3431                        continue;
3432                    case '"':
3433                        /*
3434                         * U+0022 QUOTATION MARK (") Set the DOCTYPE token's system
3435                         * identifier to the empty string,
3436                         */
                            // When entered from the quoted public identifier states,
                            // publicIdentifier already holds the public identifier, so
                            // the long buffer can be cleared (presumably it is reused
                            // for the system identifier; confirm).
3437                        clearLongStrBuf();
3438                        /*
3439                         * then switch to the DOCTYPE system identifier
3440                         * (double-quoted) state.
3441                         */
3442                        doctypeSystemIdentifierDoubleQuotedState();
3443                        return;
3444                    case '\'':
3445                        /*
3446                         * U+0027 APOSTROPHE (') Set the DOCTYPE token's system
3447                         * identifier to the empty string,
3448                         */
3449                        clearLongStrBuf();
3450                        /*
3451                         * then switch to the DOCTYPE system identifier
3452                         * (single-quoted) state.
3453                         */
3454                        doctypeSystemIdentifierSingleQuotedState();
3455                        return;
3456                    case '>':
3457                        /*
3458                         * U+003E GREATER-THAN SIGN (>) Emit the current DOCTYPE
3459                         * token.
3460                         */
3461                        tokenHandler.doctype(doctypeName, publicIdentifier, null,
3462                                true);
3463                        /*
3464                         * Switch to the data state.
3465                         */
3466                        return;
3467                    case '\u0000':
                            // A U+0000 result from read() is handled as end of file.
3468                        /* EOF Parse error. */
3469                        err("End of file inside doctype.");
3470                        /*
3471                         * Set the DOCTYPE token's correctness flag to incorrect.
3472                         * Emit that DOCTYPE token.
3473                         */
3474                        tokenHandler.doctype(doctypeName, publicIdentifier, null,
3475                                false);
3476                        /*
3477                         * Reconsume the EOF character in the data state.
3478                         */
3479                        unread(c);
3480                        return;
3481                    default:
3482                        /* Anything else Parse error. */
3483                        err("Bogus doctype.");
3484                        /*
3485                         * Switch to the bogus DOCTYPE state.
3486                         */
                            // c is not pushed back before the bogus state runs.
3487                        bogusDoctypeState();
3488                        return;
3489                }
3490            }
3491        }
3492    
3493        /**
3494         * Before DOCTYPE system identifier state
3495         * 
3496         * @throws IOException
3497         * @throws SAXException
3498         */
3499        private void beforeDoctypeSystemIdentifierState() throws SAXException,
3500                IOException {
3501            for (;;) {
3502                /*
3503                 * Consume the next input character:
3504                 */
3505                char c = read();
3506                switch (c) {
3507                    case ' ':
3508                    case '\t':
3509                    case '\n':
3510                    case '\u000B':
3511                    case '\u000C':
3512                        /*
3513                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3514                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
3515                         * in the before DOCTYPE system identifier state.
3516                         */
3517                        continue;
3518                    case '"':
3519                        /*
3520                         * U+0022 QUOTATION MARK (") Set the DOCTYPE token's system
3521                         * identifier to the empty string,
3522                         */
3523                        clearLongStrBuf();
3524                        /*
3525                         * then switch to the DOCTYPE system identifier
3526                         * (double-quoted) state.
3527                         */
3528                        doctypeSystemIdentifierDoubleQuotedState();
3529                        return;
3530                    case '\'':
3531                        /*
3532                         * U+0027 APOSTROPHE (') Set the DOCTYPE token's system
3533                         * identifier to the empty string,
3534                         */
3535                        clearLongStrBuf();
3536                        /*
3537                         * then switch to the DOCTYPE system identifier
3538                         * (single-quoted) state.
3539                         */
3540                        doctypeSystemIdentifierSingleQuotedState();
3541                        return;
3542                    case '>':
3543                        /* U+003E GREATER-THAN SIGN (>) Parse error. */
3544                        err("Expected a system identifier but the doctype ended.");
3545                        /*
3546                         * Set the DOCTYPE token's correctness flag to incorrect.
3547                         * Emit that DOCTYPE token.
3548                         */
3549                        tokenHandler.doctype(doctypeName, null, null, false);
3550                        /*
3551                         * Switch to the data state.
3552                         */
3553                        return;
3554                    case '\u0000':
3555                        /* EOF Parse error. */
3556                        err("End of file inside a doctype.");
3557                        /*
3558                         * Set the DOCTYPE token's correctness flag to incorrect.
3559                         * Emit that DOCTYPE token.
3560                         */
3561                        tokenHandler.doctype(doctypeName, null, null, false);
3562                        /*
3563                         * Reconsume the EOF character in the data state.
3564                         */
3565                        unread(c);
3566                        return;
3567                    default:
3568                        /* Anything else Parse error. */
3569                        err("Bogus doctype.");
3570                        /*
3571                         * Switch to the bogus DOCTYPE state.
3572                         */
3573                        bogusDoctypeState();
3574                        return;
3575                }
3576            }
3577        }
3578    
3579        /**
3580         * DOCTYPE system identifier (double-quoted) state
3581         * 
3582         * @throws IOException
3583         * @throws SAXException
3584         */
3585        private void doctypeSystemIdentifierDoubleQuotedState()
3586                throws SAXException, IOException {
3587            for (;;) {
3588                /*
3589                 * Consume the next input character:
3590                 */
3591                char c = read();
3592                switch (c) {
3593                    case '"':
3594                        /*
3595                         * U+0022 QUOTATION MARK (") Switch to the after DOCTYPE
3596                         * system identifier state.
3597                         */
3598                        systemIdentifier = longStrBufToString();
3599                        afterDoctypeSystemIdentifierState();
3600                        return;
3601                    case '\u0000':
3602                        /* EOF Parse error. */
3603                        err("End of file inside system identifier.");
3604                        /*
3605                         * Set the DOCTYPE token's correctness flag to incorrect.
3606                         * Emit that DOCTYPE token.
3607                         */
3608                        tokenHandler.doctype(doctypeName, publicIdentifier,
3609                                longStrBufToString(), false);
3610                        /*
3611                         * Reconsume the EOF character in the data state.
3612                         */
3613                        unread(c);
3614                        return;
3615                    default:
3616                        /*
3617                         * Anything else Append the current input character to the
3618                         * current DOCTYPE token's system identifier.
3619                         */
3620                        appendLongStrBuf(c);
3621                        /*
3622                         * Stay in the DOCTYPE system identifier (double-quoted)
3623                         * state.
3624                         */
3625                        continue;
3626                }
3627            }
3628        }
3629    
3630        /**
3631         * DOCTYPE system identifier (single-quoted) state
3632         * 
3633         * @throws IOException
3634         * @throws SAXException
3635         */
3636        private void doctypeSystemIdentifierSingleQuotedState()
3637                throws SAXException, IOException {
3638            for (;;) {
3639                /*
3640                 * Consume the next input character:
3641                 */
3642                char c = read();
3643                switch (c) {
3644                    case '\'':
3645                        /*
3646                         * U+0027 APOSTROPHE (') Switch to the after DOCTYPE system
3647                         * identifier state.
3648                         */
3649                        systemIdentifier = longStrBufToString();
3650                        afterDoctypeSystemIdentifierState();
3651                        return;
3652                    case '\u0000':
3653                        /* EOF Parse error. */
3654                        err("End of file inside system identifier.");
3655                        /*
3656                         * Set the DOCTYPE token's correctness flag to incorrect.
3657                         * Emit that DOCTYPE token.
3658                         */
3659                        tokenHandler.doctype(doctypeName, publicIdentifier,
3660                                longStrBufToString(), false);
3661                        /*
3662                         * Reconsume the EOF character in the data state.
3663                         */
3664                        unread(c);
3665                        return;
3666                    default:
3667                        /*
3668                         * Anything else Append the current input character to the
3669                         * current DOCTYPE token's system identifier.
3670                         */
3671                        appendLongStrBuf(c);
3672                        /*
3673                         * Stay in the DOCTYPE system identifier (double-quoted)
3674                         * state.
3675                         */
3676                        continue;
3677                }
3678            }
3679        }
3680    
3681        /**
3682         * After DOCTYPE system identifier state
3683         * 
3684         * @throws IOException
3685         * @throws SAXException
3686         */
3687        private void afterDoctypeSystemIdentifierState() throws SAXException,
3688                IOException {
3689            for (;;) {
3690                /*
3691                 * Consume the next input character:
3692                 */
3693                char c = read();
3694                switch (c) {
3695                    case ' ':
3696                    case '\t':
3697                    case '\n':
3698                    case '\u000B':
3699                    case '\u000C':
3700                        /*
3701                         * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3702                         * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
3703                         * in the after DOCTYPE system identifier state.
3704                         */
3705                        continue;
3706                    case '>':
3707                        /*
3708                         * U+003E GREATER-THAN SIGN (>) Emit the current DOCTYPE
3709                         * token.
3710                         */
3711                        tokenHandler.doctype(doctypeName, publicIdentifier,
3712                                systemIdentifier, true);
3713                        /*
3714                         * Switch to the data state.
3715                         */
3716                        return;
3717                    case '\u0000':
3718                        /* EOF Parse error. */
3719                        err("End of file inside doctype.");
3720                        /*
3721                         * Set the DOCTYPE token's correctness flag to incorrect.
3722                         * Emit that DOCTYPE token.
3723                         */
3724                        tokenHandler.doctype(doctypeName, publicIdentifier,
3725                                systemIdentifier, false);
3726                        /*
3727                         * Reconsume the EOF character in the data state.
3728                         */
3729                        unread(c);
3730                        return;
3731                    default:
3732                        /* Anything else Parse error. */
3733                        err("Bogus doctype.");
3734                        /*
3735                         * Switch to the bogus DOCTYPE state.
3736                         */
3737                        bogusDoctypeState();
3738                        return;
3739                }
3740            }
3741        }
3742    
3743        /**
3744         * Bogus DOCTYPE state
3745         * 
3746         * @throws IOException
3747         * @throws SAXException
3748         */
3749        private void bogusDoctypeState() throws SAXException, IOException {
3750            for (;;) {
3751                /*
3752                 * Consume the next input character:
3753                 */
3754                char c = read();
3755                switch (c) {
3756                    case '>':
3757                        /*
3758                         * U+003E GREATER-THAN SIGN (>) Set the DOCTYPE token's
3759                         * correctness flag to incorrect. Emit that DOCTYPE token.
3760                         */
3761                        tokenHandler.doctype(doctypeName, publicIdentifier,
3762                                systemIdentifier, false);
3763                        /*
3764                         * Switch to the data state.
3765                         */
3766                        return;
3767                    case '\u0000':
3768                        /* EOF Parse error. */
3769                        err("End of file inside doctype.");
3770                        /*
3771                         * Set the DOCTYPE token's correctness flag to incorrect.
3772                         * Emit that DOCTYPE token.
3773                         */
3774                        tokenHandler.doctype(doctypeName, publicIdentifier,
3775                                systemIdentifier, false);
3776                        /*
3777                         * Reconsume the EOF character in the data state.
3778                         */
3779                        unread(c);
3780                        return;
3781                    default:
3782                        /*
3783                         * Anything else Stay in the bogus DOCTYPE state.
3784                         */
3785                        continue;
3786                }
3787            }
3788        }
3789    
    /**
     * Consume entity
     * 
     * Unlike the definition in the spec, this method does not return a value
     * and never requires the caller to backtrack. This method takes care of
     * emitting characters or appending to the current attribute value. It also
     * takes care of that in the case when consuming the entity fails.
     * 
     * @param inAttribute
     *            <code>true</code> to append the outcome to the long string
     *            buffer; <code>false</code> to emit it (via
     *            <code>emitStrBuf()</code> or the token handler)
     * @throws IOException
     * @throws SAXException
     */
    private void consumeEntity(boolean inAttribute) throws SAXException,
            IOException {
        // strBuf collects the raw text of the reference, starting with the
        // ampersand, so that it can be passed through verbatim when no
        // entity is recognized.
        clearStrBuf();
        appendStrBuf('&');
        /*
         * This section defines how to consume an entity. This definition is
         * used when parsing entities in text and in attributes.
         * 
         * The behaviour depends on the identity of the next character (the one
         * immediately after the U+0026 AMPERSAND character):
         */
        char c = read();
        switch (c) {
            case ' ':
            case '\t':
            case '\n':
            case '\u000B':
            case '\u000C':
            case '<':
            case '&':
            case '\u0000':
                /*
                 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B LINE
                 * TABULATION U+000C FORM FEED (FF) U+0020 SPACE U+003C
                 * LESS-THAN SIGN U+0026 AMPERSAND EOF Not an entity. No
                 * characters are consumed, and nothing is returned. (This is
                 * not an error, either.)
                 */
                // Pass the lone ampersand through and push the character
                // after it back onto the input.
                if (inAttribute) {
                    appendStrBufToLongStrBuf();
                } else {
                    emitStrBuf();
                }
                unread(c);
                return;
            case '#':
                /*
                 * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER SIGN.
                 */
                // Numeric reference: keep the raw "&#" in strBuf and
                // delegate the rest to consumeNCR.
                appendStrBuf('#');
                consumeNCR(inAttribute);
                return;
            default:
                // Named reference: push the character back so that the
                // matching loop below reads it as column 0 of the name.
                unread(c);
                // Index of the character being compared within the names.
                int entCol = -1;
                // lo and hi bound the window of Entities.NAMES indices that
                // are still being compared against the input. Narrowing it
                // by character comparison presumably relies on NAMES being
                // sorted -- TODO confirm against Entities.
                int lo = 0;
                int hi = (Entities.NAMES.length - 1);
                // Index of the most recently recorded (hence longest) name
                // that matched in full, or -1 if none has matched yet.
                int candidate = -1;
                // Value of strBufLen when candidate was recorded; strBuf
                // characters from this index on were read past that name.
                int strBufMark = 0;
                outer: for (;;) {
                    entCol++;
                    c = read();
                    /*
                     * Anything else Consume the maximum number of characters
                     * possible, with the consumed characters case-sensitively
                     * matching one of the identifiers in the first column of
                     * the entities table.
                     */
                    // Lower hi while the current character is less than the
                    // character at entCol of NAMES[hi].
                    hiloop: for (;;) {
                        if (hi == -1) {
                            break;
                        }
                        if (entCol == Entities.NAMES[hi].length()) {
                            break hiloop;
                        }
                        if (entCol > Entities.NAMES[hi].length()) {
                            break outer;
                        } else if (c < Entities.NAMES[hi].charAt(entCol)) {
                            hi--;
                        } else {
                            break hiloop;
                        }
                    }

                    // Raise lo past names that end exactly at entCol
                    // (recording each as the new candidate) and past names
                    // whose character at entCol is less than the current
                    // character.
                    loloop: for (;;) {
                        if (hi < lo) {
                            break outer;
                        }
                        if (entCol == Entities.NAMES[lo].length()) {
                            candidate = lo;
                            strBufMark = strBufLen;
                            lo++;
                        } else if (entCol > Entities.NAMES[lo].length()) {
                            break outer;
                        } else if (c > Entities.NAMES[lo].charAt(entCol)) {
                            lo++;
                        } else {
                            break loloop;
                        }
                    }
                    if (hi < lo) {
                        break outer;
                    }
                    // The window is non-empty, so keep the character as part
                    // of the raw reference text and compare the next column.
                    appendStrBuf(c);
                }
                // Every exit from the loop happens before the current
                // character was appended to strBuf, so push it back.
                unread(c);
                // TODO warn about apos (IE) and TRADE (Opera)
                if (candidate == -1) {
                    /* If no match can be made, then this is a parse error. */
                    err("Text after \u201C&\u201D did not match an entity name.");
                    /*
                     * No characters are consumed, and nothing is returned.
                     */
                    // Pass the raw text collected so far through unchanged.
                    if (inAttribute) {
                        appendStrBufToLongStrBuf();
                    } else {
                        emitStrBuf();
                    }
                    return;
                } else {
                    if (!Entities.NAMES[candidate].endsWith(";")) {
                        /*
                         * If the last character matched is not a U+003B
                         * SEMICOLON (;), there is a parse error.
                         */
                        err("Entity reference was not terminated by a semicolon.");
                        if (inAttribute) {
                            /*
                             * If the entity is being consumed as part of an
                             * attribute, and the last character matched is not
                             * a U+003B SEMICOLON (;),
                             */
                            // Find the character that follows the matched
                            // name: peek at the input when nothing was
                            // buffered past the match, otherwise take the
                            // first buffered character after it.
                            if (strBufMark == strBufLen) {
                                c = read();
                                unread(c);
                            } else {
                                c = strBuf[strBufMark];
                            }
                            if ((c >= '0' && c <= '9')
                                    || (c >= 'A' && c <= 'Z')
                                    || (c >= 'a' && c <= 'z')) {
                                /*
                                 * and the next character is in the range U+0030
                                 * DIGIT ZERO to U+0039 DIGIT NINE, U+0041 LATIN
                                 * CAPITAL LETTER A to U+005A LATIN CAPITAL
                                 * LETTER Z, or U+0061 LATIN SMALL LETTER A to
                                 * U+007A LATIN SMALL LETTER Z, then, for
                                 * historical reasons, all the characters that
                                 * were matched after the U+0026 AMPERSAND (&)
                                 * must be unconsumed, and nothing is returned.
                                 */
                                // The whole raw text goes into the attribute
                                // value as literal characters.
                                appendStrBufToLongStrBuf();
                                return;
                            }
                        }
                    }

                    /*
                     * Otherwise, return a character token for the character
                     * corresponding to the entity name (as given by the second
                     * column of the entities table).
                     */
                    char[] val = Entities.VALUES[candidate];
                    emitOrAppend(val, inAttribute);
                    // this is so complicated!
                    // Characters buffered after the matched name are not
                    // part of the entity; pass them through as literal text.
                    if (strBufMark < strBufLen) {
                        if (inAttribute) {
                            for (int i = strBufMark; i < strBufLen; i++) {
                                appendLongStrBuf(strBuf[i]);
                            }
                        } else {
                            tokenHandler.characters(strBuf, strBufMark,
                                    strBufLen - strBufMark);
                        }
                    }
                    return;
                    /*
                     * If the markup contains I'm &notit; I tell you, the entity
                     * is parsed as "not", as in, I'm ¬it; I tell you. But if
                     * the markup was I'm &notin; I tell you, the entity would
                     * be parsed as "notin;", resulting in I'm ∉ I tell you.
                     */
                }

        }
    }
3977    
3978        private void consumeNCR(boolean inAttribute) throws SAXException,
3979                IOException {
3980            int prevValue = -1;
3981            int value = 0;
3982            boolean seenDigits = false;
3983            boolean hex = false;
3984            /*
3985             * The behaviour further depends on the character after the U+0023
3986             * NUMBER SIGN:
3987             */
3988            char c = read();
3989            if (c == 'x' || c == 'X') {
3990                /*
3991                 * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL LETTER X Consume
3992                 * the X.
3993                 * 
3994                 * Follow the steps below, but using the range of characters U+0030
3995                 * DIGIT ZERO through to U+0039 DIGIT NINE, U+0061 LATIN SMALL
3996                 * LETTER A through to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
3997                 * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL LETTER F (in
3998                 * other words, 0-9, A-F, a-f).
3999                 * 
4000                 * When it comes to interpreting the number, interpret it as a
4001                 * hexadecimal number.
4002                 */
4003                appendStrBuf(c);
4004                hex = true;
4005            } else {
4006                unread(c);
4007                /*
4008                 * Anything else Follow the steps below, but using the range of
4009                 * characters U+0030 DIGIT ZERO through to U+0039 DIGIT NINE (i.e.
4010                 * just 0-9).
4011                 * 
4012                 * When it comes to interpreting the number, interpret it as a
4013                 * decimal number.
4014                 */
4015            }
4016            for (;;) {
4017                // Deal with overflow gracefully
4018                if (value < prevValue) {
4019                    value = 0x110000; // Value above Unicode range but within int
4020                    // range
4021                }
4022                prevValue = value;
4023                /*
4024                 * Consume as many characters as match the range of characters given
4025                 * above.
4026                 */
4027                c = read();
4028                if (c >= '0' && c <= '9') {
4029                    seenDigits = true;
4030                    if (hex) {
4031                        value *= 16;
4032                    } else {
4033                        value *= 10;
4034                    }
4035                    value += c - '0';
4036                } else if (hex && c >= 'A' && c <= 'F') {
4037                    seenDigits = true;
4038                    value *= 16;
4039                    value += c - 'A' + 10;
4040                } else if (hex && c >= 'a' && c <= 'f') {
4041                    seenDigits = true;
4042                    value *= 16;
4043                    value += c - 'a' + 10;
4044                } else if (c == ';') {
4045                    if (seenDigits) {
4046                        handleNCRValue(value, inAttribute);
4047                        return;
4048                    } else {
4049                        err("No digits after \u201C" + strBufToString() + "\u201D.");
4050                        appendStrBuf(';');
4051                        if (inAttribute) {
4052                            appendStrBufToLongStrBuf();
4053                        } else {
4054                            emitStrBuf();
4055                        }
4056                        return;
4057                    }
4058                } else {
4059                    /*
4060                     * If no characters match the range, then don't consume any
4061                     * characters (and unconsume the U+0023 NUMBER SIGN character
4062                     * and, if appropriate, the X character). This is a parse error;
4063                     * nothing is returned.
4064                     * 
4065                     * Otherwise, if the next character is a U+003B SEMICOLON,
4066                     * consume that too. If it isn't, there is a parse error.
4067                     */
4068                    unread(c);
4069                    if (seenDigits) {
4070                        err("Character reference was not terminated by a semicolon.");
4071                        handleNCRValue(value, inAttribute);
4072                        return;
4073                    } else {
4074                        err("No digits after \u201C" + strBufToString() + "\u201D.");
4075                        if (inAttribute) {
4076                            appendStrBufToLongStrBuf();
4077                        } else {
4078                            emitStrBuf();
4079                        }
4080                        return;
4081                    }
4082                }
4083            }
4084        }
4085    
4086        private void handleNCRValue(int value, boolean inAttribute)
4087                throws SAXException, IOException {
4088            /*
4089             * If one or more characters match the range, then take them all and
4090             * interpret the string of characters as a number (either hexadecimal or
4091             * decimal as appropriate).
4092             */
4093            if (value >= 0x80 && value <= 0x9f) {
4094                /*
4095                 * If that number is one of the numbers in the first column of the
4096                 * following table, then this is a parse error.
4097                 */
4098                err("A numeric character reference expanded to the C1 controls range.");
4099                /*
4100                 * Find the row with that number in the first column, and return a
4101                 * character token for the Unicode character given in the second
4102                 * column of that row.
4103                 */
4104                char[] val = Entities.WINDOWS_1252[value - 0x80];
4105                emitOrAppend(val, inAttribute);
4106                return;
4107            } else if (value == 0x0D) {
4108                err("A numeric character reference expanded to carriage return.");
4109                emitOrAppend(LF, inAttribute);
4110                return;
4111            } else if (value == 0) {
4112                /*
4113                 * Otherwise, if the number is zero, if the number is higher than
4114                 * 0x10FFFF, or if it's one of the surrogate characters (characters
4115                 * in the range 0xD800 to 0xDFFF), then this is a parse error;
4116                 * return a character token for the U+FFFD REPLACEMENT CHARACTER
4117                 * character instead.
4118                 */
4119                err("Character reference expands to U+0000.");
4120                emitOrAppend(REPLACEMENT_CHARACTER, inAttribute);
4121                return;
4122            } else if ((contentSpacePolicy != XmlViolationPolicy.ALLOW)
4123                    && (value == 0xB || value == 0xC)) {
4124                if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
4125                    emitOrAppend(SPACE, inAttribute);
4126                } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
4127                    fatal("A character reference expanded to a space character that is not legal XML 1.0 white space.");
4128                }
4129            } else if ((value & 0xF800) == 0xD800) {
4130                err("Character reference expands to a surrogate.");
4131                emitOrAppend(REPLACEMENT_CHARACTER, inAttribute);
4132                return;
4133            } else if (value <= 0xFFFF) {
4134                /*
4135                 * Otherwise, return a character token for the Unicode character
4136                 * whose code point is that number.
4137                 */
4138                char c = (char) value;
4139                if (c < '\t' || (c > '\r' && c < ' ') || isNonCharacter(c)) {
4140                    if (contentNonXmlCharPolicy != XmlViolationPolicy.FATAL) {
4141                        if (contentNonXmlCharPolicy == XmlViolationPolicy.ALTER_INFOSET) {
4142                            c = '\uFFFD';
4143                        }
4144                        warn("Character reference expanded to a character that is not a legal XML 1.0 character.");
4145                    } else {
4146                        fatal("Character reference expanded to a character that is not a legal XML 1.0 character.");
4147                    }
4148                }
4149                if (isPrivateUse(c)) {
4150                    warnAboutPrivateUseChar();
4151                }
4152                bmpChar[0] = c;
4153                emitOrAppend(bmpChar, inAttribute);
4154                return;
4155            } else if (value <= 0x10FFFF) {
4156                if (isNonCharacter(value)) {
4157                    warn("Character reference expands to an astral non-character.");
4158                }
4159                if (isAstralPrivateUse(value)) {
4160                    warnAboutPrivateUseChar();
4161                }
4162                astralChar[0] = (char) (LEAD_OFFSET + (value >> 10));
4163                astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
4164                emitOrAppend(astralChar, inAttribute);
4165                return;
4166            } else {
4167                err("Character reference outside the permissible Unicode range.");
4168                emitOrAppend(REPLACEMENT_CHARACTER, inAttribute);
4169                return;
4170            }
4171        }
4172    
4173        /**
4174         * @param val
4175         * @throws SAXException
4176         * @throws IOException
4177         */
4178        private void emitOrAppend(char[] val, boolean inAttribute)
4179                throws SAXException, IOException {
4180            if (inAttribute) {
4181                appendLongStrBuf(val);
4182            } else {
4183                tokenHandler.characters(val, 0, val.length);
4184            }
4185        }
4186    
4187        /**
4188         * Returns the mappingLangToXmlLang.
4189         * 
4190         * @return the mappingLangToXmlLang
4191         */
4192        public boolean isMappingLangToXmlLang() {
4193            return mappingLangToXmlLang;
4194        }
4195    
    /**
     * Sets the <code>mappingLangToXmlLang</code> flag.
     * 
     * NOTE(review): judging by the name, the flag presumably controls whether
     * <code>lang</code> is mapped to <code>xml:lang</code>; confirm at the
     * places where the field is read.
     * 
     * @param mappingLangToXmlLang
     *            the new value of the flag
     */
    public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
        this.mappingLangToXmlLang = mappingLangToXmlLang;
    }
4205    }