001 /*
002 * Copyright (c) 2005, 2006, 2007 Henri Sivonen
003 * Copyright (c) 2007 Mozilla Foundation
004 * Portions of comments Copyright 2004-2007 Apple Computer, Inc., Mozilla
005 * Foundation, and Opera Software ASA.
006 *
007 * Permission is hereby granted, free of charge, to any person obtaining a
008 * copy of this software and associated documentation files (the "Software"),
009 * to deal in the Software without restriction, including without limitation
010 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
011 * and/or sell copies of the Software, and to permit persons to whom the
012 * Software is furnished to do so, subject to the following conditions:
013 *
014 * The above copyright notice and this permission notice shall be included in
015 * all copies or substantial portions of the Software.
016 *
017 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
018 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
019 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
020 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
021 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
022 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
023 * DEALINGS IN THE SOFTWARE.
024 */
025
026 /*
027 * The comments following this one that use the same comment syntax as this
028 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
029 * amended as of June 23 2007.
030 * That document came with this statement:
031 * "© Copyright 2004-2007 Apple Computer, Inc., Mozilla Foundation, and
032 * Opera Software ASA. You are granted a license to use, reproduce and
033 * create derivative works of this document."
034 */
035
036 package nu.validator.htmlparser.impl;
037
038 import java.io.IOException;
039 import java.io.InputStream;
040 import java.io.Reader;
041 import java.nio.charset.Charset;
042 import java.nio.charset.CharsetDecoder;
043 import java.nio.charset.IllegalCharsetNameException;
044 import java.nio.charset.UnsupportedCharsetException;
045 import java.util.Arrays;
046 import java.util.regex.Matcher;
047 import java.util.regex.Pattern;
048
049 import nu.validator.htmlparser.common.XmlViolationPolicy;
050
051 import org.xml.sax.Attributes;
052 import org.xml.sax.ErrorHandler;
053 import org.xml.sax.InputSource;
054 import org.xml.sax.Locator;
055 import org.xml.sax.SAXException;
056 import org.xml.sax.SAXParseException;
057
/**
 * An implementation of
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/section-tokenisation.html
 *
 * This class implements the <code>Locator</code> interface. This is not an
 * incidental implementation detail: Users of this class are encouraged to make
 * use of the <code>Locator</code> nature.
 *
 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
 * can be configured to treat these conditions as fatal or to coerce the infoset
 * to something that XML 1.0 allows.
 *
 * @version $Id: Tokenizer.java 166 2007-10-14 19:42:57Z hsivonen $
 * @author hsivonen
 */
073 public final class Tokenizer implements Locator {
074
075 private static final Pattern NCNAME_PATTERN = Pattern.compile("(?:[\\u0041-\\u005A]|[\\u0061-\\u007A]|[\\u00C0-\\u00D6]|[\\u00D8-\\u00F6]|[\\u00F8-\\u00FF]|[\\u0100-\\u0131]|[\\u0134-\\u013E]|[\\u0141-\\u0148]|[\\u014A-\\u017E]|[\\u0180-\\u01C3]|[\\u01CD-\\u01F0]|[\\u01F4-\\u01F5]|[\\u01FA-\\u0217]|[\\u0250-\\u02A8]|[\\u02BB-\\u02C1]|\\u0386|[\\u0388-\\u038A]|\\u038C|[\\u038E-\\u03A1]|[\\u03A3-\\u03CE]|[\\u03D0-\\u03D6]|\\u03DA|\\u03DC|\\u03DE|\\u03E0|[\\u03E2-\\u03F3]|[\\u0401-\\u040C]|[\\u040E-\\u044F]|[\\u0451-\\u045C]|[\\u045E-\\u0481]|[\\u0490-\\u04C4]|[\\u04C7-\\u04C8]|[\\u04CB-\\u04CC]|[\\u04D0-\\u04EB]|[\\u04EE-\\u04F5]|[\\u04F8-\\u04F9]|[\\u0531-\\u0556]|\\u0559|[\\u0561-\\u0586]|[\\u05D0-\\u05EA]|[\\u05F0-\\u05F2]|[\\u0621-\\u063A]|[\\u0641-\\u064A]|[\\u0671-\\u06B7]|[\\u06BA-\\u06BE]|[\\u06C0-\\u06CE]|[\\u06D0-\\u06D3]|\\u06D5|[\\u06E5-\\u06E6]|[\\u0905-\\u0939]|\\u093D|[\\u0958-\\u0961]|[\\u0985-\\u098C]|[\\u098F-\\u0990]|[\\u0993-\\u09A8]|[\\u09AA-\\u09B0]|\\u09B2|[\\u09B6-\\u09B9]|[\\u09DC-\\u09DD]|[\\u09DF-\\u09E1]|[\\u09F0-\\u09F1]|[\\u0A05-\\u0A0A]|[\\u0A0F-\\u0A10]|[\\u0A13-\\u0A28]|[\\u0A2A-\\u0A30]|[\\u0A32-\\u0A33]|[\\u0A35-\\u0A36]|[\\u0A38-\\u0A39]|[\\u0A59-\\u0A5C]|\\u0A5E|[\\u0A72-\\u0A74]|[\\u0A85-\\u0A8B]|\\u0A8D|[\\u0A8F-\\u0A91]|[\\u0A93-\\u0AA8]|[\\u0AAA-\\u0AB0]|[\\u0AB2-\\u0AB3]|[\\u0AB5-\\u0AB9]|\\u0ABD|\\u0AE0|[\\u0B05-\\u0B0C]|[\\u0B0F-\\u0B10]|[\\u0B13-\\u0B28]|[\\u0B2A-\\u0B30]|[\\u0B32-\\u0B33]|[\\u0B36-\\u0B39]|\\u0B3D|[\\u0B5C-\\u0B5D]|[\\u0B5F-\\u0B61]|[\\u0B85-\\u0B8A]|[\\u0B8E-\\u0B90]|[\\u0B92-\\u0B95]|[\\u0B99-\\u0B9A]|\\u0B9C|[\\u0B9E-\\u0B9F]|[\\u0BA3-\\u0BA4]|[\\u0BA8-\\u0BAA]|[\\u0BAE-\\u0BB5]|[\\u0BB7-\\u0BB9]|[\\u0C05-\\u0C0C]|[\\u0C0E-\\u0C10]|[\\u0C12-\\u0C28]|[\\u0C2A-\\u0C33]|[\\u0C35-\\u0C39]|[\\u0C60-\\u0C61]|[\\u0C85-\\u0C8C]|[\\u0C8E-\\u0C90]|[\\u0C92-\\u0CA8]|[\\u0CAA-\\u0CB3]|[\\u0CB5-\\u0CB9]|\\u0CDE|[\\u0CE0-\\u0CE1]|[\\u0D05-\\u0D0C]|[\\u0D0E-\\u0D10]|[\\u0D12-\\u0D28]|[\\u0D2A-\\u0D39]|[\\u0D60-\\u
0D61]|[\\u0E01-\\u0E2E]|\\u0E30|[\\u0E32-\\u0E33]|[\\u0E40-\\u0E45]|[\\u0E81-\\u0E82]|\\u0E84|[\\u0E87-\\u0E88]|\\u0E8A|\\u0E8D|[\\u0E94-\\u0E97]|[\\u0E99-\\u0E9F]|[\\u0EA1-\\u0EA3]|\\u0EA5|\\u0EA7|[\\u0EAA-\\u0EAB]|[\\u0EAD-\\u0EAE]|\\u0EB0|[\\u0EB2-\\u0EB3]|\\u0EBD|[\\u0EC0-\\u0EC4]|[\\u0F40-\\u0F47]|[\\u0F49-\\u0F69]|[\\u10A0-\\u10C5]|[\\u10D0-\\u10F6]|\\u1100|[\\u1102-\\u1103]|[\\u1105-\\u1107]|\\u1109|[\\u110B-\\u110C]|[\\u110E-\\u1112]|\\u113C|\\u113E|\\u1140|\\u114C|\\u114E|\\u1150|[\\u1154-\\u1155]|\\u1159|[\\u115F-\\u1161]|\\u1163|\\u1165|\\u1167|\\u1169|[\\u116D-\\u116E]|[\\u1172-\\u1173]|\\u1175|\\u119E|\\u11A8|\\u11AB|[\\u11AE-\\u11AF]|[\\u11B7-\\u11B8]|\\u11BA|[\\u11BC-\\u11C2]|\\u11EB|\\u11F0|\\u11F9|[\\u1E00-\\u1E9B]|[\\u1EA0-\\u1EF9]|[\\u1F00-\\u1F15]|[\\u1F18-\\u1F1D]|[\\u1F20-\\u1F45]|[\\u1F48-\\u1F4D]|[\\u1F50-\\u1F57]|\\u1F59|\\u1F5B|\\u1F5D|[\\u1F5F-\\u1F7D]|[\\u1F80-\\u1FB4]|[\\u1FB6-\\u1FBC]|\\u1FBE|[\\u1FC2-\\u1FC4]|[\\u1FC6-\\u1FCC]|[\\u1FD0-\\u1FD3]|[\\u1FD6-\\u1FDB]|[\\u1FE0-\\u1FEC]|[\\u1FF2-\\u1FF4]|[\\u1FF6-\\u1FFC]|\\u2126|[\\u212A-\\u212B]|\\u212E|[\\u2180-\\u2182]|[\\u3041-\\u3094]|[\\u30A1-\\u30FA]|[\\u3105-\\u312C]|[\\uAC00-\\uD7A3]|[\\u4E00-\\u9FA5]|\\u3007|[\\u3021-\\u3029]|_)(?:[\\u0030-\\u0039]|[\\u0660-\\u0669]|[\\u06F0-\\u06F9]|[\\u0966-\\u096F]|[\\u09E6-\\u09EF]|[\\u0A66-\\u0A6F]|[\\u0AE6-\\u0AEF]|[\\u0B66-\\u0B6F]|[\\u0BE7-\\u0BEF]|[\\u0C66-\\u0C6F]|[\\u0CE6-\\u0CEF]|[\\u0D66-\\u0D6F]|[\\u0E50-\\u0E59]|[\\u0ED0-\\u0ED9]|[\\u0F20-\\u0F29]|[\\u0041-\\u005A]|[\\u0061-\\u007A]|[\\u00C0-\\u00D6]|[\\u00D8-\\u00F6]|[\\u00F8-\\u00FF]|[\\u0100-\\u0131]|[\\u0134-\\u013E]|[\\u0141-\\u0148]|[\\u014A-\\u017E]|[\\u0180-\\u01C3]|[\\u01CD-\\u01F0]|[\\u01F4-\\u01F5]|[\\u01FA-\\u0217]|[\\u0250-\\u02A8]|[\\u02BB-\\u02C1]|\\u0386|[\\u0388-\\u038A]|\\u038C|[\\u038E-\\u03A1]|[\\u03A3-\\u03CE]|[\\u03D0-\\u03D6]|\\u03DA|\\u03DC|\\u03DE|\\u03E0|[\\u03E2-\\u03F3]|[\\u0401-\\u040C]|[\\u040E-\\u044F]|[\\u0451-\\u045C]|[\\u045E-\\u0481]|[\\u0490-\\u04C
4]|[\\u04C7-\\u04C8]|[\\u04CB-\\u04CC]|[\\u04D0-\\u04EB]|[\\u04EE-\\u04F5]|[\\u04F8-\\u04F9]|[\\u0531-\\u0556]|\\u0559|[\\u0561-\\u0586]|[\\u05D0-\\u05EA]|[\\u05F0-\\u05F2]|[\\u0621-\\u063A]|[\\u0641-\\u064A]|[\\u0671-\\u06B7]|[\\u06BA-\\u06BE]|[\\u06C0-\\u06CE]|[\\u06D0-\\u06D3]|\\u06D5|[\\u06E5-\\u06E6]|[\\u0905-\\u0939]|\\u093D|[\\u0958-\\u0961]|[\\u0985-\\u098C]|[\\u098F-\\u0990]|[\\u0993-\\u09A8]|[\\u09AA-\\u09B0]|\\u09B2|[\\u09B6-\\u09B9]|[\\u09DC-\\u09DD]|[\\u09DF-\\u09E1]|[\\u09F0-\\u09F1]|[\\u0A05-\\u0A0A]|[\\u0A0F-\\u0A10]|[\\u0A13-\\u0A28]|[\\u0A2A-\\u0A30]|[\\u0A32-\\u0A33]|[\\u0A35-\\u0A36]|[\\u0A38-\\u0A39]|[\\u0A59-\\u0A5C]|\\u0A5E|[\\u0A72-\\u0A74]|[\\u0A85-\\u0A8B]|\\u0A8D|[\\u0A8F-\\u0A91]|[\\u0A93-\\u0AA8]|[\\u0AAA-\\u0AB0]|[\\u0AB2-\\u0AB3]|[\\u0AB5-\\u0AB9]|\\u0ABD|\\u0AE0|[\\u0B05-\\u0B0C]|[\\u0B0F-\\u0B10]|[\\u0B13-\\u0B28]|[\\u0B2A-\\u0B30]|[\\u0B32-\\u0B33]|[\\u0B36-\\u0B39]|\\u0B3D|[\\u0B5C-\\u0B5D]|[\\u0B5F-\\u0B61]|[\\u0B85-\\u0B8A]|[\\u0B8E-\\u0B90]|[\\u0B92-\\u0B95]|[\\u0B99-\\u0B9A]|\\u0B9C|[\\u0B9E-\\u0B9F]|[\\u0BA3-\\u0BA4]|[\\u0BA8-\\u0BAA]|[\\u0BAE-\\u0BB5]|[\\u0BB7-\\u0BB9]|[\\u0C05-\\u0C0C]|[\\u0C0E-\\u0C10]|[\\u0C12-\\u0C28]|[\\u0C2A-\\u0C33]|[\\u0C35-\\u0C39]|[\\u0C60-\\u0C61]|[\\u0C85-\\u0C8C]|[\\u0C8E-\\u0C90]|[\\u0C92-\\u0CA8]|[\\u0CAA-\\u0CB3]|[\\u0CB5-\\u0CB9]|\\u0CDE|[\\u0CE0-\\u0CE1]|[\\u0D05-\\u0D0C]|[\\u0D0E-\\u0D10]|[\\u0D12-\\u0D28]|[\\u0D2A-\\u0D39]|[\\u0D60-\\u0D61]|[\\u0E01-\\u0E2E]|\\u0E30|[\\u0E32-\\u0E33]|[\\u0E40-\\u0E45]|[\\u0E81-\\u0E82]|\\u0E84|[\\u0E87-\\u0E88]|\\u0E8A|\\u0E8D|[\\u0E94-\\u0E97]|[\\u0E99-\\u0E9F]|[\\u0EA1-\\u0EA3]|\\u0EA5|\\u0EA7|[\\u0EAA-\\u0EAB]|[\\u0EAD-\\u0EAE]|\\u0EB0|[\\u0EB2-\\u0EB3]|\\u0EBD|[\\u0EC0-\\u0EC4]|[\\u0F40-\\u0F47]|[\\u0F49-\\u0F69]|[\\u10A0-\\u10C5]|[\\u10D0-\\u10F6]|\\u1100|[\\u1102-\\u1103]|[\\u1105-\\u1107]|\\u1109|[\\u110B-\\u110C]|[\\u110E-\\u1112]|\\u113C|\\u113E|\\u1140|\\u114C|\\u114E|\\u1150|[\\u1154-\\u1155]|\\u1159|[\\u115F-\\u1161]|\\u1163|\\u1165|\\u1167|\\u
1169|[\\u116D-\\u116E]|[\\u1172-\\u1173]|\\u1175|\\u119E|\\u11A8|\\u11AB|[\\u11AE-\\u11AF]|[\\u11B7-\\u11B8]|\\u11BA|[\\u11BC-\\u11C2]|\\u11EB|\\u11F0|\\u11F9|[\\u1E00-\\u1E9B]|[\\u1EA0-\\u1EF9]|[\\u1F00-\\u1F15]|[\\u1F18-\\u1F1D]|[\\u1F20-\\u1F45]|[\\u1F48-\\u1F4D]|[\\u1F50-\\u1F57]|\\u1F59|\\u1F5B|\\u1F5D|[\\u1F5F-\\u1F7D]|[\\u1F80-\\u1FB4]|[\\u1FB6-\\u1FBC]|\\u1FBE|[\\u1FC2-\\u1FC4]|[\\u1FC6-\\u1FCC]|[\\u1FD0-\\u1FD3]|[\\u1FD6-\\u1FDB]|[\\u1FE0-\\u1FEC]|[\\u1FF2-\\u1FF4]|[\\u1FF6-\\u1FFC]|\\u2126|[\\u212A-\\u212B]|\\u212E|[\\u2180-\\u2182]|[\\u3041-\\u3094]|[\\u30A1-\\u30FA]|[\\u3105-\\u312C]|[\\uAC00-\\uD7A3]|[\\u4E00-\\u9FA5]|\\u3007|[\\u3021-\\u3029]|_|\\.|-|[\\u0300-\\u0345]|[\\u0360-\\u0361]|[\\u0483-\\u0486]|[\\u0591-\\u05A1]|[\\u05A3-\\u05B9]|[\\u05BB-\\u05BD]|\\u05BF|[\\u05C1-\\u05C2]|\\u05C4|[\\u064B-\\u0652]|\\u0670|[\\u06D6-\\u06DC]|[\\u06DD-\\u06DF]|[\\u06E0-\\u06E4]|[\\u06E7-\\u06E8]|[\\u06EA-\\u06ED]|[\\u0901-\\u0903]|\\u093C|[\\u093E-\\u094C]|\\u094D|[\\u0951-\\u0954]|[\\u0962-\\u0963]|[\\u0981-\\u0983]|\\u09BC|\\u09BE|\\u09BF|[\\u09C0-\\u09C4]|[\\u09C7-\\u09C8]|[\\u09CB-\\u09CD]|\\u09D7|[\\u09E2-\\u09E3]|\\u0A02|\\u0A3C|\\u0A3E|\\u0A3F|[\\u0A40-\\u0A42]|[\\u0A47-\\u0A48]|[\\u0A4B-\\u0A4D]|[\\u0A70-\\u0A71]|[\\u0A81-\\u0A83]|\\u0ABC|[\\u0ABE-\\u0AC5]|[\\u0AC7-\\u0AC9]|[\\u0ACB-\\u0ACD]|[\\u0B01-\\u0B03]|\\u0B3C|[\\u0B3E-\\u0B43]|[\\u0B47-\\u0B48]|[\\u0B4B-\\u0B4D]|[\\u0B56-\\u0B57]|[\\u0B82-\\u0B83]|[\\u0BBE-\\u0BC2]|[\\u0BC6-\\u0BC8]|[\\u0BCA-\\u0BCD]|\\u0BD7|[\\u0C01-\\u0C03]|[\\u0C3E-\\u0C44]|[\\u0C46-\\u0C48]|[\\u0C4A-\\u0C4D]|[\\u0C55-\\u0C56]|[\\u0C82-\\u0C83]|[\\u0CBE-\\u0CC4]|[\\u0CC6-\\u0CC8]|[\\u0CCA-\\u0CCD]|[\\u0CD5-\\u0CD6]|[\\u0D02-\\u0D03]|[\\u0D3E-\\u0D43]|[\\u0D46-\\u0D48]|[\\u0D4A-\\u0D4D]|\\u0D57|\\u0E31|[\\u0E34-\\u0E3A]|[\\u0E47-\\u0E4E]|\\u0EB1|[\\u0EB4-\\u0EB9]|[\\u0EBB-\\u0EBC]|[\\u0EC8-\\u0ECD]|[\\u0F18-\\u0F19]|\\u0F35|\\u0F37|\\u0F39|\\u0F3E|\\u0F3F|[\\u0F71-\\u0F84]|[\\u0F86-\\u0F8B]|[\\u0F90-\\u0F95]|\\u0F97|[\\u0F99-\\
u0FAD]|[\\u0FB1-\\u0FB7]|\\u0FB9|[\\u20D0-\\u20DC]|\\u20E1|[\\u302A-\\u302F]|\\u3099|\\u309A|\\u00B7|\\u02D0|\\u02D1|\\u0387|\\u0640|\\u0E46|\\u0EC6|\\u3005|[\\u3031-\\u3035]|[\\u309D-\\u309E]|[\\u30FC-\\u30FE])*");
076
077 /**
078 * Magic value for UTF-16 operations.
079 */
080 private static final int LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
081
082 /**
083 * Magic value for UTF-16 operations.
084 */
085 private static final int SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;
086
087 /**
088 * UTF-16 code unit array containing less than and greater than for emitting
089 * those characters on certain parse errors.
090 */
091 private static final char[] LT_GT = { '<', '>' };
092
093 /**
094 * UTF-16 code unit array containing less than and solidus for emitting
095 * those characters on certain parse errors.
096 */
097 private static final char[] LT_SOLIDUS = { '<', '/' };
098
099 /**
100 * Array version of U+FFFD.
101 */
102 private static final char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
103
104 /**
105 * Array version of space.
106 */
107 private static final char[] SPACE = { ' ' };
108
109 /**
110 * Array version of line feed.
111 */
112 private static final char[] LF = { '\n' };
113
114 /**
115 * Buffer growth parameter.
116 */
117 private static final int BUFFER_GROW_BY = 1024;
118
119 /**
120 * Lexically sorted void element names
121 */
122 private static final String[] VOID_ELEMENTS = { "area", "base", "br",
123 "col", "embed", "hr", "img", "input", "link", "meta", "param" };
124
125 /**
126 * "octype" as <code>char[]</code>
127 */
128 private static final char[] OCTYPE = "octype".toCharArray();
129
130 /**
131 * "ublic" as <code>char[]</code>
132 */
133 private static final char[] UBLIC = "ublic".toCharArray();
134
135 /**
136 * "ystem" as <code>char[]</code>
137 */
138 private static final char[] YSTEM = "ystem".toCharArray();
139
140 /**
141 * The token handler.
142 */
143 private final TokenHandler tokenHandler;
144
145 /**
146 * The error handler.
147 */
148 private ErrorHandler errorHandler;
149
150 /**
151 * The input UTF-16 code unit stream. If a byte stream was given, this
152 * object is an instance of <code>HtmlInputStreamReader</code>.
153 */
154 private Reader reader;
155
156 /**
157 * The main input buffer that the tokenizer reads from. Filled from
158 * <code>reader</code>.
159 */
160 private char[] buf = new char[2048];
161
162 /**
163 * The index of the last <code>char</code> read from <code>buf</code>.
164 */
165 private int pos;
166
167 /**
168 * The index of the first <code>char</code> in <code>buf</code> that is
169 * part of a coalesced run of character tokens or <code>-1</code> if there
170 * is not a current run being coalesced.
171 */
172 private int cstart;
173
174 /**
175 * The number of <code>char</code>s in <code>buf</code> that have
176 * meaning. (The rest of the array is garbage and should not be examined.)
177 */
178 private int bufLen;
179
180 /**
181 * The previous <code>char</code> read from the buffer with infoset
182 * alteration applied except for CR. Used for CRLF normalization and
183 * surrogate pair checking.
184 */
185 private char prev;
186
187 /**
188 * Lookbehind buffer for magic RCDATA/CDATA escaping.
189 */
190 private final char[] prevFour = new char[4];
191
192 /**
193 * Points to the last <code>char</code> written to <code>prevFour</code>.
194 */
195 private int prevFourPtr = 0;
196
197 /**
198 * Single code unit buffer for reconsuming an input character. If
199 * <code>-1</code> the next <code>read()</code> returns from the real
200 * buffer, otherwise from here.
201 */
202 private int unreadBuffer = -1;
203
204 /**
205 * The current line number in the current resource being parsed. (First line
206 * is 1.) Passed on as locator data.
207 */
208 private int line;
209
210 private int linePrev;
211
212 /**
213 * The current column number in the current resource being tokenized. (First
214 * column is 1, counted by UTF-16 code units.) Passed on as locator data.
215 */
216 private int col;
217
218 private int colPrev;
219
220 private boolean nextCharOnNewLine;
221
/**
 * The SAX public id for the resource being tokenized. (Only passed back
 * as part of locator data.)
 */
226 private String publicId;
227
/**
 * The SAX system id for the resource being tokenized. (Only passed back
 * as part of locator data.)
 */
232 private String systemId;
233
234 /**
235 * Buffer for short identifiers.
236 */
237 private char[] strBuf = new char[64];
238
239 /**
240 * Number of significant <code>char</code>s in <code>strBuf</code>.
241 */
242 private int strBufLen = 0;
243
244 /**
245 * Buffer for long strings.
246 */
247 private char[] longStrBuf = new char[1024];
248
249 /**
250 * Number of significant <code>char</code>s in <code>longStrBuf</code>.
251 */
252 private int longStrBufLen = 0;
253
254 /**
255 * If not U+0000, a pending code unit to be appended to
256 * <code>longStrBuf</code>.
257 */
258 private char longStrBufPending = '\u0000';
259
260 /**
261 * The attribute holder.
262 */
263 private AttributesImpl attributes;
264
265 /**
266 * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
267 */
268 private final char[] bmpChar = new char[1];
269
270 /**
271 * Buffer for expanding astral NCRs.
272 */
273 private final char[] astralChar = new char[2];
274
275 /**
276 * Keeps track of PUA warnings.
277 */
278 private boolean alreadyWarnedAboutPrivateUseCharacters;
279
280 /**
281 * http://www.whatwg.org/specs/web-apps/current-work/#content2
282 */
283 private ContentModelFlag contentModelFlag = ContentModelFlag.PCDATA;
284
285 /**
286 * http://www.whatwg.org/specs/web-apps/current-work/#escape
287 */
288 private boolean escapeFlag = false;
289
290 /**
291 * The element whose end tag closes the current CDATA or RCDATA element.
292 */
293 private String contentModelElement = "";
294
295 /**
296 * <code>true</code> if tokenizing an end tag
297 */
298 private boolean endTag;
299
300 /**
301 * The current tag token name.
302 */
303 private String tagName = null;
304
305 /**
306 * The current attribute name.
307 */
308 private String attributeName = null;
309
310 /**
311 * Whether comment tokens are emitted.
312 */
313 private boolean wantsComments = false;
314
315 /**
316 * If <code>false</code>, <code>addAttribute*()</code> are no-ops.
317 */
318 private boolean shouldAddAttributes;
319
320 /**
321 * <code>true</code> when in text content or in attribute value.
322 */
323 private boolean inContent;
324
325 /**
326 * <code>true</code> when HTML4-specific additional errors are requested.
327 */
328 private boolean html4;
329
330 /**
331 * Whether non-ASCII causes an error.
332 */
333 private boolean nonAsciiProhibited;
334
335 /**
336 * Used together with <code>nonAsciiProhibited</code>.
337 */
338 private boolean alreadyComplainedAboutNonAscii;
339
340 /**
341 * Whether the stream is past the first 512 bytes.
342 */
343 private boolean metaBoundaryPassed;
344
345 /**
346 * The name of the current doctype token.
347 */
348 private String doctypeName;
349
350 /**
351 * The public id of the current doctype token.
352 */
353 private String publicIdentifier;
354
355 /**
356 * The system id of the current doctype token.
357 */
358 private String systemIdentifier;
359
360 /**
361 * Used for NFC checking if non-<code>null</code>, source code capture,
362 * etc.
363 */
364 private CharacterHandler[] characterHandlers = new CharacterHandler[0];
365
366 /**
367 * The policy for vertical tab and form feed.
368 */
369 private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALLOW;
370
371 /**
372 * The policy for non-space non-XML characters.
373 */
374 private XmlViolationPolicy contentNonXmlCharPolicy = XmlViolationPolicy.ALLOW;
375
376 /**
377 * The policy for comments.
378 */
379 private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALLOW;
380
381 private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALLOW;
382
383 private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALLOW;
384
385 private boolean swallowBom;
386
387 private boolean html4ModeCompatibleWithXhtml1Schemata;
388
389 private boolean mappingLangToXmlLang;
390
391 private XmlViolationPolicy bogusXmlnsPolicy;
392
393 // start public API
394
/**
 * Creates a tokenizer that delivers its tokens to the given handler.
 *
 * @param tokenHandler
 *            the handler for receiving tokens; stored as-is for the
 *            lifetime of this tokenizer
 */
public Tokenizer(TokenHandler tokenHandler) {
    this.tokenHandler = tokenHandler;
}
404
405 /**
406 * Turns NFC checking on or off.
407 *
408 * @param enable
409 * <code>true</code> if checking on
410 */
411 public void setCheckingNormalization(boolean enable) {
412 if (enable) {
413 if (isCheckingNormalization()) {
414 return;
415 } else {
416 NormalizationChecker normalizationChecker = new NormalizationChecker(
417 this);
418 normalizationChecker.setErrorHandler(errorHandler);
419
420 }
421 } else {
422 if (isCheckingNormalization()) {
423 CharacterHandler[] newHandlers = new CharacterHandler[characterHandlers.length - 1];
424 boolean skipped = false;
425 int j = 0;
426 for (int i = 0; i < characterHandlers.length; i++) {
427 CharacterHandler ch = characterHandlers[i];
428 if (!(!skipped && (ch instanceof NormalizationChecker))) {
429 newHandlers[j] = ch;
430 j++;
431 }
432 }
433 characterHandlers = newHandlers;
434 } else {
435 return;
436 }
437 }
438 }
439
440 public void addCharacterHandler(CharacterHandler characterHandler) {
441 if (characterHandler == null) {
442 throw new IllegalArgumentException("Null argument.");
443 }
444 CharacterHandler[] newHandlers = new CharacterHandler[characterHandlers.length + 1];
445 System.arraycopy(characterHandlers, 0, newHandlers, 0,
446 characterHandlers.length);
447 newHandlers[characterHandlers.length] = characterHandler;
448 characterHandlers = newHandlers;
449 }
450
451 /**
452 * Query if checking normalization.
453 *
454 * @return <code>true</code> if checking on
455 */
456 public boolean isCheckingNormalization() {
457 for (int i = 0; i < characterHandlers.length; i++) {
458 CharacterHandler ch = characterHandlers[i];
459 if (ch instanceof NormalizationChecker) {
460 return true;
461 }
462 }
463 return false;
464 }
465
466 /**
467 * Sets the error handler.
468 *
469 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
470 */
471 public void setErrorHandler(ErrorHandler eh) {
472 this.errorHandler = eh;
473 for (int i = 0; i < characterHandlers.length; i++) {
474 CharacterHandler ch = characterHandlers[i];
475 if (ch instanceof NormalizationChecker) {
476 NormalizationChecker nc = (NormalizationChecker) ch;
477 nc.setErrorHandler(eh);
478 }
479 }
480 }
481
482 /**
483 * Returns the commentPolicy.
484 *
485 * @return the commentPolicy
486 */
487 public XmlViolationPolicy getCommentPolicy() {
488 return commentPolicy;
489 }
490
491 /**
492 * Sets the commentPolicy.
493 *
494 * @param commentPolicy
495 * the commentPolicy to set
496 */
497 public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
498 this.commentPolicy = commentPolicy;
499 }
500
501 /**
502 * Returns the contentNonXmlCharPolicy.
503 *
504 * @return the contentNonXmlCharPolicy
505 */
506 public XmlViolationPolicy getContentNonXmlCharPolicy() {
507 return contentNonXmlCharPolicy;
508 }
509
510 /**
511 * Sets the contentNonXmlCharPolicy.
512 *
513 * @param contentNonXmlCharPolicy
514 * the contentNonXmlCharPolicy to set
515 */
516 public void setContentNonXmlCharPolicy(
517 XmlViolationPolicy contentNonXmlCharPolicy) {
518 this.contentNonXmlCharPolicy = contentNonXmlCharPolicy;
519 }
520
521 /**
522 * Returns the contentSpacePolicy.
523 *
524 * @return the contentSpacePolicy
525 */
526 public XmlViolationPolicy getContentSpacePolicy() {
527 return contentSpacePolicy;
528 }
529
530 /**
531 * Sets the contentSpacePolicy.
532 *
533 * @param contentSpacePolicy
534 * the contentSpacePolicy to set
535 */
536 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
537 this.contentSpacePolicy = contentSpacePolicy;
538 }
539
540 /**
541 * Sets the xmlnsPolicy.
542 *
543 * @param xmlnsPolicy
544 * the xmlnsPolicy to set
545 */
546 public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
547 if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
548 throw new IllegalArgumentException("Can't use FATAL here.");
549 }
550 this.xmlnsPolicy = xmlnsPolicy;
551 }
552
553 public void setNamePolicy(XmlViolationPolicy namePolicy) {
554 this.namePolicy = namePolicy;
555 }
556
557 /**
558 * Sets the bogusXmlnsPolicy.
559 *
560 * @param bogusXmlnsPolicy
561 * the bogusXmlnsPolicy to set
562 */
563 public void setBogusXmlnsPolicy(XmlViolationPolicy bogusXmlnsPolicy) {
564 this.bogusXmlnsPolicy = bogusXmlnsPolicy;
565 }
566
567 /**
568 * Sets the html4ModeCompatibleWithXhtml1Schemata.
569 *
570 * @param html4ModeCompatibleWithXhtml1Schemata
571 * the html4ModeCompatibleWithXhtml1Schemata to set
572 */
573 public void setHtml4ModeCompatibleWithXhtml1Schemata(
574 boolean html4ModeCompatibleWithXhtml1Schemata) {
575 this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
576 }
577
578 /**
579 * Runs the tokenization. This is the main entry point.
580 *
581 * @param is
582 * the input source
583 * @throws SAXException
584 * on fatal error (if configured to treat XML violations as
585 * fatal) or if the token handler threw
586 * @throws IOException
587 * if the stream threw
588 */
589 public void tokenize(InputSource is) throws SAXException, IOException {
590 if (is == null) {
591 throw new IllegalArgumentException("InputSource was null.");
592 }
593 swallowBom = true;
594 this.systemId = is.getSystemId();
595 this.publicId = is.getPublicId();
596 this.reader = is.getCharacterStream();
597 CharsetDecoder decoder = decoderFromExternalDeclaration(is.getEncoding());
598 if (this.reader == null) {
599 InputStream inputStream = is.getByteStream();
600 if (inputStream == null) {
601 throw new SAXException("Both streams in InputSource were null.");
602 }
603 if (decoder == null) {
604 this.reader = new HtmlInputStreamReader(inputStream,
605 errorHandler, this, this);
606 } else {
607 this.reader = new HtmlInputStreamReader(inputStream,
608 errorHandler, this, this, decoder);
609 }
610 }
611 contentModelFlag = ContentModelFlag.PCDATA;
612 escapeFlag = false;
613 inContent = true;
614 pos = -1;
615 cstart = -1;
616 line = linePrev = 0;
617 col = colPrev = 1;
618 nextCharOnNewLine = true;
619 prev = '\u0000';
620 bufLen = 0;
621 nonAsciiProhibited = false;
622 alreadyComplainedAboutNonAscii = false;
623 html4 = false;
624 alreadyWarnedAboutPrivateUseCharacters = false;
625 metaBoundaryPassed = false;
626 tokenHandler.start(this);
627 for (int i = 0; i < characterHandlers.length; i++) {
628 CharacterHandler ch = characterHandlers[i];
629 ch.start();
630 }
631 wantsComments = tokenHandler.wantsComments();
632 try {
633 if (swallowBom) {
634 // Swallow the BOM
635 char c = read();
636 if (c == '\uFEFF') {
637 line = linePrev = 0;
638 col = colPrev = 1;
639 nextCharOnNewLine = true;
640 } else {
641 unread(c);
642 }
643 }
644 dataState();
645 } finally {
646 systemIdentifier = null;
647 publicIdentifier = null;
648 doctypeName = null;
649 tagName = null;
650 attributeName = null;
651 tokenHandler.eof();
652 for (int i = 0; i < characterHandlers.length; i++) {
653 CharacterHandler ch = characterHandlers[i];
654 ch.end();
655 }
656 reader.close();
657 }
658 }
659
660 // For the token handler to call
661 /**
662 * Sets the content model flag and the associated element name.
663 *
664 * @param contentModelFlag
665 * the flag
666 * @param contentModelElement
667 * the element causing the flag to be set
668 */
669 public void setContentModelFlag(ContentModelFlag contentModelFlag,
670 String contentModelElement) {
671 this.contentModelFlag = contentModelFlag;
672 this.contentModelElement = contentModelElement;
673 }
674
675 // start Locator impl
676
677 /**
678 * @see org.xml.sax.Locator#getPublicId()
679 */
680 public String getPublicId() {
681 return publicId;
682 }
683
684 /**
685 * @see org.xml.sax.Locator#getSystemId()
686 */
687 public String getSystemId() {
688 return systemId;
689 }
690
691 /**
692 * @see org.xml.sax.Locator#getLineNumber()
693 */
694 public int getLineNumber() {
695 if (line > 0) {
696 return line;
697 } else {
698 return -1;
699 }
700 }
701
702 /**
703 * @see org.xml.sax.Locator#getColumnNumber()
704 */
705 public int getColumnNumber() {
706 if (col > 0) {
707 return col;
708 } else {
709 return -1;
710 }
711 }
712
713 // end Locator impl
714
715 // end public API
716
717 void notifyAboutMetaBoundary() {
718 metaBoundaryPassed = true;
719 }
720
721 void turnOnAdditionalHtml4Errors() {
722 html4 = true;
723 }
724
725 void dontSwallowBom() {
726 swallowBom = false;
727 }
728
729 void noEncodingDeclared() {
730 nonAsciiProhibited = true;
731 }
732
733 AttributesImpl newAttributes() {
734 if (mappingLangToXmlLang) {
735 return new XmlLangAttributesImpl();
736 } else {
737 return new AttributesImpl();
738 }
739 }
740
741 /**
742 * Clears the smaller buffer.
743 */
744 private void clearStrBuf() {
745 strBufLen = 0;
746 }
747
748 /**
749 * Appends to the smaller buffer.
750 *
751 * @param c
752 * the UTF-16 code unit to append
753 */
754 private void appendStrBuf(char c) {
755 if (strBufLen == strBuf.length) {
756 char[] newBuf = new char[strBuf.length + BUFFER_GROW_BY];
757 System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
758 strBuf = newBuf;
759 }
760 strBuf[strBufLen++] = c;
761 }
762
763 /**
764 * The smaller buffer as a string.
765 *
766 * @return the smaller buffer as a string
767 */
768 private String strBufToString() {
769 return new String(strBuf, 0, strBufLen);
770 }
771
772 /**
773 * Emits the smaller buffer as character tokens.
774 *
775 * @throws SAXException
776 * if the token handler threw
777 */
778 private void emitStrBuf() throws SAXException {
779 if (strBufLen > 0) {
780 tokenHandler.characters(strBuf, 0, strBufLen);
781 }
782 }
783
784 private boolean isNcname(String str) {
785 Matcher m = NCNAME_PATTERN.matcher(str);
786 return m.matches();
787 }
788
789 /**
790 * Clears the larger buffer.
791 */
792 private void clearLongStrBuf() {
793 longStrBufLen = 0;
794 longStrBufPending = '\u0000';
795 }
796
797 /**
798 * Appends to the larger buffer.
799 *
800 * @param c
801 * the UTF-16 code unit to append
802 */
803 private void appendLongStrBuf(char c) {
804 if (longStrBufLen == longStrBuf.length) {
805 char[] newBuf = new char[longStrBuf.length + BUFFER_GROW_BY];
806 System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
807 longStrBuf = newBuf;
808 }
809 longStrBuf[longStrBufLen++] = c;
810 }
811
812 /**
813 * Appends to the larger buffer when it is used to buffer a comment. Checks
814 * for two consecutive hyphens.
815 *
816 * @param c
817 * the UTF-16 code unit to append
818 * @throws SAXException
819 */
820 private void appendToComment(char c) throws SAXException {
821 if (longStrBufPending == '-' && c == '-') {
822 if (commentPolicy == XmlViolationPolicy.FATAL) {
823 fatal("This document is not mappable to XML 1.0 without data loss to \u201C--\u201D in a comment.");
824 } else {
825 warn("This document is not mappable to XML 1.0 without data loss to \u201C--\u201D in a comment.");
826 if (wantsComments) {
827 if (commentPolicy == XmlViolationPolicy.ALLOW) {
828 appendLongStrBuf('-');
829 } else {
830 appendLongStrBuf('-');
831 appendLongStrBuf(' ');
832 }
833 }
834 longStrBufPending = '-';
835 }
836 } else {
837 if (longStrBufPending != '\u0000') {
838 if (wantsComments) {
839 appendLongStrBuf(longStrBufPending);
840 }
841 longStrBufPending = '\u0000';
842 }
843 if (c == '-') {
844 longStrBufPending = '-';
845 } else {
846 if (wantsComments) {
847 appendLongStrBuf(c);
848 }
849 }
850 }
851 }
852
853 /**
854 * Appends to the larger buffer.
855 *
856 * @param arr
857 * the UTF-16 code units to append
858 */
859 private void appendLongStrBuf(char[] arr) {
860 for (int i = 0; i < arr.length; i++) {
861 appendLongStrBuf(arr[i]);
862 }
863 }
864
865 /**
866 * Append the contents of the smaller buffer to the larger one.
867 */
868 private void appendStrBufToLongStrBuf() {
869 for (int i = 0; i < strBufLen; i++) {
870 appendLongStrBuf(strBuf[i]);
871 }
872 }
873
874 /**
875 * The larger buffer as a string.
876 *
877 * @return the larger buffer as a string
878 */
879 private String longStrBufToString() {
880 if (longStrBufPending != '\u0000') {
881 appendLongStrBuf(longStrBufPending);
882 }
883 return new String(longStrBuf, 0, longStrBufLen);
884 }
885
886 /**
887 * Emits the current comment token.
888 *
889 * @throws SAXException
890 */
891 private void emitComment() throws SAXException {
892 if (wantsComments) {
893 if (longStrBufPending != '\u0000') {
894 appendLongStrBuf(longStrBufPending);
895 }
896 }
897 tokenHandler.comment(longStrBuf, longStrBufLen);
898 }
899
900 /**
901 * Unreads a code unit so that it is returned the next time
902 * <code>read()</code> is called.
903 *
904 * @param c
905 * the code unit to unread
906 */
907 private void unread(char c) {
908 unreadBuffer = c;
909 }
910
911 /**
912 * Reads the next UTF-16 code unit.
913 *
914 * @return the next code unit
915 * @throws SAXException
916 * @throws IOException
917 */
918 private char read() throws SAXException, IOException {
919 for (;;) { // the loop is here for the CRLF case
920 if (unreadBuffer != -1) {
921 char c = (char) unreadBuffer;
922 unreadBuffer = -1;
923 return c;
924 }
925 assert (bufLen > -1);
926 pos++;
927 assert pos <= bufLen;
928 linePrev = line;
929 colPrev = col;
930 if (nextCharOnNewLine) {
931 line++;
932 col = 1;
933 nextCharOnNewLine = false;
934 } else {
935 col++;
936 }
937 if (pos == bufLen) {
938 boolean charDataContinuation = false;
939 if (cstart > -1) {
940 flushChars();
941 charDataContinuation = true;
942 }
943 bufLen = reader.read(buf);
944 assert bufLen <= buf.length;
945 if (bufLen == -1) {
946 return '\u0000';
947 } else {
948 for (int i = 0; i < characterHandlers.length; i++) {
949 CharacterHandler ch = characterHandlers[i];
950 ch.characters(buf, 0, bufLen);
951 }
952 }
953 if (charDataContinuation) {
954 cstart = 0;
955 }
956 pos = 0;
957 }
958 char c = buf[pos];
959 if (c > '\u007F' && nonAsciiProhibited
960 && !alreadyComplainedAboutNonAscii) {
961 err("The character encoding of the document was not explicit but the document contains non-ASCII.");
962 }
963 switch (c) {
964 case '\n':
965 /*
966 * U+000D CARRIAGE RETURN (CR) characters, and U+000A LINE
967 * FEED (LF) characters, are treated specially. Any CR
968 * characters that are followed by LF characters must be
969 * removed, and any CR characters not followed by LF
970 * characters must be converted to LF characters.
971 */
972 if (prev == '\r') {
973 // swallow the LF
974 if (cstart != -1) {
975 flushChars();
976 cstart = pos + 1;
977 }
978 col = colPrev;
979 line = linePrev;
980 nextCharOnNewLine = true;
981 prev = c;
982 continue;
983 } else {
984 nextCharOnNewLine = true;
985 }
986 break;
987 case '\r':
988 c = buf[pos] = '\n';
989 nextCharOnNewLine = true;
990 prev = '\r';
991 if (contentModelFlag != ContentModelFlag.PCDATA) {
992 prevFourPtr++;
993 prevFourPtr %= 4;
994 prevFour[prevFourPtr] = c;
995 }
996 return c;
997 case '\u0000':
998 /*
999 * All U+0000 NULL characters in the input must be replaced
1000 * by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
1001 * characters is a parse error.
1002 */
1003 err("Found U+0000 in the character stream.");
1004 c = buf[pos] = '\uFFFD';
1005 break;
1006 case '\u000B':
1007 case '\u000C':
1008 if (inContent) {
1009 if (contentNonXmlCharPolicy == XmlViolationPolicy.FATAL) {
1010 fatal("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character.");
1011 } else {
1012 if (contentNonXmlCharPolicy == XmlViolationPolicy.ALTER_INFOSET) {
1013 c = buf[pos] = ' ';
1014 }
1015 warn("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character.");
1016 }
1017 }
1018 break;
1019 default:
1020 if ((c & 0xFC00) == 0xDC00) {
1021 // Got a low surrogate. See if prev was high surrogate
1022 if ((prev & 0xFC00) == 0xD800) {
1023 int intVal = (prev << 10) + c + SURROGATE_OFFSET;
1024 if (isNonCharacter(intVal)) {
1025 warn("Astral non-character.");
1026 }
1027 if (isAstralPrivateUse(intVal)) {
1028 warnAboutPrivateUseChar();
1029 }
1030 } else {
1031 // XXX figure out what to do about lone high
1032 // surrogates
1033 err("Found low surrogate without high surrogate.");
1034 c = buf[pos] = '\uFFFD';
1035 }
1036 } else if (inContent && (c < ' ' || isNonCharacter(c))
1037 && (c != '\t')) {
1038 if (contentNonXmlCharPolicy == XmlViolationPolicy.FATAL) {
1039 fatal("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character.");
1040 } else {
1041 if (contentNonXmlCharPolicy == XmlViolationPolicy.ALTER_INFOSET) {
1042 c = buf[pos] = '\uFFFD';
1043 }
1044 warn("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character.");
1045 }
1046 } else if (isPrivateUse(c)) {
1047 warnAboutPrivateUseChar();
1048 }
1049 }
1050 prev = c;
1051 if (contentModelFlag != ContentModelFlag.PCDATA) {
1052 prevFourPtr++;
1053 prevFourPtr %= 4;
1054 prevFour[prevFourPtr] = c;
1055 }
1056 return c;
1057 }
1058 }
1059
1060 /**
1061 * Emits a warning about private use characters if the warning has not been
1062 * emitted yet.
1063 *
1064 * @throws SAXException
1065 */
1066 private void warnAboutPrivateUseChar() throws SAXException {
1067 if (!alreadyWarnedAboutPrivateUseCharacters) {
1068 warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. (Charmod C073)");
1069 alreadyWarnedAboutPrivateUseCharacters = true;
1070 }
1071 }
1072
1073 /**
1074 * Tells if the argument is a BMP PUA character.
1075 *
1076 * @param c
1077 * the UTF-16 code unit to check
1078 * @return <code>true</code> if PUA character
1079 */
1080 private boolean isPrivateUse(char c) {
1081 return c >= '\uE000' && c <= '\uF8FF';
1082 }
1083
1084 /**
1085 * Tells if the argument is an astral PUA character.
1086 *
1087 * @param c
1088 * the code point to check
1089 * @return <code>true</code> if astral private use
1090 */
1091 private boolean isAstralPrivateUse(int c) {
1092 return (c >= 0xF0000 && c <= 0xFFFFD)
1093 || (c >= 0x100000 && c <= 0x10FFFD);
1094 }
1095
1096 /**
1097 * Tells if the argument is a non-character (works for BMP and astral).
1098 *
1099 * @param c
1100 * the code point to check
1101 * @return <code>true</code> if non-character
1102 */
1103 private boolean isNonCharacter(int c) {
1104 return (c & 0xFFFE) == 0xFFFE;
1105 }
1106
1107 /**
1108 * Flushes coalesced character tokens.
1109 *
1110 * @throws SAXException
1111 */
1112 private void flushChars() throws SAXException, IOException {
1113 if (cstart != -1) {
1114 if (pos > cstart) {
1115 int currLine = line;
1116 int currCol = col;
1117 line = linePrev;
1118 col = colPrev;
1119 try {
1120 tokenHandler.characters(buf, cstart, pos - cstart);
1121 } finally {
1122 line = currLine;
1123 col = currCol;
1124 }
1125 }
1126 }
1127 cstart = -1;
1128 }
1129
1130 /**
1131 * Reports an condition that would make the infoset incompatible with XML
1132 * 1.0 as fatal.
1133 *
1134 * @param message
1135 * the message
1136 * @throws SAXException
1137 * @throws SAXParseException
1138 */
1139 private void fatal(String message) throws SAXException {
1140 SAXParseException spe = new SAXParseException(message, this);
1141 if (errorHandler != null) {
1142 errorHandler.fatalError(spe);
1143 }
1144 throw spe;
1145 }
1146
1147 /**
1148 * Reports a Parse Error.
1149 *
1150 * @param message
1151 * the message
1152 * @throws SAXException
1153 */
1154 private void err(String message) throws SAXException {
1155 if (errorHandler == null) {
1156 return;
1157 }
1158 SAXParseException spe = new SAXParseException(message, this);
1159 errorHandler.error(spe);
1160 }
1161
1162 /**
1163 * Reports a warning
1164 *
1165 * @param message
1166 * the message
1167 * @throws SAXException
1168 */
1169 private void warn(String message) throws SAXException {
1170 if (errorHandler == null) {
1171 return;
1172 }
1173 SAXParseException spe = new SAXParseException(message, this);
1174 errorHandler.warning(spe);
1175 }
1176
1177 /**
1178 * Initializes a decoder from external decl.
1179 */
1180 private CharsetDecoder decoderFromExternalDeclaration(String encoding)
1181 throws SAXException {
1182 if (encoding == null) {
1183 return null;
1184 }
1185 encoding = encoding.toUpperCase();
1186 if ("ISO-8859-1".equals(encoding)) {
1187 encoding = "Windows-1252";
1188 }
1189 if ("UTF-16".equals(encoding) || "UTF-32".equals(encoding)) {
1190 swallowBom = false;
1191 }
1192 try {
1193 Charset cs = Charset.forName(encoding);
1194 String canonName = cs.name();
1195 if (canonName.startsWith("X-") || canonName.startsWith("x-")
1196 || canonName.startsWith("Mac")) {
1197 if (encoding.startsWith("X-")) {
1198 err("The encoding \u201C"
1199 + encoding
1200 + "\u201D is not an IANA-registered encoding. (Charmod C022)");
1201 } else {
1202 err("The encoding \u201C"
1203 + encoding
1204 + "\u201D is not an IANA-registered encoding and did\u2019t start with \u201CX-\u201D. (Charmod C023)");
1205 }
1206 } else if (!canonName.equalsIgnoreCase(encoding)) {
1207 err("The encoding \u201C"
1208 + encoding
1209 + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
1210 + canonName + "\u201D. (Charmod C024)");
1211 }
1212 if (EncodingInfo.isObscure(canonName)) {
1213 warn("The character encoding \u201C"
1214 + encoding
1215 + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
1216 }
1217 return cs.newDecoder();
1218 } catch (IllegalCharsetNameException e) {
1219 err("Illegal character encoding name: \u201C" + encoding
1220 + "\u201D. Will sniff.");
1221 } catch (UnsupportedCharsetException e) {
1222 err("Unsupported character encoding name: \u201C" + encoding
1223 + "\u201D. Will sniff.");
1224 swallowBom = true;
1225 }
1226 return null; // keep the compiler happy
1227 }
1228
1229 private boolean currentIsVoid() {
1230 return Arrays.binarySearch(VOID_ELEMENTS, tagName) > -1;
1231 }
1232
1233 /**
1234 * Data state
1235 *
1236 * @throws IOException
1237 * @throws SAXException
1238 *
1239 */
1240 private void dataState() throws SAXException, IOException {
1241 char c = '\u0000';
1242 for (;;) {
1243 c = read();
1244 if (c == '&'
1245 && (contentModelFlag == ContentModelFlag.PCDATA || (contentModelFlag == ContentModelFlag.RCDATA)
1246 && !escapeFlag)) {
1247 /*
1248 * U+0026 AMPERSAND (&) When the content model flag is set to
1249 * one of the PCDATA or RCDATA states: switch to the entity data
1250 * state. Otherwise: treat it as per the "anything else" entry
1251 * below.
1252 */
1253 flushChars();
1254 entityDataState();
1255 continue;
1256 } else if (c == '<'
1257 && ((contentModelFlag == ContentModelFlag.PCDATA) || (escapeFlag == false && (contentModelFlag == ContentModelFlag.CDATA || contentModelFlag == ContentModelFlag.RCDATA)))) {
1258 /*
1259 * U+003C LESS-THAN SIGN (<) When the content model flag is set
1260 * to the PCDATA state: switch to the tag open state. When the
1261 * content model flag is set to either the RCDATA state or the
1262 * CDATA state and the escape flag is false: switch to the tag
1263 * open state. Otherwise: treat it as per the "anything else"
1264 * entry below.
1265 */
1266 flushChars();
1267 resetAttributes();
1268 inContent = false;
1269 tagOpenState();
1270 inContent = true;
1271 continue;
1272 } else if (c == '\u0000') {
1273 /*
1274 * EOF Emit an end-of-file token.
1275 */
1276 flushChars();
1277 return; // eof() called in parent finally block
1278 } else {
1279 if (c == '-'
1280 && (escapeFlag == false)
1281 && (contentModelFlag == ContentModelFlag.RCDATA || contentModelFlag == ContentModelFlag.CDATA)
1282 && lastLtExclHyph()) {
1283 /*
1284 * U+002D HYPHEN-MINUS (-) If the content model flag is set
1285 * to either the RCDATA state or the CDATA state, and the
1286 * escape flag is false, and there are at least three
1287 * characters before this one in the input stream, and the
1288 * last four characters in the input stream, including this
1289 * one, are U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK,
1290 * U+002D HYPHEN-MINUS, and U+002D HYPHEN-MINUS ("<!--"),
1291 * then set the escape flag to true.
1292 *
1293 * In any case, emit the input character as a character
1294 * token. Stay in the data state.
1295 */
1296 escapeFlag = true;
1297 } else if (c == '>' && escapeFlag && lastHyphHyph()) {
1298 /*
1299 * U+003E GREATER-THAN SIGN (>) If the content model flag is
1300 * set to either the RCDATA state or the CDATA state, and
1301 * the escape flag is true, and the last three characters in
1302 * the input stream including this one are U+002D
1303 * HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN
1304 * SIGN ("-->"), set the escape flag to false.
1305 *
1306 * In any case, emit the input character as a character
1307 * token. Stay in the data state.
1308 */
1309 escapeFlag = false;
1310 }
1311 /*
1312 * Anything else Emit the input character as a character token.
1313 */
1314 if (cstart == -1) {
1315 // start coalescing character tokens
1316 cstart = pos;
1317 }
1318 /*
1319 * Stay in the data state.
1320 */
1321 continue;
1322 }
1323 }
1324 }
1325
1326 private boolean lastHyphHyph() {
1327 return prevFour[(prevFourPtr - 1 + 4) % 4] == '-'
1328 && prevFour[(prevFourPtr - 2 + 4) % 4] == '-';
1329 }
1330
1331 private boolean lastLtExclHyph() {
1332 return prevFour[(prevFourPtr - 1 + 4) % 4] == '-'
1333 && prevFour[(prevFourPtr - 2 + 4) % 4] == '!'
1334 && prevFour[(prevFourPtr - 3 + 4) % 4] == '<';
1335 }
1336
1337 /**
1338 *
1339 * Entity data state
1340 *
1341 * @throws IOException
1342 * @throws SAXException
1343 */
1344 private void entityDataState() throws SAXException, IOException {
1345 /*
1346 * (This cannot happen if the content model flag is set to the CDATA
1347 * state.)
1348 *
1349 * Attempt to consume an entity.
1350 */
1351 consumeEntity(false);
1352 /*
1353 * If nothing is returned, emit a U+0026 AMPERSAND character token.
1354 *
1355 * Otherwise, emit the character token that was returned.
1356 */
1357 // Handled by consumeEntity()
1358 /*
1359 * Finally, switch to the data state.
1360 */
1361 return;
1362 }
1363
1364 /**
1365 * Tag open state
1366 *
1367 * @throws IOException
1368 * @throws SAXException
1369 */
1370 private void tagOpenState() throws SAXException, IOException {
1371 /*
1372 * The behaviour of this state depends on the content model flag.
1373 */
1374 // this can't happen in PLAINTEXT, so using not PCDATA as the condition
1375 if (contentModelFlag != ContentModelFlag.PCDATA) {
1376 /*
1377 * If the content model flag is set to the RCDATA or CDATA states
1378 * Consume the next input character.
1379 */
1380 char c = read();
1381 if (c == '/') {
1382 /*
1383 * If it is a U+002F SOLIDUS (/) character, switch to the close
1384 * tag open state.
1385 */
1386 closeTagOpenState();
1387 return;
1388 } else {
1389 /*
1390 * Otherwise, emit a U+003C LESS-THAN SIGN character token
1391 */
1392 tokenHandler.characters(LT_GT, 0, 1);
1393 /*
1394 * and reconsume the current input character in the data state.
1395 */
1396 unread(c);
1397 return;
1398 }
1399 } else {
1400 /*
1401 * If the content model flag is set to the PCDATA state Consume the
1402 * next input character:
1403 */
1404 char c = read();
1405 if (c == '!') {
1406 /*
1407 * U+0021 EXCLAMATION MARK (!) Switch to the markup declaration
1408 * open state.
1409 */
1410 markupDeclarationOpenState();
1411 return;
1412 } else if (c == '/') {
1413 /* U+002F SOLIDUS (/) Switch to the close tag open state. */
1414 closeTagOpenState();
1415 return;
1416 } else if (c >= 'A' && c <= 'Z') {
1417 /*
1418 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL
1419 * LETTER Z Create a new start tag token,
1420 */
1421 endTag = false;
1422 /*
1423 * set its tag name to the lowercase version of the input
1424 * character (add 0x0020 to the character's code point),
1425 */
1426 clearStrBuf();
1427 appendStrBuf((char) (c + 0x20));
1428 /* then switch to the tag name state. */
1429 tagNameState();
1430 /*
1431 * (Don't emit the token yet; further details will be filled in
1432 * before it is emitted.)
1433 */
1434 return;
1435 } else if (c >= 'a' && c <= 'z') {
1436 /*
1437 * U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL
1438 * LETTER Z Create a new start tag token,
1439 */
1440 endTag = false;
1441 /*
1442 * set its tag name to the input character,
1443 */
1444 clearStrBuf();
1445 appendStrBuf(c);
1446 /* then switch to the tag name state. */
1447 tagNameState();
1448 /*
1449 * (Don't emit the token yet; further details will be filled in
1450 * before it is emitted.)
1451 */
1452 return;
1453 } else if (c == '>') {
1454 /*
1455 * U+003E GREATER-THAN SIGN (>) Parse error.
1456 */
1457 err("Bad character \u201C>\u201D in the tag open state.");
1458 /*
1459 * Emit a U+003C LESS-THAN SIGN character token and a U+003E
1460 * GREATER-THAN SIGN character token.
1461 */
1462 tokenHandler.characters(LT_GT, 0, 2);
1463 /* Switch to the data state. */
1464 return;
1465 } else if (c == '?') {
1466 /*
1467 * U+003F QUESTION MARK (?) Parse error.
1468 */
1469 err("Bad character \u201C?\u201D in the tag open state.");
1470 /*
1471 * Switch to the bogus comment state.
1472 */
1473 clearLongStrBuf();
1474 appendLongStrBuf(c);
1475 bogusCommentState();
1476 return;
1477 } else {
1478 /*
1479 * Anything else Parse error.
1480 */
1481 err("Bad character \u201C" + c
1482 + "\u201D in the tag open state.");
1483 /*
1484 * Emit a U+003C LESS-THAN SIGN character token
1485 */
1486 tokenHandler.characters(LT_GT, 0, 1);
1487 /*
1488 * and reconsume the current input character in the data state.
1489 */
1490 unread(c);
1491 return;
1492 }
1493 }
1494 }
1495
1496 /**
1497 * Close tag open state
1498 *
1499 * @throws IOException
1500 * @throws SAXException
1501 */
1502 private void closeTagOpenState() throws SAXException, IOException {
1503 // this can't happen in PLAINTEXT, so using not PCDATA as the condition
1504 if (contentModelFlag != ContentModelFlag.PCDATA
1505 && contentModelElement != null) {
1506 /*
1507 * If the content model flag is set to the RCDATA or CDATA states
1508 * but no start tag token has ever been emitted by this instance of
1509 * the tokeniser (fragment case), or, if the content model flag is
1510 * set to the RCDATA or CDATA states and the next few characters do
1511 * not match the tag name of the last start tag token emitted (case
1512 * insensitively), or if they do but they are not immediately
1513 * followed by one of the following characters: + U+0009 CHARACTER
1514 * TABULATION + U+000A LINE FEED (LF) + U+000B LINE TABULATION +
1515 * U+000C FORM FEED (FF) + U+0020 SPACE + U+003E GREATER-THAN SIGN
1516 * (>) + U+002F SOLIDUS (/) + EOF
1517 *
1518 * ...then emit a U+003C LESS-THAN SIGN character token, a U+002F
1519 * SOLIDUS character token, and switch to the data state to process
1520 * the next input character.
1521 */
1522 // Let's implement the above without lookahead. strBuf holds
1523 // characters that need to be emitted if looking for an end tag
1524 // fails.
1525 // Duplicating the relevant part of tag name state here as well.
1526 clearStrBuf();
1527 for (int i = 0; i < contentModelElement.length(); i++) {
1528 char e = contentModelElement.charAt(i);
1529 char c = read();
1530 char folded = c;
1531 if (c >= 'A' && c <= 'Z') {
1532 folded += 0x20;
1533 }
1534 if (folded != e) {
1535 if (i > 0 || (folded >= 'a' && folded <= 'z')) {
1536 if (html4) {
1537 if (!"iframe".equals(contentModelElement)) {
1538 err((contentModelFlag == ContentModelFlag.CDATA ? "CDATA"
1539 : "RCDATA")
1540 + " element \u201C"
1541 + contentModelElement
1542 + "\u201D contained the string \u201C</\u201D, but it was not the start of the end tag. (HTML4-only error)");
1543 }
1544 } else {
1545 warn((contentModelFlag == ContentModelFlag.CDATA ? "CDATA"
1546 : "RCDATA")
1547 + " element \u201C"
1548 + contentModelElement
1549 + "\u201D contained the string \u201C</\u201D, but this did not close the element.");
1550 }
1551 }
1552 tokenHandler.characters(LT_SOLIDUS, 0, 2);
1553 emitStrBuf();
1554 unread(c);
1555 return;
1556 }
1557 appendStrBuf(c);
1558 }
1559 endTag = true;
1560 tagName = contentModelElement;
1561 char c = read();
1562 switch (c) {
1563 case ' ':
1564 case '\t':
1565 case '\n':
1566 case '\u000B':
1567 case '\u000C':
1568 /*
1569 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
1570 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch
1571 * to the before attribute name state.
1572 */
1573 beforeAttributeNameState();
1574 return;
1575 case '>':
1576 /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
1577 emitCurrentTagToken();
1578 /*
1579 * Switch to the data state.
1580 */
1581 return;
1582 case '\u0000':
1583 /*
1584 * EOF Parse error.
1585 */
1586 err("Expected \u201C>\u201D but saw end of file instead.");
1587 /*
1588 * Emit the current tag token.
1589 */
1590 emitCurrentTagToken();
1591 /* Reconsume the character in the data state. */
1592 unread(c);
1593 return;
1594 case '/':
1595 /*
1596 * U+002F SOLIDUS (/) Parse error unless this is a permitted
1597 * slash.
1598 */
1599 // never permitted here
1600 err("Stray \u201C/\u201D in end tag.");
1601 /* Switch to the before attribute name state. */
1602 beforeAttributeNameState();
1603 return;
1604 default:
1605 if (html4) {
1606 err((contentModelFlag == ContentModelFlag.CDATA ? "CDATA"
1607 : "RCDATA")
1608 + " element \u201C"
1609 + contentModelElement
1610 + "\u201D contained the string \u201C</\u201D, but it was not the start of the end tag. (HTML4-only error)");
1611 } else {
1612 warn((contentModelFlag == ContentModelFlag.CDATA ? "CDATA"
1613 : "RCDATA")
1614 + " element \u201C"
1615 + contentModelElement
1616 + "\u201D contained the string \u201C</\u201D, but this did not close the element.");
1617 }
1618 tokenHandler.characters(LT_SOLIDUS, 0, 2);
1619 emitStrBuf();
1620 cstart = pos; // don't drop the character
1621 return;
1622 }
1623 } else {
1624 /*
1625 * Otherwise, if the content model flag is set to the PCDATA state,
1626 * or if the next few characters do match that tag name, consume the
1627 * next input character:
1628 */
1629 char c = read();
1630 if (c >= 'A' && c <= 'Z') {
1631 /*
1632 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL
1633 * LETTER Z Create a new end tag token,
1634 */
1635 endTag = true;
1636 clearStrBuf();
1637 /*
1638 * set its tag name to the lowercase version of the input
1639 * character (add 0x0020 to the character's code point),
1640 */
1641 appendStrBuf((char) (c + 0x20));
1642 /*
1643 * then switch to the tag name state. (Don't emit the token yet;
1644 * further details will be filled in before it is emitted.)
1645 */
1646 tagNameState();
1647 return;
1648 } else if (c >= 'a' && c <= 'z') {
1649 /*
1650 * U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL
1651 * LETTER Z Create a new end tag token,
1652 */
1653 endTag = true;
1654 clearStrBuf();
1655 /*
1656 * set its tag name to the input character,
1657 */
1658 appendStrBuf(c);
1659 /*
1660 * then switch to the tag name state. (Don't emit the token yet;
1661 * further details will be filled in before it is emitted.)
1662 */
1663 tagNameState();
1664 return;
1665 } else if (c == '>') {
1666 /* U+003E GREATER-THAN SIGN (>) Parse error. */
1667 err("Saw \u201C</>\u201D.");
1668 /*
1669 * Switch to the data state.
1670 */
1671 return;
1672 } else if (c == '\u0000') {
1673 /* EOF Parse error. */
1674 err("Saw \u201C</\u201D immediately before end of file.");
1675 /*
1676 * Emit a U+003C LESS-THAN SIGN character token and a U+002F
1677 * SOLIDUS character token.
1678 */
1679 tokenHandler.characters(LT_SOLIDUS, 0, 2);
1680 /*
1681 * Reconsume the EOF character in the data state.
1682 */
1683 unread(c);
1684 return;
1685 } else {
1686 /* Anything else Parse error. */
1687 err("Garbage after \u201C</\u201D.");
1688 /*
1689 * Switch to the bogus comment state.
1690 */
1691 clearLongStrBuf();
1692 appendToComment(c);
1693 bogusCommentState();
1694 return;
1695 }
1696 }
1697 }
1698
1699 /**
1700 * Tag name state
1701 *
1702 * @throws IOException
1703 * @throws SAXException
1704 */
1705 private void tagNameState() throws SAXException, IOException {
1706 for (;;) {
1707 /*
1708 * Consume the next input character:
1709 */
1710 char c = read();
1711 switch (c) {
1712 case ' ':
1713 case '\t':
1714 case '\n':
1715 case '\u000B':
1716 case '\u000C':
1717 /*
1718 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
1719 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch
1720 * to the before attribute name state.
1721 */
1722 tagName = strBufToElementNameString();
1723 beforeAttributeNameState();
1724 return;
1725 case '>':
1726 /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
1727 tagName = strBufToElementNameString();
1728 emitCurrentTagToken();
1729 /*
1730 * Switch to the data state.
1731 */
1732 return;
1733 case '\u0000':
1734 /*
1735 * EOF Parse error.
1736 */
1737 err("End of file seen when looking for tag name");
1738 /*
1739 * Emit the current tag token.
1740 */
1741 tagName = strBufToElementNameString();
1742 emitCurrentTagToken();
1743 /*
1744 * Reconsume the EOF character in the data state.
1745 */
1746 unread(c);
1747 return;
1748 case '/':
1749 /*
1750 * U+002F SOLIDUS (/) Parse error unless this is a permitted
1751 * slash.
1752 */
1753 tagName = strBufToElementNameString();
1754 parseErrorUnlessPermittedSlash();
1755 /*
1756 * Switch to the before attribute name state.
1757 */
1758 beforeAttributeNameState();
1759 return;
1760 default:
1761 if (c >= 'A' && c <= 'Z') {
1762 /*
1763 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN
1764 * CAPITAL LETTER Z Append the lowercase version of the
1765 * current input character (add 0x0020 to the
1766 * character's code point) to the current tag token's
1767 * tag name.
1768 */
1769 appendStrBuf((char) (c + 0x20));
1770 } else {
1771 /*
1772 * Anything else Append the current input character to
1773 * the current tag token's tag name.
1774 */
1775 appendStrBuf(c);
1776 }
1777 /*
1778 * Stay in the tag name state.
1779 */
1780 continue;
1781 }
1782 }
1783 }
1784
1785 private String strBufToElementNameString() {
1786 // TODO Generate a better interning function
1787 return strBufToString().intern();
1788 }
1789
1790 /**
1791 * This method implements a wrapper loop for the attribute-related states to
1792 * avoid recursion to an arbitrary depth.
1793 *
1794 * @throws IOException
1795 * @throws SAXException
1796 */
1797 private void beforeAttributeNameState() throws SAXException, IOException {
1798 while (beforeAttributeNameStateImpl()) {
1799 // Spin.
1800 }
1801 }
1802
1803 /**
1804 *
1805 */
1806 private void resetAttributes() {
1807 attributes = null; // XXX figure out reuse
1808 }
1809
1810 /**
1811 * Before attribute name state
1812 *
1813 * @throws IOException
1814 * @throws SAXException
1815 */
1816 private boolean beforeAttributeNameStateImpl() throws SAXException,
1817 IOException {
1818 /*
1819 * Consume the next input character:
1820 */
1821 for (;;) {
1822 char c = read();
1823 switch (c) {
1824 case ' ':
1825 case '\t':
1826 case '\n':
1827 case '\u000B':
1828 case '\u000C':
1829 /*
1830 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
1831 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
1832 * in the before attribute name state.
1833 */
1834 continue;
1835 case '>':
1836 /*
1837 * U+003E GREATER-THAN SIGN (>) Emit the current tag token.
1838 */
1839 emitCurrentTagToken();
1840 /*
1841 * Switch to the data state.
1842 */
1843 return false;
1844 case '/':
1845 /*
1846 * U+002F SOLIDUS (/) Parse error unless this is a permitted
1847 * slash.
1848 */
1849 parseErrorUnlessPermittedSlash();
1850 /*
1851 * Stay in the before attribute name state.
1852 */
1853 continue;
1854 case '\u0000':
1855 /* EOF Parse error. */
1856 err("Saw end of file without the previous tag ending with \u201C>\u201C.");
1857 /*
1858 * Emit the current tag token.
1859 */
1860 emitCurrentTagToken();
1861 /*
1862 * Reconsume the EOF character in the data state.
1863 */
1864 unread(c);
1865 return false;
1866 default:
1867 /*
1868 * Anything else Start a new attribute in the current tag
1869 * token.
1870 */
1871 clearStrBuf();
1872
1873 if (c >= 'A' && c <= 'Z') {
1874 /*
1875 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN
1876 * CAPITAL LETTER Z Set that attribute's name to the
1877 * lowercase version of the current input character (add
1878 * 0x0020 to the character's code point)
1879 */
1880 appendStrBuf((char) (c + 0x20));
1881 } else {
1882 /*
1883 * Set that attribute's name to the current input
1884 * character,
1885 */
1886 appendStrBuf(c);
1887 }
1888 /*
1889 * and its value to the empty string.
1890 */
1891 // Will do later.
1892 /*
1893 * Switch to the attribute name state.
1894 */
1895 return attributeNameState();
1896 }
1897 }
1898 }
1899
1900 private void parseErrorUnlessPermittedSlash() throws SAXException,
1901 IOException {
1902 /*
1903 * A permitted slash is a U+002F SOLIDUS character that is immediately
1904 * followed by a U+003E GREATER-THAN SIGN, if, and only if, the current
1905 * token being processed is a start tag token whose tag name is one of
1906 * the following: base, link, meta, hr, br, img, embed, param, area,
1907 * col, input
1908 */
1909 if (endTag) {
1910 err("Stray \u201C/\u201D in an end tag.");
1911 return;
1912 }
1913 char c = read();
1914 int saveLine = line;
1915 int saveCol = col;
1916 line = linePrev;
1917 col = colPrev;
1918 if (c == '>') {
1919 if (!currentIsVoid() && !html4) {
1920 if (html4) {
1921 err("Stray \u201C/\u201D in tag. The \u201C/>\u201D syntax is not permitted in HTML4.");
1922 } else {
1923 err("Stray \u201C/\u201D in tag. The \u201C/>\u201D syntax is only permitted on void elements.");
1924 }
1925 } else if (html4) {
1926 err("Stray \u201C/\u201D in tag. The \u201C/>\u201D syntax is not permitted in HTML4. (HTML4-only error)");
1927 }
1928 } else {
1929 err("Stray \u201C/\u201D in tag.");
1930 }
1931 line = saveLine;
1932 col = saveCol;
1933 unread(c);
1934 }
1935
1936 private void emitCurrentTagToken() throws SAXException {
1937 if (namePolicy != XmlViolationPolicy.ALLOW) {
1938 if (!isNcname(tagName)) {
1939 if (namePolicy == XmlViolationPolicy.FATAL) {
1940 fatal((endTag ? "End" : "Start") + " tag \u201C" + tagName
1941 + "\u201D has a non-NCName name.");
1942 } else {
1943 warn((endTag ? "End" : "Start") + " tag \u201C" + tagName
1944 + "\u201D has a non-NCName name. Ignoring token.");
1945 return;
1946 }
1947 }
1948 }
1949 Attributes attrs = (attributes == null ? EmptyAttributes.EMPTY_ATTRIBUTES
1950 : attributes);
1951 if (endTag) {
1952 /*
1953 * When an end tag token is emitted, the content model flag must be
1954 * switched to the PCDATA state.
1955 */
1956 escapeFlag = false;
1957 contentModelFlag = ContentModelFlag.PCDATA;
1958 if (attrs.getLength() != 0) {
1959 /*
1960 * When an end tag token is emitted with attributes, that is a
1961 * parse error.
1962 */
1963 err("End tag had attributes.");
1964 }
1965 tokenHandler.endTag(tagName, attrs);
1966 } else {
1967 tokenHandler.startTag(tagName, attrs);
1968 }
1969 }
1970
1971 /**
1972 * Attribute name state
1973 *
1974 * @throws IOException
1975 * @throws SAXException
1976 */
1977 private boolean attributeNameState() throws SAXException, IOException {
1978 for (;;) {
1979 /*
1980 * Consume the next input character:
1981 */
1982 char c = read();
1983 switch (c) {
1984 case ' ':
1985 case '\t':
1986 case '\n':
1987 case '\u000B':
1988 case '\u000C':
1989 /*
1990 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
1991 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch
1992 * to the after attribute name state.
1993 */
1994 attributeNameComplete();
1995 return afterAttributeNameState();
1996 case '=':
1997 /*
1998 * U+003D EQUALS SIGN (=) Switch to the before attribute
1999 * value state.
2000 */
2001 attributeNameComplete();
2002 return beforeAttributeValueState();
2003 case '>':
2004 /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
2005 attributeNameComplete();
2006 addAttributeWithoutValue();
2007 emitCurrentTagToken();
2008 /*
2009 * Switch to the data state.
2010 */
2011 return false;
2012 case '/':
2013 /*
2014 * U+002F SOLIDUS (/) Parse error unless this is a permitted
2015 * slash.
2016 */
2017 parseErrorUnlessPermittedSlash();
2018 /* Switch to the before attribute name state. */
2019 attributeNameComplete();
2020 addAttributeWithoutValue();
2021 return true;
2022 case '\u0000':
2023 /*
2024 * EOF Parse error.
2025 */
2026 err("End of file occurred in an attribute name.");
2027 /*
2028 * Emit the current tag token.
2029 */
2030 attributeNameComplete();
2031 addAttributeWithoutValue();
2032 emitCurrentTagToken();
2033 /* Reconsume the EOF character in the data state. */
2034 unread(c);
2035 return false;
2036 default:
2037 if (c >= 'A' && c <= 'Z') {
2038 /*
2039 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN
2040 * CAPITAL LETTER Z Append the lowercase version of the
2041 * current input character (add 0x0020 to the
2042 * character's code point) to the current attribute's
2043 * name.
2044 */
2045 appendStrBuf((char) (c + 0x20));
2046 } else {
2047 /*
2048 * Anything else Append the current input character to
2049 * the current attribute's name.
2050 */
2051 appendStrBuf(c);
2052 }
2053 }
2054 /*
2055 * Stay in the attribute name state.
2056 */
2057 continue;
2058 }
2059 }
2060
2061 private void attributeNameComplete() throws SAXException {
2062 attributeName = strBufToString();
2063 if (attributes == null) {
2064 attributes = newAttributes();
2065 }
2066 /*
2067 * When the user agent leaves the attribute name state (and before
2068 * emitting the tag token, if appropriate), the complete attribute's
2069 * name must be compared to the other attributes on the same token; if
2070 * there is already an attribute on the token with the exact same name,
2071 * then this is a parse error and the new attribute must be dropped,
2072 * along with the value that gets associated with it (if any).
2073 */
2074 if (attributes.getIndex(attributeName) == -1) {
2075 if (namePolicy == XmlViolationPolicy.ALLOW) {
2076 shouldAddAttributes = true;
2077 } else {
2078 if (isNcname(attributeName)) {
2079 shouldAddAttributes = true;
2080 } else {
2081 if (namePolicy == XmlViolationPolicy.FATAL) {
2082 fatal("Attribute name \u201C" + attributeName
2083 + "\u201D is not an NCName.");
2084 } else {
2085 shouldAddAttributes = false;
2086 warn("Attribute name \u201C"
2087 + attributeName
2088 + "\u201D is not an NCName. Ignoring the attribute.");
2089 }
2090 }
2091 }
2092 } else {
2093 shouldAddAttributes = false;
2094 err("Duplicate attribute \u201C" + attributeName + "\u201D.");
2095 }
2096 }
2097
2098 private void addAttributeWithoutValue() throws SAXException {
2099 if (metaBoundaryPassed && "charset".equals(attributeName)
2100 && "meta".equals(tagName)) {
2101 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
2102 }
2103 if (shouldAddAttributes) {
2104 if (html4) {
2105 if (AttributeInfo.isBoolean(attributeName)) {
2106 if (html4ModeCompatibleWithXhtml1Schemata) {
2107 attributes.addAttribute(attributeName, attributeName);
2108 } else {
2109 attributes.addAttribute(attributeName, "");
2110 }
2111 } else {
2112 err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
2113 attributes.addAttribute(attributeName, "");
2114 }
2115 } else {
2116 if ("src".equals(attributeName) || "href".equals(attributeName)) {
2117 warn("Attribute \u201C"
2118 + attributeName
2119 + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
2120 }
2121 attributes.addAttribute(attributeName, "");
2122 }
2123 }
2124 }
2125
2126 private void addAttributeWithValue() throws SAXException {
2127 if (metaBoundaryPassed && "meta" == tagName
2128 && "charset".equals(attributeName)) {
2129 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
2130 }
2131 if (shouldAddAttributes) {
2132 String value = longStrBufToString();
2133 if (!endTag) {
2134 if ("xmlns".equals(attributeName)) {
2135 if ("html" == tagName
2136 && "http://www.w3.org/1999/xhtml".equals(value)) {
2137 if (xmlnsPolicy == XmlViolationPolicy.ALTER_INFOSET) {
2138 return;
2139 }
2140 } else {
2141 if (bogusXmlnsPolicy == XmlViolationPolicy.FATAL) {
2142 fatal("Forbidden attribute \u201C"
2143 + attributeName
2144 + "\u201D is not mappable to namespace-aware XML 1.0.");
2145 } else {
2146 warn("Forbidden attribute \u201C"
2147 + attributeName
2148 + "\u201D is not mappable to namespace-aware XML 1.0.");
2149 if (bogusXmlnsPolicy == XmlViolationPolicy.ALTER_INFOSET) {
2150 return;
2151 }
2152 }
2153 }
2154 } else if (attributeName.startsWith("xmlns:")) {
2155 if (bogusXmlnsPolicy == XmlViolationPolicy.FATAL) {
2156 fatal("Forbidden attribute \u201C"
2157 + attributeName
2158 + "\u201D is not mappable to namespace-aware XML 1.0.");
2159 } else {
2160 warn("Forbidden attribute \u201C"
2161 + attributeName
2162 + "\u201D is not mappable to namespace-aware XML 1.0.");
2163 if (bogusXmlnsPolicy == XmlViolationPolicy.ALTER_INFOSET) {
2164 return;
2165 }
2166 }
2167 } else if (html4 && html4ModeCompatibleWithXhtml1Schemata && AttributeInfo.isCaseFolded(attributeName)) {
2168 value = toAsciiLowerCase(value);
2169 }
2170 }
2171 attributes.addAttribute(attributeName, value);
2172 }
2173 }
2174
2175 private String toAsciiLowerCase(String str) {
2176 if (str == null) {
2177 return null;
2178 }
2179 char[] b = new char[str.length()];
2180 for (int i = 0; i < str.length(); i++) {
2181 char c = str.charAt(i);
2182 if (c >= 'A' && c <= 'Z') {
2183 c += 0x20;
2184 }
2185 b[i] = c;
2186 }
2187 return new String(b);
2188 }
2189
2190 /**
2191 * After attribute name state
2192 *
2193 * @throws IOException
2194 * @throws SAXException
2195 */
2196 private boolean afterAttributeNameState() throws SAXException, IOException {
2197 for (;;) {
2198 /*
2199 * Consume the next input character:
2200 */
2201 char c = read();
2202 switch (c) {
2203 case ' ':
2204 case '\t':
2205 case '\n':
2206 case '\u000B':
2207 case '\u000C':
2208 /*
2209 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
2210 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
2211 * in the after attribute name state.
2212 */
2213 continue;
2214 case '=':
2215 /*
2216 * U+003D EQUALS SIGN (=) Switch to the before attribute
2217 * value state.
2218 */
2219 return beforeAttributeValueState();
2220 case '>':
2221 /*
2222 * U+003E GREATER-THAN SIGN (>) Emit the current tag token.
2223 */
2224 addAttributeWithoutValue();
2225 emitCurrentTagToken();
2226 /*
2227 * Switch to the data state.
2228 */
2229 return false;
2230 case '/':
2231 /*
2232 * U+002F SOLIDUS (/) Parse error unless this is a permitted
2233 * slash.
2234 */
2235 addAttributeWithoutValue();
2236 parseErrorUnlessPermittedSlash();
2237 /* Switch to the before attribute name state. */
2238 return true;
2239 case '\u0000':
2240 /* EOF Parse error. */
2241 err("Saw end of file without the previous tag ending with \u201C>\u201C.");
2242 /*
2243 * Emit the current tag token.
2244 */
2245 addAttributeWithoutValue();
2246 emitCurrentTagToken();
2247 /*
2248 * Reconsume the character in the data state.
2249 */
2250 unread(c);
2251 return false;
2252 default:
2253 /*
2254 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN
2255 * CAPITAL LETTER Z Start a new attribute in the current tag
2256 * token. Set that attribute's name to the lowercase version
2257 * of the current input character (add 0x0020 to the
2258 * character's code point), and its value to the empty
2259 * string. Switch to the attribute name state.
2260 *
2261 * Anything else Start a new attribute in the current tag
2262 * token. Set that attribute's name to the current input
2263 * character, and its value to the empty string. Switch to
2264 * the attribute name state.
2265 */
2266 // let's do this by respinning through the attribute loop
2267 addAttributeWithoutValue();
2268 unread(c);
2269 return true;
2270 }
2271 }
2272 }
2273
2274 /**
2275 * Before attribute value state
2276 *
2277 * @throws IOException
2278 * @throws SAXException
2279 */
2280 private boolean beforeAttributeValueState() throws SAXException,
2281 IOException {
2282 clearLongStrBuf();
2283 for (;;) {
2284 /*
2285 * Consume the next input character:
2286 */
2287 char c = read();
2288 switch (c) {
2289 case ' ':
2290 case '\t':
2291 case '\n':
2292 case '\u000B':
2293 case '\u000C':
2294 /*
2295 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
2296 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
2297 * in the before attribute value state.
2298 */
2299 continue;
2300 case '"':
2301 /*
2302 * U+0022 QUOTATION MARK (") Switch to the attribute value
2303 * (double-quoted) state.
2304 */
2305 return attributeValueDoubleQuotedState();
2306 case '&':
2307 /*
2308 * U+0026 AMPERSAND (&) Switch to the attribute value
2309 * (unquoted) state and reconsume this input character.
2310 */
2311 unread(c);
2312 return attributeValueUnquotedState();
2313 case '\'':
2314 /*
2315 * U+0027 APOSTROPHE (') Switch to the attribute value
2316 * (single-quoted) state.
2317 */
2318 return attributeValueSingleQuotedState();
2319 case '>':
2320 /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
2321 addAttributeWithoutValue();
2322 emitCurrentTagToken();
2323 /*
2324 * Switch to the data state.
2325 */
2326 return false;
2327 case '\u0000':
2328 /* EOF Parse error. */
2329 err("Saw end of file without the previous tag ending with \u201C>\u201C.");
2330 /*
2331 * Emit the current tag token.
2332 */
2333 addAttributeWithoutValue();
2334 emitCurrentTagToken();
2335 /*
2336 * Reconsume the character in the data state.
2337 */
2338 unread(c);
2339 return false;
2340 default:
2341 if (html4
2342 && !((c >= 'a' && c <= 'z')
2343 || (c >= 'A' && c <= 'Z')
2344 || (c >= '0' && c <= '9') || c == '.'
2345 || c == '-' || c == '_' || c == ':')) {
2346 err("Non-name character in an unquoted attribute value. (This is an HTML4-only error.)");
2347 }
2348 /*
2349 * Anything else Append the current input character to the
2350 * current attribute's value.
2351 */
2352 appendLongStrBuf(c);
2353 /*
2354 * Switch to the attribute value (unquoted) state.
2355 */
2356 return attributeValueUnquotedState();
2357 }
2358 }
2359 }
2360
2361 /**
2362 * Attribute value (double-quoted) state
2363 *
2364 * @throws IOException
2365 * @throws SAXException
2366 */
2367 private boolean attributeValueDoubleQuotedState() throws SAXException,
2368 IOException {
2369 inContent = true;
2370 for (;;) {
2371 /*
2372 * Consume the next input character:
2373 */
2374 char c = read();
2375 switch (c) {
2376 case '"':
2377 /*
2378 * U+0022 QUOTATION MARK (") Switch to the before attribute
2379 * name state.
2380 */
2381 addAttributeWithValue();
2382 inContent = false;
2383 return true;
2384 case '&':
2385 /*
2386 * U+0026 AMPERSAND (&) Switch to the entity in attribute
2387 * value state.
2388 */
2389 entityInAttributeValueState();
2390 continue;
2391 case '\u0000':
2392 /* EOF Parse error. */
2393 err("End of file reached when inside a quoted attribute value.");
2394 /* Emit the current tag token. */
2395 addAttributeWithValue();
2396 emitCurrentTagToken();
2397 /*
2398 * Reconsume the character in the data state.
2399 */
2400 unread(c);
2401 inContent = false;
2402 return false;
2403 default:
2404 /*
2405 * Anything else Append the current input character to the
2406 * current attribute's value.
2407 */
2408 appendLongStrBuf(c);
2409 /*
2410 * Stay in the attribute value (double-quoted) state.
2411 */
2412 continue;
2413 }
2414 }
2415 }
2416
2417 /**
2418 * Attribute value (single-quoted) state
2419 *
2420 * @throws SAXException
2421 * @throws IOException
2422 */
2423 private boolean attributeValueSingleQuotedState() throws SAXException,
2424 IOException {
2425 inContent = true;
2426 for (;;) {
2427 /*
2428 * Consume the next input character:
2429 */
2430 char c = read();
2431 switch (c) {
2432 case '\'':
2433 /*
2434 * U+0027 APOSTROPHE (') Switch to the before attribute name
2435 * state.
2436 */
2437 addAttributeWithValue();
2438 inContent = false;
2439 return true;
2440 case '&':
2441 /*
2442 * U+0026 AMPERSAND (&) Switch to the entity in attribute
2443 * value state.
2444 */
2445 entityInAttributeValueState();
2446 continue;
2447 case '\u0000':
2448 /* EOF Parse error. */
2449 err("End of file reached when inside a quoted attribute value.");
2450 /* Emit the current tag token. */
2451 addAttributeWithValue();
2452 emitCurrentTagToken();
2453 /*
2454 * Reconsume the character in the data state.
2455 */
2456 unread(c);
2457 inContent = false;
2458 return false;
2459 default:
2460 /*
2461 * Anything else Append the current input character to the
2462 * current attribute's value.
2463 */
2464 appendLongStrBuf(c);
2465 /*
2466 * Stay in the attribute value (double-quoted) state.
2467 */
2468 continue;
2469 }
2470 }
2471 }
2472
2473 /**
2474 * Attribute value (unquoted) state
2475 *
2476 * @throws IOException
2477 * @throws SAXException
2478 */
2479 private boolean attributeValueUnquotedState() throws SAXException,
2480 IOException {
2481 inContent = true;
2482 for (;;) {
2483 /*
2484 * Consume the next input character:
2485 */
2486 char c = read();
2487 switch (c) {
2488 case ' ':
2489 case '\t':
2490 case '\n':
2491 case '\u000B':
2492 case '\u000C':
2493 /*
2494 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
2495 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch
2496 * to the before attribute name state.
2497 */
2498 addAttributeWithValue();
2499 inContent = false;
2500 return true;
2501 case '&':
2502 /*
2503 * U+0026 AMPERSAND (&) Switch to the entity in attribute
2504 * value state.
2505 */
2506 entityInAttributeValueState();
2507 continue;
2508 case '>':
2509 /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
2510 addAttributeWithValue();
2511 emitCurrentTagToken();
2512 /*
2513 * Switch to the data state.
2514 */
2515 inContent = false;
2516 return false;
2517 case '\u0000':
2518 /* EOF Parse error. */
2519 err("Saw end of file without the previous tag ending with \u201C>\u201C.");
2520 /*
2521 * Emit the current tag token.
2522 */
2523 addAttributeWithValue();
2524 emitCurrentTagToken();
2525 /*
2526 * Reconsume the character in the data state.
2527 */
2528 unread(c);
2529 inContent = false;
2530 return false;
2531 case '<':
2532 warn("\u201C<\u201D in an unquoted attribute value. This does not end the tag.");
2533 // fall through
2534 default:
2535 if (html4
2536 && !((c >= 'a' && c <= 'z')
2537 || (c >= 'A' && c <= 'Z')
2538 || (c >= '0' && c <= '9') || c == '.'
2539 || c == '-' || c == '_' || c == ':')) {
2540 err("Non-name character in an unquoted attribute value. (This is an HTML4-only error.)");
2541 }
2542 /*
2543 * Anything else Append the current input character to the
2544 * current attribute's value.
2545 */
2546 appendLongStrBuf(c);
2547 /*
2548 * Stay in the attribute value (unquoted) state.
2549 */
2550 continue;
2551 }
2552 }
2553 }
2554
2555 /**
2556 * Entity in attribute value state
2557 *
2558 * @throws IOException
2559 * @throws SAXException
2560 */
2561 private void entityInAttributeValueState() throws SAXException, IOException {
2562 /*
2563 * Attempt to consume an entity.
2564 */
2565 consumeEntity(true);
2566 /*
2567 * If nothing is returned, append a U+0026 AMPERSAND character to the
2568 * current attribute's value.
2569 *
2570 * Otherwise, append the returned character token to the current
2571 * attribute's value.
2572 */
2573 // handled in consumeEntity();
2574 /*
2575 * Finally, switch back to the attribute value state that you were in
2576 * when were switched into this state.
2577 */
2578 return;
2579 }
2580
2581 /**
2582 * Bogus comment state
2583 *
2584 * @throws IOException
2585 * @throws SAXException
2586 */
2587 private void bogusCommentState() throws SAXException, IOException {
2588 /*
2589 * (This can only happen if the content model flag is set to the PCDATA
2590 * state.)
2591 *
2592 * Consume every character up to the first U+003E GREATER-THAN SIGN
2593 * character (>) or the end of the file (EOF), whichever comes first.
2594 * Emit a comment token whose data is the concatenation of all the
2595 * characters starting from and including the character that caused the
2596 * state machine to switch into the bogus comment state, up to and
2597 * including the last consumed character before the U+003E character, if
2598 * any, or up to the end of the file otherwise. (If the comment was
2599 * started by the end of the file (EOF), the token is empty.)
2600 *
2601 * Switch to the data state.
2602 *
2603 * If the end of the file was reached, reconsume the EOF character.
2604 */
2605 for (;;) {
2606 char c = read();
2607 switch (c) {
2608 case '>':
2609 emitComment();
2610 return;
2611 case '\u0000':
2612 emitComment();
2613 unread(c);
2614 return;
2615 default:
2616 appendToComment(c);
2617 }
2618 }
2619 }
2620
2621 /**
2622 * Markup declaration open state
2623 *
2624 * @throws IOException
2625 * @throws SAXException
2626 */
2627 private void markupDeclarationOpenState() throws SAXException, IOException {
2628 /*
2629 * (This can only happen if the content model flag is set to the PCDATA
2630 * state.)
2631 */
2632 clearLongStrBuf();
2633 /*
2634 * If the next two characters are both U+002D HYPHEN-MINUS (-)
2635 * characters, consume those two characters, create a comment token
2636 * whose data is the empty string, and switch to the comment start
2637 * state.
2638 *
2639 * Otherwise if the next seven characters are a case-insensitive match
2640 * for the word "DOCTYPE", then consume those characters and switch to
2641 * the DOCTYPE state.
2642 *
2643 * Otherwise, is is a parse error. Switch to the bogus comment state.
2644 * The next character that is consumed, if any, is the first character
2645 * that will be in the comment.
2646 */
2647 char c = read();
2648 switch (c) {
2649 case '-':
2650 c = read();
2651 if (c == '-') {
2652 commentStates();
2653 return;
2654 } else {
2655 err("Bogus comment.");
2656 appendToComment('-');
2657 unread(c);
2658 bogusCommentState();
2659 return;
2660 }
2661 case 'd':
2662 case 'D':
2663 appendToComment(c);
2664 for (int i = 0; i < OCTYPE.length; i++) {
2665 c = read();
2666 char folded = c;
2667 if (c >= 'A' && c <= 'Z') {
2668 folded += 0x20;
2669 }
2670 if (folded == OCTYPE[i]) {
2671 appendToComment(c);
2672 } else {
2673 err("Bogus comment.");
2674 unread(c);
2675 bogusCommentState();
2676 return;
2677 }
2678 }
2679 doctypeState();
2680 return;
2681 default:
2682 err("Bogus comment.");
2683 unread(c);
2684 bogusCommentState();
2685 return;
2686 }
2687 }
2688
2689 private enum CommentState {
2690 COMMENT_START_STATE, COMMENT_START_DASH_STATE, COMMENT_STATE, COMMENT_END_DASH_STATE, COMMENT_END_STATE
2691 }
2692
2693 /**
2694 * Comment start state, Comment start dash state, Comment state, Comment end
2695 * dash state and Comment end state
2696 *
2697 * @throws IOException
2698 * @throws SAXException
2699 */
2700 private void commentStates() throws SAXException, IOException {
2701 CommentState state = CommentState.COMMENT_START_STATE;
2702 for (;;) {
2703 char c = read();
2704 switch (state) {
2705 case COMMENT_START_STATE:
2706 /*
2707 * Comment start state
2708 *
2709 *
2710 * Consume the next input character:
2711 */
2712 switch (c) {
2713 case '-':
2714 /*
2715 * U+002D HYPHEN-MINUS (-) Switch to the comment
2716 * start dash state.
2717 */
2718 state = CommentState.COMMENT_START_DASH_STATE;
2719 continue;
2720 case '>':
2721 /*
2722 * U+003E GREATER-THAN SIGN (>) Parse error.
2723 */
2724 err("Premature end of comment.");
2725 /* Emit the comment token. */
2726 emitComment();
2727 /*
2728 * Switch to the data state.
2729 */
2730 return;
2731 case '\u0000':
2732 /*
2733 * EOF Parse error.
2734 */
2735 err("End of file inside comment.");
2736 /* Emit the comment token. */
2737 emitComment();
2738 /*
2739 * Reconsume the EOF character in the data state.
2740 */
2741 unread(c);
2742 return;
2743 default:
2744 /*
2745 * Anything else Append the input character to the
2746 * comment token's data.
2747 */
2748 appendToComment(c);
2749 /*
2750 * Switch to the comment state.
2751 */
2752 state = CommentState.COMMENT_STATE;
2753 continue;
2754 }
2755 case COMMENT_START_DASH_STATE:
2756 /*
2757 * Comment start dash state
2758 *
2759 * Consume the next input character:
2760 */
2761 switch (c) {
2762 case '-':
2763 /*
2764 * U+002D HYPHEN-MINUS (-) Switch to the comment end
2765 * state
2766 */
2767 state = CommentState.COMMENT_END_STATE;
2768 continue;
2769 case '>':
2770 /*
2771 * U+003E GREATER-THAN SIGN (>) Parse error.
2772 */
2773 err("Premature end of comment.");
2774 /* Emit the comment token. */
2775 emitComment();
2776 /*
2777 * Switch to the data state.
2778 */
2779 return;
2780 case '\u0000':
2781 /*
2782 * EOF Parse error.
2783 */
2784 err("End of file inside comment.");
2785 /* Emit the comment token. */
2786 emitComment();
2787 /*
2788 * Reconsume the EOF character in the data state.
2789 */
2790 unread(c);
2791 return;
2792 default:
2793 /*
2794 * Anything else Append a U+002D HYPHEN-MINUS (-)
2795 * character and the input character to the comment
2796 * token's data.
2797 */
2798 appendToComment('-');
2799 appendToComment(c);
2800 /*
2801 * Switch to the comment state.
2802 */
2803 state = CommentState.COMMENT_STATE;
2804 continue;
2805 }
2806 case COMMENT_STATE:
2807 /*
2808 * Comment state Consume the next input character:
2809 */
2810 switch (c) {
2811 case '-':
2812 /*
2813 * U+002D HYPHEN-MINUS (-) Switch to the comment end
2814 * dash state
2815 */
2816 state = CommentState.COMMENT_END_DASH_STATE;
2817 continue;
2818 case '\u0000':
2819 /*
2820 * EOF Parse error.
2821 */
2822 err("End of file inside comment.");
2823 /* Emit the comment token. */
2824 emitComment();
2825 /*
2826 * Reconsume the EOF character in the data state.
2827 */
2828 unread(c);
2829 return;
2830 default:
2831 /*
2832 * Anything else Append the input character to the
2833 * comment token's data.
2834 */
2835 appendToComment(c);
2836 /*
2837 * Stay in the comment state.
2838 */
2839 continue;
2840 }
2841 case COMMENT_END_DASH_STATE:
2842 /*
2843 * Comment end dash state Consume the next input character:
2844 */
2845 switch (c) {
2846 case '-':
2847 /*
2848 * U+002D HYPHEN-MINUS (-) Switch to the comment end
2849 * state
2850 */
2851 state = CommentState.COMMENT_END_STATE;
2852 continue;
2853 case '\u0000':
2854 /*
2855 * EOF Parse error.
2856 */
2857 err("End of file inside comment.");
2858 /* Emit the comment token. */
2859 emitComment();
2860 /*
2861 * Reconsume the EOF character in the data state.
2862 */
2863 unread(c);
2864 return;
2865 default:
2866 /*
2867 * Anything else Append a U+002D HYPHEN-MINUS (-)
2868 * character and the input character to the comment
2869 * token's data.
2870 */
2871 appendToComment('-');
2872 appendToComment(c);
2873 /*
2874 * Switch to the comment state.
2875 */
2876 state = CommentState.COMMENT_STATE;
2877 continue;
2878 }
2879 case COMMENT_END_STATE:
2880 /*
2881 * Comment end dash state Consume the next input character:
2882 */
2883 switch (c) {
2884 case '>':
2885 /*
2886 * U+003E GREATER-THAN SIGN (>) Emit the comment
2887 * token.
2888 */
2889 emitComment();
2890 /*
2891 * Switch to the data state.
2892 */
2893 return;
2894 case '-':
2895 /* U+002D HYPHEN-MINUS (-) Parse error. */
2896 err("Consecutive hyphens did not terminate a comment.");
2897 /*
2898 * Append a U+002D HYPHEN-MINUS (-) character to the
2899 * comment token's data.
2900 */
2901 appendToComment('-');
2902 /*
2903 * Stay in the comment end state.
2904 */
2905 continue;
2906 case '\u0000':
2907 /*
2908 * EOF Parse error.
2909 */
2910 err("End of file inside comment.");
2911 /* Emit the comment token. */
2912 emitComment();
2913 /*
2914 * Reconsume the EOF character in the data state.
2915 */
2916 unread(c);
2917 return;
2918 default:
2919 /*
2920 * Anything else Parse error.
2921 */
2922 err("Consecutive hyphens did not terminate a comment.");
2923 /*
2924 * Append two U+002D HYPHEN-MINUS (-) characters and
2925 * the input character to the comment token's data.
2926 */
2927 appendToComment('-');
2928 appendToComment('-');
2929 appendToComment(c);
2930 /*
2931 * Switch to the comment state.
2932 */
2933 state = CommentState.COMMENT_STATE;
2934 continue;
2935 }
2936 }
2937 }
2938 }
2939
2940 /**
2941 * DOCTYPE state
2942 *
2943 * @throws IOException
2944 * @throws SAXException
2945 */
2946 private void doctypeState() throws SAXException, IOException {
2947 systemIdentifier = null;
2948 publicIdentifier = null;
2949 doctypeName = null;
2950 /*
2951 * Consume the next input character:
2952 */
2953 char c = read();
2954 switch (c) {
2955 case ' ':
2956 case '\t':
2957 case '\n':
2958 case '\u000B':
2959 case '\u000C':
2960 /*
2961 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B LINE
2962 * TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch to the
2963 * before DOCTYPE name state.
2964 */
2965 beforeDoctypeNameState();
2966 return;
2967 default:
2968 /*
2969 * Anything else Parse error.
2970 */
2971 err("Missing space before doctype name.");
2972 /*
2973 * Reconsume the current character in the before DOCTYPE name
2974 * state.
2975 */
2976 unread(c);
2977 beforeDoctypeNameState();
2978 return;
2979 }
2980 }
2981
2982 /**
2983 * Before DOCTYPE name state
2984 *
2985 * @throws IOException
2986 * @throws SAXException
2987 */
2988 private void beforeDoctypeNameState() throws SAXException, IOException {
2989 for (;;) {
2990 /*
2991 * Consume the next input character:
2992 */
2993 char c = read();
2994 switch (c) {
2995 case ' ':
2996 case '\t':
2997 case '\n':
2998 case '\u000B':
2999 case '\u000C':
3000 /*
3001 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3002 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
3003 * in the before DOCTYPE name state.
3004 */
3005 continue;
3006 case '>':
3007 /*
3008 * U+003E GREATER-THAN SIGN (>) Parse error.
3009 */
3010 err("Nameless doctype.");
3011 /*
3012 * Create a new DOCTYPE token. Set its correctness flag to
3013 * incorrect. Emit the token.
3014 */
3015 tokenHandler.doctype("", null, null, false);
3016 /*
3017 * Switch to the data state.
3018 */
3019 return;
3020 case '\u0000':
3021 /* EOF Parse error. */
3022 err("End of file inside doctype.");
3023 /*
3024 * Create a new DOCTYPE token. Set its correctness flag to
3025 * incorrect. Emit the token.
3026 */
3027 tokenHandler.doctype("", null, null, false);
3028 /*
3029 * Reconsume the EOF character in the data state.
3030 */
3031 unread(c);
3032 return;
3033 default:
3034 /* Anything else Create a new DOCTYPE token. */
3035 clearStrBuf();
3036 /*
3037 * Set the token's name name to the current input character.
3038 */
3039 appendStrBuf(c);
3040 /*
3041 * Switch to the DOCTYPE name state.
3042 */
3043 doctypeNameState();
3044 return;
3045 }
3046 }
3047 }
3048
3049 /**
3050 * DOCTYPE name state
3051 *
3052 * @throws IOException
3053 * @throws SAXException
3054 */
3055 private void doctypeNameState() throws SAXException, IOException {
3056 for (;;) {
3057 /*
3058 * First, consume the next input character:
3059 */
3060 char c = read();
3061 switch (c) {
3062 case ' ':
3063 case '\t':
3064 case '\n':
3065 case '\u000B':
3066 case '\u000C':
3067 /*
3068 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3069 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch
3070 * to the after DOCTYPE name state.
3071 */
3072 doctypeName = strBufToString();
3073 afterDoctypeNameState();
3074 return;
3075 case '>':
3076 /*
3077 * U+003E GREATER-THAN SIGN (>) Emit the current DOCTYPE
3078 * token.
3079 */
3080 tokenHandler.doctype(strBufToString(), null, null, true);
3081 /*
3082 * Switch to the data state.
3083 */
3084 return;
3085 case '\u0000':
3086 /* EOF Parse error. */
3087 err("End of file inside doctype.");
3088 /*
3089 * Set the DOCTYPE token's correctness flag to incorrect.
3090 * Emit that DOCTYPE token.
3091 */
3092 tokenHandler.doctype(strBufToString(), null, null, false);
3093 /*
3094 * Reconsume the EOF character in the data state.
3095 */
3096 unread(c);
3097 return;
3098 default:
3099 /*
3100 * Anything else Append the current input character to the
3101 * current DOCTYPE token's name.
3102 */
3103 appendStrBuf(c);
3104 /*
3105 * Stay in the DOCTYPE name state.
3106 */
3107 continue;
3108 }
3109 }
3110 }
3111
3112 /**
3113 * After DOCTYPE name state
3114 *
3115 * @throws IOException
3116 * @throws SAXException
3117 */
3118 private void afterDoctypeNameState() throws SAXException, IOException {
3119 for (;;) {
3120 /*
3121 * Consume the next input character:
3122 */
3123 char c = read();
3124 switch (c) {
3125 case ' ':
3126 case '\t':
3127 case '\n':
3128 case '\u000B':
3129 case '\u000C':
3130 /*
3131 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3132 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
3133 * in the after DOCTYPE name state.
3134 */
3135 continue;
3136 case '>':
3137 /*
3138 * U+003E GREATER-THAN SIGN (>) Emit the current DOCTYPE
3139 * token.
3140 */
3141 tokenHandler.doctype(doctypeName, null, null, true);
3142 /*
3143 * Switch to the data state.
3144 */
3145 return;
3146 case '\u0000':
3147 /* EOF Parse error. */
3148 err("End of file inside doctype.");
3149 /*
3150 * Set the DOCTYPE token's correctness flag to incorrect.
3151 * Emit that DOCTYPE token.
3152 */
3153 tokenHandler.doctype(doctypeName, null, null, false);
3154 /*
3155 * Reconsume the EOF character in the data state.
3156 */
3157 unread(c);
3158 return;
3159 case 'p':
3160 case 'P':
3161 /*
3162 * If the next six characters are a case-insensitive match
3163 * for the word "PUBLIC", then consume those characters and
3164 * switch to the before DOCTYPE public identifier state.
3165 */
3166 for (int i = 0; i < UBLIC.length; i++) {
3167 c = read();
3168 char folded = c;
3169 if (c >= 'A' && c <= 'Z') {
3170 folded += 0x20;
3171 }
3172 if (folded != UBLIC[i]) {
3173 err("Bogus doctype.");
3174 unread(c);
3175 bogusDoctypeState();
3176 return;
3177 }
3178 }
3179 beforeDoctypePublicIdentifierState();
3180 return;
3181 case 's':
3182 case 'S':
3183 /*
3184 * Otherwise, if the next six characters are a
3185 * case-insensitive match for the word "SYSTEM", then
3186 * consume those characters and switch to the before DOCTYPE
3187 * system identifier state.
3188 */
3189 for (int i = 0; i < YSTEM.length; i++) {
3190 c = read();
3191 char folded = c;
3192 if (c >= 'A' && c <= 'Z') {
3193 folded += 0x20;
3194 }
3195 if (folded != YSTEM[i]) {
3196 err("Bogus doctype.");
3197 unread(c);
3198 bogusDoctypeState();
3199 return;
3200 }
3201 }
3202 beforeDoctypeSystemIdentifierState();
3203 return;
3204 default:
3205 /*
3206 * Otherwise, this is the parse error.
3207 */
3208 err("Bogus doctype.");
3209 /*
3210 * Switch to the bogus DOCTYPE state.
3211 */
3212 bogusDoctypeState();
3213 return;
3214 }
3215 }
3216 }
3217
3218 /**
3219 * Before DOCTYPE public identifier state
3220 *
3221 * @throws IOException
3222 * @throws SAXException
3223 */
3224 private void beforeDoctypePublicIdentifierState() throws SAXException,
3225 IOException {
3226 for (;;) {
3227 /*
3228 * Consume the next input character:
3229 */
3230 char c = read();
3231 switch (c) {
3232 case ' ':
3233 case '\t':
3234 case '\n':
3235 case '\u000B':
3236 case '\u000C':
3237 /*
3238 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3239 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
3240 * in the before DOCTYPE public identifier state.
3241 */
3242 continue;
3243 case '"':
3244 /*
3245 * U+0022 QUOTATION MARK (") Set the DOCTYPE token's public
3246 * identifier to the empty string,
3247 */
3248 clearLongStrBuf();
3249 /*
3250 * then switch to the DOCTYPE public identifier
3251 * (double-quoted) state.
3252 */
3253 doctypePublicIdentifierDoubleQuotedState();
3254 return;
3255 case '\'':
3256 /*
3257 * U+0027 APOSTROPHE (') Set the DOCTYPE token's public
3258 * identifier to the empty string,
3259 */
3260 clearLongStrBuf();
3261 /*
3262 * then switch to the DOCTYPE public identifier
3263 * (single-quoted) state.
3264 */
3265 doctypePublicIdentifierSingleQuotedState();
3266 return;
3267 case '>':
3268 /* U+003E GREATER-THAN SIGN (>) Parse error. */
3269 err("Expected a public identifier but the doctype ended.");
3270 /*
3271 * Set the DOCTYPE token's correctness flag to incorrect.
3272 * Emit that DOCTYPE token.
3273 */
3274 tokenHandler.doctype(doctypeName, null, null, false);
3275 /*
3276 * Switch to the data state.
3277 */
3278 return;
3279 case '\u0000':
3280 /* EOF Parse error. */
3281 err("End of file inside a doctype.");
3282 /*
3283 * Set the DOCTYPE token's correctness flag to incorrect.
3284 * Emit that DOCTYPE token.
3285 */
3286 tokenHandler.doctype(doctypeName, null, null, false);
3287 /*
3288 * Reconsume the EOF character in the data state.
3289 */
3290 unread(c);
3291 return;
3292 default:
3293 /* Anything else Parse error. */
3294 err("Bogus doctype.");
3295 /*
3296 * Switch to the bogus DOCTYPE state.
3297 */
3298 bogusDoctypeState();
3299 return;
3300 }
3301 }
3302 }
3303
3304 /**
3305 * DOCTYPE public identifier (double-quoted) state
3306 *
3307 * @throws IOException
3308 * @throws SAXException
3309 */
3310 private void doctypePublicIdentifierDoubleQuotedState()
3311 throws SAXException, IOException {
3312 for (;;) {
3313 /*
3314 * Consume the next input character:
3315 */
3316 char c = read();
3317 switch (c) {
3318 case '"':
3319 /*
3320 * U+0022 QUOTATION MARK (") Switch to the after DOCTYPE
3321 * public identifier state.
3322 */
3323 publicIdentifier = longStrBufToString();
3324 afterDoctypePublicIdentifierState();
3325 return;
3326 case '\u0000':
3327 /* EOF Parse error. */
3328 err("End of file inside public identifier.");
3329 /*
3330 * Set the DOCTYPE token's correctness flag to incorrect.
3331 * Emit that DOCTYPE token.
3332 */
3333 tokenHandler.doctype(doctypeName, longStrBufToString(),
3334 null, false);
3335 /*
3336 * Reconsume the EOF character in the data state.
3337 */
3338 unread(c);
3339 return;
3340 default:
3341 /*
3342 * Anything else Append the current input character to the
3343 * current DOCTYPE token's public identifier.
3344 */
3345 appendLongStrBuf(c);
3346 /*
3347 * Stay in the DOCTYPE public identifier (double-quoted)
3348 * state.
3349 */
3350 continue;
3351 }
3352 }
3353 }
3354
3355 /**
3356 * DOCTYPE public identifier (single-quoted) state
3357 *
3358 * @throws IOException
3359 * @throws SAXException
3360 */
3361 private void doctypePublicIdentifierSingleQuotedState()
3362 throws SAXException, IOException {
3363 for (;;) {
3364 /*
3365 * Consume the next input character:
3366 */
3367 char c = read();
3368 switch (c) {
3369 case '\'':
3370 /*
3371 * U+0027 APOSTROPHE (') Switch to the after DOCTYPE public
3372 * identifier state.
3373 */
3374 publicIdentifier = longStrBufToString();
3375 afterDoctypePublicIdentifierState();
3376 return;
3377 case '\u0000':
3378 /* EOF Parse error. */
3379 err("End of file inside public identifier.");
3380 /*
3381 * Set the DOCTYPE token's correctness flag to incorrect.
3382 * Emit that DOCTYPE token.
3383 */
3384 tokenHandler.doctype(doctypeName, longStrBufToString(),
3385 null, false);
3386 /*
3387 * Reconsume the EOF character in the data state.
3388 */
3389 unread(c);
3390 return;
3391 default:
3392 /*
3393 * Anything else Append the current input character to the
3394 * current DOCTYPE token's public identifier.
3395 */
3396 appendLongStrBuf(c);
3397 /*
3398 * Stay in the DOCTYPE public identifier (single-quoted)
3399 * state.
3400 */
3401 continue;
3402 }
3403 }
3404 }
3405
3406 /**
3407 * After DOCTYPE public identifier state
3408 *
3409 * @throws IOException
3410 * @throws SAXException
3411 *
3412 */
3413 private void afterDoctypePublicIdentifierState() throws SAXException,
3414 IOException {
3415 for (;;) {
3416 /*
3417 * Consume the next input character:
3418 */
3419 char c = read();
3420 switch (c) {
3421 case ' ':
3422 case '\t':
3423 case '\n':
3424 case '\u000B':
3425 case '\u000C':
3426 /*
3427 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3428 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
3429 * in the after DOCTYPE public identifier state.
3430 */
3431 continue;
3432 case '"':
3433 /*
3434 * U+0022 QUOTATION MARK (") Set the DOCTYPE token's system
3435 * identifier to the empty string,
3436 */
3437 clearLongStrBuf();
3438 /*
3439 * then switch to the DOCTYPE system identifier
3440 * (double-quoted) state.
3441 */
3442 doctypeSystemIdentifierDoubleQuotedState();
3443 return;
3444 case '\'':
3445 /*
3446 * U+0027 APOSTROPHE (') Set the DOCTYPE token's system
3447 * identifier to the empty string,
3448 */
3449 clearLongStrBuf();
3450 /*
3451 * then switch to the DOCTYPE system identifier
3452 * (single-quoted) state.
3453 */
3454 doctypeSystemIdentifierSingleQuotedState();
3455 return;
3456 case '>':
3457 /*
3458 * U+003E GREATER-THAN SIGN (>) Emit the current DOCTYPE
3459 * token.
3460 */
3461 tokenHandler.doctype(doctypeName, publicIdentifier, null,
3462 true);
3463 /*
3464 * Switch to the data state.
3465 */
3466 return;
3467 case '\u0000':
3468 /* EOF Parse error. */
3469 err("End of file inside doctype.");
3470 /*
3471 * Set the DOCTYPE token's correctness flag to incorrect.
3472 * Emit that DOCTYPE token.
3473 */
3474 tokenHandler.doctype(doctypeName, publicIdentifier, null,
3475 false);
3476 /*
3477 * Reconsume the EOF character in the data state.
3478 */
3479 unread(c);
3480 return;
3481 default:
3482 /* Anything else Parse error. */
3483 err("Bogus doctype.");
3484 /*
3485 * Switch to the bogus DOCTYPE state.
3486 */
3487 bogusDoctypeState();
3488 return;
3489 }
3490 }
3491 }
3492
3493 /**
3494 * Before DOCTYPE system identifier state
3495 *
3496 * @throws IOException
3497 * @throws SAXException
3498 */
3499 private void beforeDoctypeSystemIdentifierState() throws SAXException,
3500 IOException {
3501 for (;;) {
3502 /*
3503 * Consume the next input character:
3504 */
3505 char c = read();
3506 switch (c) {
3507 case ' ':
3508 case '\t':
3509 case '\n':
3510 case '\u000B':
3511 case '\u000C':
3512 /*
3513 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3514 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
3515 * in the before DOCTYPE system identifier state.
3516 */
3517 continue;
3518 case '"':
3519 /*
3520 * U+0022 QUOTATION MARK (") Set the DOCTYPE token's system
3521 * identifier to the empty string,
3522 */
3523 clearLongStrBuf();
3524 /*
3525 * then switch to the DOCTYPE system identifier
3526 * (double-quoted) state.
3527 */
3528 doctypeSystemIdentifierDoubleQuotedState();
3529 return;
3530 case '\'':
3531 /*
3532 * U+0027 APOSTROPHE (') Set the DOCTYPE token's system
3533 * identifier to the empty string,
3534 */
3535 clearLongStrBuf();
3536 /*
3537 * then switch to the DOCTYPE system identifier
3538 * (single-quoted) state.
3539 */
3540 doctypeSystemIdentifierSingleQuotedState();
3541 return;
3542 case '>':
3543 /* U+003E GREATER-THAN SIGN (>) Parse error. */
3544 err("Expected a system identifier but the doctype ended.");
3545 /*
3546 * Set the DOCTYPE token's correctness flag to incorrect.
3547 * Emit that DOCTYPE token.
3548 */
3549 tokenHandler.doctype(doctypeName, null, null, false);
3550 /*
3551 * Switch to the data state.
3552 */
3553 return;
3554 case '\u0000':
3555 /* EOF Parse error. */
3556 err("End of file inside a doctype.");
3557 /*
3558 * Set the DOCTYPE token's correctness flag to incorrect.
3559 * Emit that DOCTYPE token.
3560 */
3561 tokenHandler.doctype(doctypeName, null, null, false);
3562 /*
3563 * Reconsume the EOF character in the data state.
3564 */
3565 unread(c);
3566 return;
3567 default:
3568 /* Anything else Parse error. */
3569 err("Bogus doctype.");
3570 /*
3571 * Switch to the bogus DOCTYPE state.
3572 */
3573 bogusDoctypeState();
3574 return;
3575 }
3576 }
3577 }
3578
3579 /**
3580 * DOCTYPE system identifier (double-quoted) state
3581 *
3582 * @throws IOException
3583 * @throws SAXException
3584 */
3585 private void doctypeSystemIdentifierDoubleQuotedState()
3586 throws SAXException, IOException {
3587 for (;;) {
3588 /*
3589 * Consume the next input character:
3590 */
3591 char c = read();
3592 switch (c) {
3593 case '"':
3594 /*
3595 * U+0022 QUOTATION MARK (") Switch to the after DOCTYPE
3596 * system identifier state.
3597 */
3598 systemIdentifier = longStrBufToString();
3599 afterDoctypeSystemIdentifierState();
3600 return;
3601 case '\u0000':
3602 /* EOF Parse error. */
3603 err("End of file inside system identifier.");
3604 /*
3605 * Set the DOCTYPE token's correctness flag to incorrect.
3606 * Emit that DOCTYPE token.
3607 */
3608 tokenHandler.doctype(doctypeName, publicIdentifier,
3609 longStrBufToString(), false);
3610 /*
3611 * Reconsume the EOF character in the data state.
3612 */
3613 unread(c);
3614 return;
3615 default:
3616 /*
3617 * Anything else Append the current input character to the
3618 * current DOCTYPE token's system identifier.
3619 */
3620 appendLongStrBuf(c);
3621 /*
3622 * Stay in the DOCTYPE system identifier (double-quoted)
3623 * state.
3624 */
3625 continue;
3626 }
3627 }
3628 }
3629
3630 /**
3631 * DOCTYPE system identifier (single-quoted) state
3632 *
3633 * @throws IOException
3634 * @throws SAXException
3635 */
3636 private void doctypeSystemIdentifierSingleQuotedState()
3637 throws SAXException, IOException {
3638 for (;;) {
3639 /*
3640 * Consume the next input character:
3641 */
3642 char c = read();
3643 switch (c) {
3644 case '\'':
3645 /*
3646 * U+0027 APOSTROPHE (') Switch to the after DOCTYPE system
3647 * identifier state.
3648 */
3649 systemIdentifier = longStrBufToString();
3650 afterDoctypeSystemIdentifierState();
3651 return;
3652 case '\u0000':
3653 /* EOF Parse error. */
3654 err("End of file inside system identifier.");
3655 /*
3656 * Set the DOCTYPE token's correctness flag to incorrect.
3657 * Emit that DOCTYPE token.
3658 */
3659 tokenHandler.doctype(doctypeName, publicIdentifier,
3660 longStrBufToString(), false);
3661 /*
3662 * Reconsume the EOF character in the data state.
3663 */
3664 unread(c);
3665 return;
3666 default:
3667 /*
3668 * Anything else Append the current input character to the
3669 * current DOCTYPE token's system identifier.
3670 */
3671 appendLongStrBuf(c);
3672 /*
3673 * Stay in the DOCTYPE system identifier (double-quoted)
3674 * state.
3675 */
3676 continue;
3677 }
3678 }
3679 }
3680
3681 /**
3682 * After DOCTYPE system identifier state
3683 *
3684 * @throws IOException
3685 * @throws SAXException
3686 */
3687 private void afterDoctypeSystemIdentifierState() throws SAXException,
3688 IOException {
3689 for (;;) {
3690 /*
3691 * Consume the next input character:
3692 */
3693 char c = read();
3694 switch (c) {
3695 case ' ':
3696 case '\t':
3697 case '\n':
3698 case '\u000B':
3699 case '\u000C':
3700 /*
3701 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
3702 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
3703 * in the after DOCTYPE system identifier state.
3704 */
3705 continue;
3706 case '>':
3707 /*
3708 * U+003E GREATER-THAN SIGN (>) Emit the current DOCTYPE
3709 * token.
3710 */
3711 tokenHandler.doctype(doctypeName, publicIdentifier,
3712 systemIdentifier, true);
3713 /*
3714 * Switch to the data state.
3715 */
3716 return;
3717 case '\u0000':
3718 /* EOF Parse error. */
3719 err("End of file inside doctype.");
3720 /*
3721 * Set the DOCTYPE token's correctness flag to incorrect.
3722 * Emit that DOCTYPE token.
3723 */
3724 tokenHandler.doctype(doctypeName, publicIdentifier,
3725 systemIdentifier, false);
3726 /*
3727 * Reconsume the EOF character in the data state.
3728 */
3729 unread(c);
3730 return;
3731 default:
3732 /* Anything else Parse error. */
3733 err("Bogus doctype.");
3734 /*
3735 * Switch to the bogus DOCTYPE state.
3736 */
3737 bogusDoctypeState();
3738 return;
3739 }
3740 }
3741 }
3742
3743 /**
3744 * Bogus DOCTYPE state
3745 *
3746 * @throws IOException
3747 * @throws SAXException
3748 */
3749 private void bogusDoctypeState() throws SAXException, IOException {
3750 for (;;) {
3751 /*
3752 * Consume the next input character:
3753 */
3754 char c = read();
3755 switch (c) {
3756 case '>':
3757 /*
3758 * U+003E GREATER-THAN SIGN (>) Set the DOCTYPE token's
3759 * correctness flag to incorrect. Emit that DOCTYPE token.
3760 */
3761 tokenHandler.doctype(doctypeName, publicIdentifier,
3762 systemIdentifier, false);
3763 /*
3764 * Switch to the data state.
3765 */
3766 return;
3767 case '\u0000':
3768 /* EOF Parse error. */
3769 err("End of file inside doctype.");
3770 /*
3771 * Set the DOCTYPE token's correctness flag to incorrect.
3772 * Emit that DOCTYPE token.
3773 */
3774 tokenHandler.doctype(doctypeName, publicIdentifier,
3775 systemIdentifier, false);
3776 /*
3777 * Reconsume the EOF character in the data state.
3778 */
3779 unread(c);
3780 return;
3781 default:
3782 /*
3783 * Anything else Stay in the bogus DOCTYPE state.
3784 */
3785 continue;
3786 }
3787 }
3788 }
3789
3790 /**
3791 * Consume entity
3792 *
3793 * Unlike the definition is the spec, this method does not return a value
3794 * and never requires the caller to backtrack. This method takes care of
3795 * emitting characters or appending to the current attribute value. It also
3796 * takes care of that in the case when consuming the entity fails.
3797 *
3798 * @throws IOException
3799 * @throws SAXException
3800 */
3801 private void consumeEntity(boolean inAttribute) throws SAXException,
3802 IOException {
3803 clearStrBuf();
3804 appendStrBuf('&');
3805 /*
3806 * This section defines how to consume an entity. This definition is
3807 * used when parsing entities in text and in attributes.
3808 *
3809 * The behaviour depends on the identity of the next character (the one
3810 * immediately after the U+0026 AMPERSAND character):
3811 */
3812 char c = read();
3813 switch (c) {
3814 case ' ':
3815 case '\t':
3816 case '\n':
3817 case '\u000B':
3818 case '\u000C':
3819 case '<':
3820 case '&':
3821 case '\u0000':
3822 /*
3823 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B LINE
3824 * TABULATION U+000C FORM FEED (FF) U+0020 SPACE U+003C
3825 * LESS-THAN SIGN U+0026 AMPERSAND EOF Not an entity. No
3826 * characters are consumed, and nothing is returned. (This is
3827 * not an error, either.)
3828 */
3829 if (inAttribute) {
3830 appendStrBufToLongStrBuf();
3831 } else {
3832 emitStrBuf();
3833 }
3834 unread(c);
3835 return;
3836 case '#':
3837 /*
3838 * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER SIGN.
3839 */
3840 appendStrBuf('#');
3841 consumeNCR(inAttribute);
3842 return;
3843 default:
3844 unread(c);
3845 int entCol = -1;
3846 int lo = 0;
3847 int hi = (Entities.NAMES.length - 1);
3848 int candidate = -1;
3849 int strBufMark = 0;
3850 outer: for (;;) {
3851 entCol++;
3852 c = read();
3853 /*
3854 * Anything else Consume the maximum number of characters
3855 * possible, with the consumed characters case-sensitively
3856 * matching one of the identifiers in the first column of
3857 * the entities table.
3858 */
3859 hiloop: for (;;) {
3860 if (hi == -1) {
3861 break;
3862 }
3863 if (entCol == Entities.NAMES[hi].length()) {
3864 break hiloop;
3865 }
3866 if (entCol > Entities.NAMES[hi].length()) {
3867 break outer;
3868 } else if (c < Entities.NAMES[hi].charAt(entCol)) {
3869 hi--;
3870 } else {
3871 break hiloop;
3872 }
3873 }
3874
3875 loloop: for (;;) {
3876 if (hi < lo) {
3877 break outer;
3878 }
3879 if (entCol == Entities.NAMES[lo].length()) {
3880 candidate = lo;
3881 strBufMark = strBufLen;
3882 lo++;
3883 } else if (entCol > Entities.NAMES[lo].length()) {
3884 break outer;
3885 } else if (c > Entities.NAMES[lo].charAt(entCol)) {
3886 lo++;
3887 } else {
3888 break loloop;
3889 }
3890 }
3891 if (hi < lo) {
3892 break outer;
3893 }
3894 appendStrBuf(c);
3895 }
3896 unread(c);
3897 // TODO warn about apos (IE) and TRADE (Opera)
3898 if (candidate == -1) {
3899 /* If no match can be made, then this is a parse error. */
3900 err("Text after \u201C&\u201D did not match an entity name.");
3901 /*
3902 * No characters are consumed, and nothing is returned.
3903 */
3904 if (inAttribute) {
3905 appendStrBufToLongStrBuf();
3906 } else {
3907 emitStrBuf();
3908 }
3909 return;
3910 } else {
3911 if (!Entities.NAMES[candidate].endsWith(";")) {
3912 /*
3913 * If the last character matched is not a U+003B
3914 * SEMICOLON (;), there is a parse error.
3915 */
3916 err("Entity reference was not terminated by a semicolon.");
3917 if (inAttribute) {
3918 /*
3919 * If the entity is being consumed as part of an
3920 * attribute, and the last character matched is not
3921 * a U+003B SEMICOLON (;),
3922 */
3923 if (strBufMark == strBufLen) {
3924 c = read();
3925 unread(c);
3926 } else {
3927 c = strBuf[strBufMark];
3928 }
3929 if ((c >= '0' && c <= '9')
3930 || (c >= 'A' && c <= 'Z')
3931 || (c >= 'a' && c <= 'z')) {
3932 /*
3933 * and the next character is in the range U+0030
3934 * DIGIT ZERO to U+0039 DIGIT NINE, U+0041 LATIN
3935 * CAPITAL LETTER A to U+005A LATIN CAPITAL
3936 * LETTER Z, or U+0061 LATIN SMALL LETTER A to
3937 * U+007A LATIN SMALL LETTER Z, then, for
3938 * historical reasons, all the characters that
3939 * were matched after the U+0026 AMPERSAND (&)
3940 * must be unconsumed, and nothing is returned.
3941 */
3942 appendStrBufToLongStrBuf();
3943 return;
3944 }
3945 }
3946 }
3947
3948 /*
3949 * Otherwise, return a character token for the character
3950 * corresponding to the entity name (as given by the second
3951 * column of the entities table).
3952 */
3953 char[] val = Entities.VALUES[candidate];
3954 emitOrAppend(val, inAttribute);
3955 // this is so complicated!
3956 if (strBufMark < strBufLen) {
3957 if (inAttribute) {
3958 for (int i = strBufMark; i < strBufLen; i++) {
3959 appendLongStrBuf(strBuf[i]);
3960 }
3961 } else {
3962 tokenHandler.characters(strBuf, strBufMark,
3963 strBufLen - strBufMark);
3964 }
3965 }
3966 return;
3967 /*
3968 * If the markup contains I'm ¬it; I tell you, the entity
3969 * is parsed as "not", as in, I'm ¬it; I tell you. But if
3970 * the markup was I'm ∉ I tell you, the entity would
3971 * be parsed as "notin;", resulting in I'm ∉ I tell you.
3972 */
3973 }
3974
3975 }
3976 }
3977
3978 private void consumeNCR(boolean inAttribute) throws SAXException,
3979 IOException {
3980 int prevValue = -1;
3981 int value = 0;
3982 boolean seenDigits = false;
3983 boolean hex = false;
3984 /*
3985 * The behaviour further depends on the character after the U+0023
3986 * NUMBER SIGN:
3987 */
3988 char c = read();
3989 if (c == 'x' || c == 'X') {
3990 /*
3991 * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL LETTER X Consume
3992 * the X.
3993 *
3994 * Follow the steps below, but using the range of characters U+0030
3995 * DIGIT ZERO through to U+0039 DIGIT NINE, U+0061 LATIN SMALL
3996 * LETTER A through to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
3997 * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL LETTER F (in
3998 * other words, 0-9, A-F, a-f).
3999 *
4000 * When it comes to interpreting the number, interpret it as a
4001 * hexadecimal number.
4002 */
4003 appendStrBuf(c);
4004 hex = true;
4005 } else {
4006 unread(c);
4007 /*
4008 * Anything else Follow the steps below, but using the range of
4009 * characters U+0030 DIGIT ZERO through to U+0039 DIGIT NINE (i.e.
4010 * just 0-9).
4011 *
4012 * When it comes to interpreting the number, interpret it as a
4013 * decimal number.
4014 */
4015 }
4016 for (;;) {
4017 // Deal with overflow gracefully
4018 if (value < prevValue) {
4019 value = 0x110000; // Value above Unicode range but within int
4020 // range
4021 }
4022 prevValue = value;
4023 /*
4024 * Consume as many characters as match the range of characters given
4025 * above.
4026 */
4027 c = read();
4028 if (c >= '0' && c <= '9') {
4029 seenDigits = true;
4030 if (hex) {
4031 value *= 16;
4032 } else {
4033 value *= 10;
4034 }
4035 value += c - '0';
4036 } else if (hex && c >= 'A' && c <= 'F') {
4037 seenDigits = true;
4038 value *= 16;
4039 value += c - 'A' + 10;
4040 } else if (hex && c >= 'a' && c <= 'f') {
4041 seenDigits = true;
4042 value *= 16;
4043 value += c - 'a' + 10;
4044 } else if (c == ';') {
4045 if (seenDigits) {
4046 handleNCRValue(value, inAttribute);
4047 return;
4048 } else {
4049 err("No digits after \u201C" + strBufToString() + "\u201D.");
4050 appendStrBuf(';');
4051 if (inAttribute) {
4052 appendStrBufToLongStrBuf();
4053 } else {
4054 emitStrBuf();
4055 }
4056 return;
4057 }
4058 } else {
4059 /*
4060 * If no characters match the range, then don't consume any
4061 * characters (and unconsume the U+0023 NUMBER SIGN character
4062 * and, if appropriate, the X character). This is a parse error;
4063 * nothing is returned.
4064 *
4065 * Otherwise, if the next character is a U+003B SEMICOLON,
4066 * consume that too. If it isn't, there is a parse error.
4067 */
4068 unread(c);
4069 if (seenDigits) {
4070 err("Character reference was not terminated by a semicolon.");
4071 handleNCRValue(value, inAttribute);
4072 return;
4073 } else {
4074 err("No digits after \u201C" + strBufToString() + "\u201D.");
4075 if (inAttribute) {
4076 appendStrBufToLongStrBuf();
4077 } else {
4078 emitStrBuf();
4079 }
4080 return;
4081 }
4082 }
4083 }
4084 }
4085
4086 private void handleNCRValue(int value, boolean inAttribute)
4087 throws SAXException, IOException {
4088 /*
4089 * If one or more characters match the range, then take them all and
4090 * interpret the string of characters as a number (either hexadecimal or
4091 * decimal as appropriate).
4092 */
4093 if (value >= 0x80 && value <= 0x9f) {
4094 /*
4095 * If that number is one of the numbers in the first column of the
4096 * following table, then this is a parse error.
4097 */
4098 err("A numeric character reference expanded to the C1 controls range.");
4099 /*
4100 * Find the row with that number in the first column, and return a
4101 * character token for the Unicode character given in the second
4102 * column of that row.
4103 */
4104 char[] val = Entities.WINDOWS_1252[value - 0x80];
4105 emitOrAppend(val, inAttribute);
4106 return;
4107 } else if (value == 0x0D) {
4108 err("A numeric character reference expanded to carriage return.");
4109 emitOrAppend(LF, inAttribute);
4110 return;
4111 } else if (value == 0) {
4112 /*
4113 * Otherwise, if the number is zero, if the number is higher than
4114 * 0x10FFFF, or if it's one of the surrogate characters (characters
4115 * in the range 0xD800 to 0xDFFF), then this is a parse error;
4116 * return a character token for the U+FFFD REPLACEMENT CHARACTER
4117 * character instead.
4118 */
4119 err("Character reference expands to U+0000.");
4120 emitOrAppend(REPLACEMENT_CHARACTER, inAttribute);
4121 return;
4122 } else if ((contentSpacePolicy != XmlViolationPolicy.ALLOW)
4123 && (value == 0xB || value == 0xC)) {
4124 if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
4125 emitOrAppend(SPACE, inAttribute);
4126 } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
4127 fatal("A character reference expanded to a space character that is not legal XML 1.0 white space.");
4128 }
4129 } else if ((value & 0xF800) == 0xD800) {
4130 err("Character reference expands to a surrogate.");
4131 emitOrAppend(REPLACEMENT_CHARACTER, inAttribute);
4132 return;
4133 } else if (value <= 0xFFFF) {
4134 /*
4135 * Otherwise, return a character token for the Unicode character
4136 * whose code point is that number.
4137 */
4138 char c = (char) value;
4139 if (c < '\t' || (c > '\r' && c < ' ') || isNonCharacter(c)) {
4140 if (contentNonXmlCharPolicy != XmlViolationPolicy.FATAL) {
4141 if (contentNonXmlCharPolicy == XmlViolationPolicy.ALTER_INFOSET) {
4142 c = '\uFFFD';
4143 }
4144 warn("Character reference expanded to a character that is not a legal XML 1.0 character.");
4145 } else {
4146 fatal("Character reference expanded to a character that is not a legal XML 1.0 character.");
4147 }
4148 }
4149 if (isPrivateUse(c)) {
4150 warnAboutPrivateUseChar();
4151 }
4152 bmpChar[0] = c;
4153 emitOrAppend(bmpChar, inAttribute);
4154 return;
4155 } else if (value <= 0x10FFFF) {
4156 if (isNonCharacter(value)) {
4157 warn("Character reference expands to an astral non-character.");
4158 }
4159 if (isAstralPrivateUse(value)) {
4160 warnAboutPrivateUseChar();
4161 }
4162 astralChar[0] = (char) (LEAD_OFFSET + (value >> 10));
4163 astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
4164 emitOrAppend(astralChar, inAttribute);
4165 return;
4166 } else {
4167 err("Character reference outside the permissible Unicode range.");
4168 emitOrAppend(REPLACEMENT_CHARACTER, inAttribute);
4169 return;
4170 }
4171 }
4172
4173 /**
4174 * @param val
4175 * @throws SAXException
4176 * @throws IOException
4177 */
4178 private void emitOrAppend(char[] val, boolean inAttribute)
4179 throws SAXException, IOException {
4180 if (inAttribute) {
4181 appendLongStrBuf(val);
4182 } else {
4183 tokenHandler.characters(val, 0, val.length);
4184 }
4185 }
4186
4187 /**
4188 * Returns the mappingLangToXmlLang.
4189 *
4190 * @return the mappingLangToXmlLang
4191 */
4192 public boolean isMappingLangToXmlLang() {
4193 return mappingLangToXmlLang;
4194 }
4195
4196 /**
4197 * Sets the mappingLangToXmlLang.
4198 *
4199 * @param mappingLangToXmlLang
4200 * the mappingLangToXmlLang to set
4201 */
4202 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
4203 this.mappingLangToXmlLang = mappingLangToXmlLang;
4204 }
4205 }