001 /* 002 * Copyright (c) 2005, 2006, 2007 Henri Sivonen 003 * Copyright (c) 2007 Mozilla Foundation 004 * Portions of comments Copyright 2004-2007 Apple Computer, Inc., Mozilla 005 * Foundation, and Opera Software ASA. 006 * 007 * Permission is hereby granted, free of charge, to any person obtaining a 008 * copy of this software and associated documentation files (the "Software"), 009 * to deal in the Software without restriction, including without limitation 010 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 011 * and/or sell copies of the Software, and to permit persons to whom the 012 * Software is furnished to do so, subject to the following conditions: 013 * 014 * The above copyright notice and this permission notice shall be included in 015 * all copies or substantial portions of the Software. 016 * 017 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 018 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 019 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 020 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 021 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 022 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 023 * DEALINGS IN THE SOFTWARE. 024 */ 025 026 /* 027 * The comments following this one that use the same comment syntax as this 028 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 029 * amended as of June 23 2007. 030 * That document came with this statement: 031 * "© Copyright 2004-2007 Apple Computer, Inc., Mozilla Foundation, and 032 * Opera Software ASA. You are granted a license to use, reproduce and 033 * create derivative works of this document." 
034 */ 035 036 package nu.validator.htmlparser.impl; 037 038 import java.io.IOException; 039 import java.io.InputStream; 040 import java.io.Reader; 041 import java.nio.charset.Charset; 042 import java.nio.charset.CharsetDecoder; 043 import java.nio.charset.IllegalCharsetNameException; 044 import java.nio.charset.UnsupportedCharsetException; 045 import java.util.Arrays; 046 import java.util.regex.Matcher; 047 import java.util.regex.Pattern; 048 049 import nu.validator.htmlparser.common.XmlViolationPolicy; 050 051 import org.xml.sax.Attributes; 052 import org.xml.sax.ErrorHandler; 053 import org.xml.sax.InputSource; 054 import org.xml.sax.Locator; 055 import org.xml.sax.SAXException; 056 import org.xml.sax.SAXParseException; 057 058 /** 059 * An implementation of 060 * http://www.whatwg.org/specs/web-apps/current-work/multipage/section-tokenisation.html 061 * 062 * This class implements the <code>Locator</code> interface. This is not an 063 * incidental implementation detail: Users of this class are encouraged to make 064 * use of the <code>Locator</code> nature. 065 * 066 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer 067 * can be configured to treat these conditions as fatal or to coerce the infoset 068 * to something that XML 1.0 allows. 
069 * 070 * @version $Id: Tokenizer.java 166 2007-10-14 19:42:57Z hsivonen $ 071 * @author hsivonen 072 */ 073 public final class Tokenizer implements Locator { 074 075 private static final Pattern NCNAME_PATTERN = Pattern.compile("(?:[\\u0041-\\u005A]|[\\u0061-\\u007A]|[\\u00C0-\\u00D6]|[\\u00D8-\\u00F6]|[\\u00F8-\\u00FF]|[\\u0100-\\u0131]|[\\u0134-\\u013E]|[\\u0141-\\u0148]|[\\u014A-\\u017E]|[\\u0180-\\u01C3]|[\\u01CD-\\u01F0]|[\\u01F4-\\u01F5]|[\\u01FA-\\u0217]|[\\u0250-\\u02A8]|[\\u02BB-\\u02C1]|\\u0386|[\\u0388-\\u038A]|\\u038C|[\\u038E-\\u03A1]|[\\u03A3-\\u03CE]|[\\u03D0-\\u03D6]|\\u03DA|\\u03DC|\\u03DE|\\u03E0|[\\u03E2-\\u03F3]|[\\u0401-\\u040C]|[\\u040E-\\u044F]|[\\u0451-\\u045C]|[\\u045E-\\u0481]|[\\u0490-\\u04C4]|[\\u04C7-\\u04C8]|[\\u04CB-\\u04CC]|[\\u04D0-\\u04EB]|[\\u04EE-\\u04F5]|[\\u04F8-\\u04F9]|[\\u0531-\\u0556]|\\u0559|[\\u0561-\\u0586]|[\\u05D0-\\u05EA]|[\\u05F0-\\u05F2]|[\\u0621-\\u063A]|[\\u0641-\\u064A]|[\\u0671-\\u06B7]|[\\u06BA-\\u06BE]|[\\u06C0-\\u06CE]|[\\u06D0-\\u06D3]|\\u06D5|[\\u06E5-\\u06E6]|[\\u0905-\\u0939]|\\u093D|[\\u0958-\\u0961]|[\\u0985-\\u098C]|[\\u098F-\\u0990]|[\\u0993-\\u09A8]|[\\u09AA-\\u09B0]|\\u09B2|[\\u09B6-\\u09B9]|[\\u09DC-\\u09DD]|[\\u09DF-\\u09E1]|[\\u09F0-\\u09F1]|[\\u0A05-\\u0A0A]|[\\u0A0F-\\u0A10]|[\\u0A13-\\u0A28]|[\\u0A2A-\\u0A30]|[\\u0A32-\\u0A33]|[\\u0A35-\\u0A36]|[\\u0A38-\\u0A39]|[\\u0A59-\\u0A5C]|\\u0A5E|[\\u0A72-\\u0A74]|[\\u0A85-\\u0A8B]|\\u0A8D|[\\u0A8F-\\u0A91]|[\\u0A93-\\u0AA8]|[\\u0AAA-\\u0AB0]|[\\u0AB2-\\u0AB3]|[\\u0AB5-\\u0AB9]|\\u0ABD|\\u0AE0|[\\u0B05-\\u0B0C]|[\\u0B0F-\\u0B10]|[\\u0B13-\\u0B28]|[\\u0B2A-\\u0B30]|[\\u0B32-\\u0B33]|[\\u0B36-\\u0B39]|\\u0B3D|[\\u0B5C-\\u0B5D]|[\\u0B5F-\\u0B61]|[\\u0B85-\\u0B8A]|[\\u0B8E-\\u0B90]|[\\u0B92-\\u0B95]|[\\u0B99-\\u0B9A]|\\u0B9C|[\\u0B9E-\\u0B9F]|[\\u0BA3-\\u0BA4]|[\\u0BA8-\\u0BAA]|[\\u0BAE-\\u0BB5]|[\\u0BB7-\\u0BB9]|[\\u0C05-\\u0C0C]|[\\u0C0E-\\u0C10]|[\\u0C12-\\u0C28]|[\\u0C2A-\\u0C33]|[\\u0C35-\\u0C39]|[\\u0C60-\\u0C61]|[\\u0C85-\\u0C8C]|[\\u0C8E-\\u0C90]
|[\\u0C92-\\u0CA8]|[\\u0CAA-\\u0CB3]|[\\u0CB5-\\u0CB9]|\\u0CDE|[\\u0CE0-\\u0CE1]|[\\u0D05-\\u0D0C]|[\\u0D0E-\\u0D10]|[\\u0D12-\\u0D28]|[\\u0D2A-\\u0D39]|[\\u0D60-\\u0D61]|[\\u0E01-\\u0E2E]|\\u0E30|[\\u0E32-\\u0E33]|[\\u0E40-\\u0E45]|[\\u0E81-\\u0E82]|\\u0E84|[\\u0E87-\\u0E88]|\\u0E8A|\\u0E8D|[\\u0E94-\\u0E97]|[\\u0E99-\\u0E9F]|[\\u0EA1-\\u0EA3]|\\u0EA5|\\u0EA7|[\\u0EAA-\\u0EAB]|[\\u0EAD-\\u0EAE]|\\u0EB0|[\\u0EB2-\\u0EB3]|\\u0EBD|[\\u0EC0-\\u0EC4]|[\\u0F40-\\u0F47]|[\\u0F49-\\u0F69]|[\\u10A0-\\u10C5]|[\\u10D0-\\u10F6]|\\u1100|[\\u1102-\\u1103]|[\\u1105-\\u1107]|\\u1109|[\\u110B-\\u110C]|[\\u110E-\\u1112]|\\u113C|\\u113E|\\u1140|\\u114C|\\u114E|\\u1150|[\\u1154-\\u1155]|\\u1159|[\\u115F-\\u1161]|\\u1163|\\u1165|\\u1167|\\u1169|[\\u116D-\\u116E]|[\\u1172-\\u1173]|\\u1175|\\u119E|\\u11A8|\\u11AB|[\\u11AE-\\u11AF]|[\\u11B7-\\u11B8]|\\u11BA|[\\u11BC-\\u11C2]|\\u11EB|\\u11F0|\\u11F9|[\\u1E00-\\u1E9B]|[\\u1EA0-\\u1EF9]|[\\u1F00-\\u1F15]|[\\u1F18-\\u1F1D]|[\\u1F20-\\u1F45]|[\\u1F48-\\u1F4D]|[\\u1F50-\\u1F57]|\\u1F59|\\u1F5B|\\u1F5D|[\\u1F5F-\\u1F7D]|[\\u1F80-\\u1FB4]|[\\u1FB6-\\u1FBC]|\\u1FBE|[\\u1FC2-\\u1FC4]|[\\u1FC6-\\u1FCC]|[\\u1FD0-\\u1FD3]|[\\u1FD6-\\u1FDB]|[\\u1FE0-\\u1FEC]|[\\u1FF2-\\u1FF4]|[\\u1FF6-\\u1FFC]|\\u2126|[\\u212A-\\u212B]|\\u212E|[\\u2180-\\u2182]|[\\u3041-\\u3094]|[\\u30A1-\\u30FA]|[\\u3105-\\u312C]|[\\uAC00-\\uD7A3]|[\\u4E00-\\u9FA5]|\\u3007|[\\u3021-\\u3029]|_)(?:[\\u0030-\\u0039]|[\\u0660-\\u0669]|[\\u06F0-\\u06F9]|[\\u0966-\\u096F]|[\\u09E6-\\u09EF]|[\\u0A66-\\u0A6F]|[\\u0AE6-\\u0AEF]|[\\u0B66-\\u0B6F]|[\\u0BE7-\\u0BEF]|[\\u0C66-\\u0C6F]|[\\u0CE6-\\u0CEF]|[\\u0D66-\\u0D6F]|[\\u0E50-\\u0E59]|[\\u0ED0-\\u0ED9]|[\\u0F20-\\u0F29]|[\\u0041-\\u005A]|[\\u0061-\\u007A]|[\\u00C0-\\u00D6]|[\\u00D8-\\u00F6]|[\\u00F8-\\u00FF]|[\\u0100-\\u0131]|[\\u0134-\\u013E]|[\\u0141-\\u0148]|[\\u014A-\\u017E]|[\\u0180-\\u01C3]|[\\u01CD-\\u01F0]|[\\u01F4-\\u01F5]|[\\u01FA-\\u0217]|[\\u0250-\\u02A8]|[\\u02BB-\\u02C1]|\\u0386|[\\u0388-\\u038A]|\\u038C|[\\u038E-\\u03A1]|[\\u03A3
-\\u03CE]|[\\u03D0-\\u03D6]|\\u03DA|\\u03DC|\\u03DE|\\u03E0|[\\u03E2-\\u03F3]|[\\u0401-\\u040C]|[\\u040E-\\u044F]|[\\u0451-\\u045C]|[\\u045E-\\u0481]|[\\u0490-\\u04C4]|[\\u04C7-\\u04C8]|[\\u04CB-\\u04CC]|[\\u04D0-\\u04EB]|[\\u04EE-\\u04F5]|[\\u04F8-\\u04F9]|[\\u0531-\\u0556]|\\u0559|[\\u0561-\\u0586]|[\\u05D0-\\u05EA]|[\\u05F0-\\u05F2]|[\\u0621-\\u063A]|[\\u0641-\\u064A]|[\\u0671-\\u06B7]|[\\u06BA-\\u06BE]|[\\u06C0-\\u06CE]|[\\u06D0-\\u06D3]|\\u06D5|[\\u06E5-\\u06E6]|[\\u0905-\\u0939]|\\u093D|[\\u0958-\\u0961]|[\\u0985-\\u098C]|[\\u098F-\\u0990]|[\\u0993-\\u09A8]|[\\u09AA-\\u09B0]|\\u09B2|[\\u09B6-\\u09B9]|[\\u09DC-\\u09DD]|[\\u09DF-\\u09E1]|[\\u09F0-\\u09F1]|[\\u0A05-\\u0A0A]|[\\u0A0F-\\u0A10]|[\\u0A13-\\u0A28]|[\\u0A2A-\\u0A30]|[\\u0A32-\\u0A33]|[\\u0A35-\\u0A36]|[\\u0A38-\\u0A39]|[\\u0A59-\\u0A5C]|\\u0A5E|[\\u0A72-\\u0A74]|[\\u0A85-\\u0A8B]|\\u0A8D|[\\u0A8F-\\u0A91]|[\\u0A93-\\u0AA8]|[\\u0AAA-\\u0AB0]|[\\u0AB2-\\u0AB3]|[\\u0AB5-\\u0AB9]|\\u0ABD|\\u0AE0|[\\u0B05-\\u0B0C]|[\\u0B0F-\\u0B10]|[\\u0B13-\\u0B28]|[\\u0B2A-\\u0B30]|[\\u0B32-\\u0B33]|[\\u0B36-\\u0B39]|\\u0B3D|[\\u0B5C-\\u0B5D]|[\\u0B5F-\\u0B61]|[\\u0B85-\\u0B8A]|[\\u0B8E-\\u0B90]|[\\u0B92-\\u0B95]|[\\u0B99-\\u0B9A]|\\u0B9C|[\\u0B9E-\\u0B9F]|[\\u0BA3-\\u0BA4]|[\\u0BA8-\\u0BAA]|[\\u0BAE-\\u0BB5]|[\\u0BB7-\\u0BB9]|[\\u0C05-\\u0C0C]|[\\u0C0E-\\u0C10]|[\\u0C12-\\u0C28]|[\\u0C2A-\\u0C33]|[\\u0C35-\\u0C39]|[\\u0C60-\\u0C61]|[\\u0C85-\\u0C8C]|[\\u0C8E-\\u0C90]|[\\u0C92-\\u0CA8]|[\\u0CAA-\\u0CB3]|[\\u0CB5-\\u0CB9]|\\u0CDE|[\\u0CE0-\\u0CE1]|[\\u0D05-\\u0D0C]|[\\u0D0E-\\u0D10]|[\\u0D12-\\u0D28]|[\\u0D2A-\\u0D39]|[\\u0D60-\\u0D61]|[\\u0E01-\\u0E2E]|\\u0E30|[\\u0E32-\\u0E33]|[\\u0E40-\\u0E45]|[\\u0E81-\\u0E82]|\\u0E84|[\\u0E87-\\u0E88]|\\u0E8A|\\u0E8D|[\\u0E94-\\u0E97]|[\\u0E99-\\u0E9F]|[\\u0EA1-\\u0EA3]|\\u0EA5|\\u0EA7|[\\u0EAA-\\u0EAB]|[\\u0EAD-\\u0EAE]|\\u0EB0|[\\u0EB2-\\u0EB3]|\\u0EBD|[\\u0EC0-\\u0EC4]|[\\u0F40-\\u0F47]|[\\u0F49-\\u0F69]|[\\u10A0-\\u10C5]|[\\u10D0-\\u10F6]|\\u1100|[\\u1102-\\u1103]|[\\u1105-\\u1107
]|\\u1109|[\\u110B-\\u110C]|[\\u110E-\\u1112]|\\u113C|\\u113E|\\u1140|\\u114C|\\u114E|\\u1150|[\\u1154-\\u1155]|\\u1159|[\\u115F-\\u1161]|\\u1163|\\u1165|\\u1167|\\u1169|[\\u116D-\\u116E]|[\\u1172-\\u1173]|\\u1175|\\u119E|\\u11A8|\\u11AB|[\\u11AE-\\u11AF]|[\\u11B7-\\u11B8]|\\u11BA|[\\u11BC-\\u11C2]|\\u11EB|\\u11F0|\\u11F9|[\\u1E00-\\u1E9B]|[\\u1EA0-\\u1EF9]|[\\u1F00-\\u1F15]|[\\u1F18-\\u1F1D]|[\\u1F20-\\u1F45]|[\\u1F48-\\u1F4D]|[\\u1F50-\\u1F57]|\\u1F59|\\u1F5B|\\u1F5D|[\\u1F5F-\\u1F7D]|[\\u1F80-\\u1FB4]|[\\u1FB6-\\u1FBC]|\\u1FBE|[\\u1FC2-\\u1FC4]|[\\u1FC6-\\u1FCC]|[\\u1FD0-\\u1FD3]|[\\u1FD6-\\u1FDB]|[\\u1FE0-\\u1FEC]|[\\u1FF2-\\u1FF4]|[\\u1FF6-\\u1FFC]|\\u2126|[\\u212A-\\u212B]|\\u212E|[\\u2180-\\u2182]|[\\u3041-\\u3094]|[\\u30A1-\\u30FA]|[\\u3105-\\u312C]|[\\uAC00-\\uD7A3]|[\\u4E00-\\u9FA5]|\\u3007|[\\u3021-\\u3029]|_|\\.|-|[\\u0300-\\u0345]|[\\u0360-\\u0361]|[\\u0483-\\u0486]|[\\u0591-\\u05A1]|[\\u05A3-\\u05B9]|[\\u05BB-\\u05BD]|\\u05BF|[\\u05C1-\\u05C2]|\\u05C4|[\\u064B-\\u0652]|\\u0670|[\\u06D6-\\u06DC]|[\\u06DD-\\u06DF]|[\\u06E0-\\u06E4]|[\\u06E7-\\u06E8]|[\\u06EA-\\u06ED]|[\\u0901-\\u0903]|\\u093C|[\\u093E-\\u094C]|\\u094D|[\\u0951-\\u0954]|[\\u0962-\\u0963]|[\\u0981-\\u0983]|\\u09BC|\\u09BE|\\u09BF|[\\u09C0-\\u09C4]|[\\u09C7-\\u09C8]|[\\u09CB-\\u09CD]|\\u09D7|[\\u09E2-\\u09E3]|\\u0A02|\\u0A3C|\\u0A3E|\\u0A3F|[\\u0A40-\\u0A42]|[\\u0A47-\\u0A48]|[\\u0A4B-\\u0A4D]|[\\u0A70-\\u0A71]|[\\u0A81-\\u0A83]|\\u0ABC|[\\u0ABE-\\u0AC5]|[\\u0AC7-\\u0AC9]|[\\u0ACB-\\u0ACD]|[\\u0B01-\\u0B03]|\\u0B3C|[\\u0B3E-\\u0B43]|[\\u0B47-\\u0B48]|[\\u0B4B-\\u0B4D]|[\\u0B56-\\u0B57]|[\\u0B82-\\u0B83]|[\\u0BBE-\\u0BC2]|[\\u0BC6-\\u0BC8]|[\\u0BCA-\\u0BCD]|\\u0BD7|[\\u0C01-\\u0C03]|[\\u0C3E-\\u0C44]|[\\u0C46-\\u0C48]|[\\u0C4A-\\u0C4D]|[\\u0C55-\\u0C56]|[\\u0C82-\\u0C83]|[\\u0CBE-\\u0CC4]|[\\u0CC6-\\u0CC8]|[\\u0CCA-\\u0CCD]|[\\u0CD5-\\u0CD6]|[\\u0D02-\\u0D03]|[\\u0D3E-\\u0D43]|[\\u0D46-\\u0D48]|[\\u0D4A-\\u0D4D]|\\u0D57|\\u0E31|[\\u0E34-\\u0E3A]|[\\u0E47-\\u0E4E]|\\u0EB1|[\\u0EB4-\\u0EB9]|[\
\u0EBB-\\u0EBC]|[\\u0EC8-\\u0ECD]|[\\u0F18-\\u0F19]|\\u0F35|\\u0F37|\\u0F39|\\u0F3E|\\u0F3F|[\\u0F71-\\u0F84]|[\\u0F86-\\u0F8B]|[\\u0F90-\\u0F95]|\\u0F97|[\\u0F99-\\u0FAD]|[\\u0FB1-\\u0FB7]|\\u0FB9|[\\u20D0-\\u20DC]|\\u20E1|[\\u302A-\\u302F]|\\u3099|\\u309A|\\u00B7|\\u02D0|\\u02D1|\\u0387|\\u0640|\\u0E46|\\u0EC6|\\u3005|[\\u3031-\\u3035]|[\\u309D-\\u309E]|[\\u30FC-\\u30FE])*"); 076 077 /** 078 * Magic value for UTF-16 operations. 079 */ 080 private static final int LEAD_OFFSET = 0xD800 - (0x10000 >> 10); 081 082 /** 083 * Magic value for UTF-16 operations. 084 */ 085 private static final int SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00; 086 087 /** 088 * UTF-16 code unit array containing less than and greater than for emitting 089 * those characters on certain parse errors. 090 */ 091 private static final char[] LT_GT = { '<', '>' }; 092 093 /** 094 * UTF-16 code unit array containing less than and solidus for emitting 095 * those characters on certain parse errors. 096 */ 097 private static final char[] LT_SOLIDUS = { '<', '/' }; 098 099 /** 100 * Array version of U+FFFD. 101 */ 102 private static final char[] REPLACEMENT_CHARACTER = { '\uFFFD' }; 103 104 /** 105 * Array version of space. 106 */ 107 private static final char[] SPACE = { ' ' }; 108 109 /** 110 * Array version of line feed. 111 */ 112 private static final char[] LF = { '\n' }; 113 114 /** 115 * Buffer growth parameter. 
*/
    private static final int BUFFER_GROW_BY = 1024;

    /**
     * Lexically sorted void element names
     */
    private static final String[] VOID_ELEMENTS = { "area", "base", "br",
            "col", "embed", "hr", "img", "input", "link", "meta", "param" };

    /**
     * "octype" as <code>char[]</code>
     */
    private static final char[] OCTYPE = "octype".toCharArray();

    /**
     * "ublic" as <code>char[]</code>
     */
    private static final char[] UBLIC = "ublic".toCharArray();

    /**
     * "ystem" as <code>char[]</code>
     */
    private static final char[] YSTEM = "ystem".toCharArray();

    /**
     * The token handler.
     */
    private final TokenHandler tokenHandler;

    /**
     * The error handler.
     */
    private ErrorHandler errorHandler;

    /**
     * The input UTF-16 code unit stream. If a byte stream was given, this
     * object is an instance of <code>HtmlInputStreamReader</code>.
     */
    private Reader reader;

    /**
     * The main input buffer that the tokenizer reads from. Filled from
     * <code>reader</code>.
     */
    private char[] buf = new char[2048];

    /**
     * The index of the last <code>char</code> read from <code>buf</code>.
     */
    private int pos;

    /**
     * The index of the first <code>char</code> in <code>buf</code> that is
     * part of a coalesced run of character tokens or <code>-1</code> if there
     * is not a current run being coalesced.
     */
    private int cstart;

    /**
     * The number of <code>char</code>s in <code>buf</code> that have
     * meaning. (The rest of the array is garbage and should not be examined.)
     */
    private int bufLen;

    /**
     * The previous <code>char</code> read from the buffer with infoset
     * alteration applied except for CR. Used for CRLF normalization and
     * surrogate pair checking.
     */
    private char prev;

    /**
     * Lookbehind buffer for magic RCDATA/CDATA escaping.
     */
    private final char[] prevFour = new char[4];

    /**
     * Points to the last <code>char</code> written to <code>prevFour</code>.
     */
    private int prevFourPtr = 0;

    /**
     * Single code unit buffer for reconsuming an input character. If
     * <code>-1</code> the next <code>read()</code> returns from the real
     * buffer, otherwise from here.
     */
    private int unreadBuffer = -1;

    /**
     * The current line number in the current resource being parsed. (First line
     * is 1.) Passed on as locator data.
     */
    private int line;

    /**
     * The value <code>line</code> had before the most recent
     * <code>read()</code> advanced the position. <code>read()</code> restores
     * <code>line</code> from it when it swallows the LF of a CRLF pair.
     */
    private int linePrev;

    /**
     * The current column number in the current resource being tokenized. (First
     * column is 1, counted by UTF-16 code units.) Passed on as locator data.
     */
    private int col;

    /**
     * The value <code>col</code> had before the most recent
     * <code>read()</code> advanced the position. <code>read()</code> restores
     * <code>col</code> from it when it swallows the LF of a CRLF pair.
     */
    private int colPrev;

    /**
     * If <code>true</code>, the next code unit <code>read()</code> takes from
     * the buffer starts a new line: <code>line</code> is incremented and
     * <code>col</code> is reset to 1 instead of being incremented.
     */
    private boolean nextCharOnNewLine;

    /**
     * The SAX public id for the resource being tokenized. (Only passed back
     * as part of locator data.)
     */
    private String publicId;

    /**
     * The SAX system id for the resource being tokenized. (Only passed back
     * as part of locator data.)
     */
    private String systemId;

    /**
     * Buffer for short identifiers.
     */
    private char[] strBuf = new char[64];

    /**
     * Number of significant <code>char</code>s in <code>strBuf</code>.
     */
    private int strBufLen = 0;

    /**
     * Buffer for long strings.
     */
    private char[] longStrBuf = new char[1024];

    /**
     * Number of significant <code>char</code>s in <code>longStrBuf</code>.
     */
    private int longStrBufLen = 0;

    /**
     * If not U+0000, a pending code unit to be appended to
     * <code>longStrBuf</code>.
     */
    private char longStrBufPending = '\u0000';

    /**
     * The attribute holder.
     */
    private AttributesImpl attributes;

    /**
     * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
     */
    private final char[] bmpChar = new char[1];

    /**
     * Buffer for expanding astral NCRs.
     */
    private final char[] astralChar = new char[2];

    /**
     * Keeps track of PUA warnings: set once the private use warning has been
     * emitted so that it is reported at most once per tokenization.
     */
    private boolean alreadyWarnedAboutPrivateUseCharacters;

    /**
     * http://www.whatwg.org/specs/web-apps/current-work/#content2
     */
    private ContentModelFlag contentModelFlag = ContentModelFlag.PCDATA;

    /**
     * http://www.whatwg.org/specs/web-apps/current-work/#escape
     */
    private boolean escapeFlag = false;

    /**
     * The element whose end tag closes the current CDATA or RCDATA element.
     */
    private String contentModelElement = "";

    /**
     * <code>true</code> if tokenizing an end tag
     */
    private boolean endTag;

    /**
     * The current tag token name.
     */
    private String tagName = null;

    /**
     * The current attribute name.
     */
    private String attributeName = null;

    /**
     * Whether comment tokens are emitted. Refreshed from
     * <code>tokenHandler.wantsComments()</code> at the start of each
     * tokenization.
     */
    private boolean wantsComments = false;

    /**
     * If <code>false</code>, <code>addAttribute*()</code> are no-ops.
     */
    private boolean shouldAddAttributes;

    /**
     * <code>true</code> when in text content or in attribute value.
     */
    private boolean inContent;

    /**
     * <code>true</code> when HTML4-specific additional errors are requested.
     */
    private boolean html4;

    /**
     * Whether non-ASCII causes an error.
     */
    private boolean nonAsciiProhibited;

    /**
     * Used together with <code>nonAsciiProhibited</code>.
     */
    private boolean alreadyComplainedAboutNonAscii;

    /**
     * Whether the stream is past the first 512 bytes.
     */
    private boolean metaBoundaryPassed;

    /**
     * The name of the current doctype token.
     */
    private String doctypeName;

    /**
     * The public id of the current doctype token.
     */
    private String publicIdentifier;

    /**
     * The system id of the current doctype token.
     */
    private String systemIdentifier;

    /**
     * The registered character handlers (never <code>null</code>; empty when
     * none are registered). Used for NFC checking, source code capture, etc.
     */
    private CharacterHandler[] characterHandlers = new CharacterHandler[0];

    /**
     * The policy for vertical tab and form feed.
     *
     * NOTE(review): <code>read()</code> consults
     * <code>contentNonXmlCharPolicy</code>, not this field, for U+000B and
     * U+000C - confirm which of the two is intended.
     */
    private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALLOW;

    /**
     * The policy for non-space non-XML characters.
     */
    private XmlViolationPolicy contentNonXmlCharPolicy = XmlViolationPolicy.ALLOW;

    /**
     * The policy for comments.
     */
    private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALLOW;

    /**
     * The xmlns policy. <code>setXmlnsPolicy()</code> rejects
     * <code>FATAL</code> for this one.
     */
    private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALLOW;

    /**
     * The name policy, as set by <code>setNamePolicy()</code>. Presumably
     * governs names that are not XML 1.0 NCNames - TODO confirm against the
     * code that reads it.
     */
    private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALLOW;

    /**
     * Whether <code>tokenize()</code> discards a leading U+FEFF. Set to
     * <code>true</code> at the start of <code>tokenize()</code> and cleared by
     * <code>dontSwallowBom()</code>.
     */
    private boolean swallowBom;

    /**
     * As set by <code>setHtml4ModeCompatibleWithXhtml1Schemata()</code>.
     */
    private boolean html4ModeCompatibleWithXhtml1Schemata;

    /**
     * If <code>true</code>, <code>newAttributes()</code> returns an
     * <code>XmlLangAttributesImpl</code> instead of a plain
     * <code>AttributesImpl</code>.
     */
    private boolean mappingLangToXmlLang;

    /**
     * The bogus xmlns policy, as set by <code>setBogusXmlnsPolicy()</code>.
     * Unlike the other policy fields this one has no initializer, so it is
     * <code>null</code> until the setter is called.
     */
    private XmlViolationPolicy bogusXmlnsPolicy;

    // start public API

    /**
     * The constructor.
     *
     * @param tokenHandler
     *            the handler for receiving tokens
     */
    public Tokenizer(TokenHandler tokenHandler) {
        this.tokenHandler = tokenHandler;
    }

    /**
     * Turns NFC checking on or off.
407 * 408 * @param enable 409 * <code>true</code> if checking on 410 */ 411 public void setCheckingNormalization(boolean enable) { 412 if (enable) { 413 if (isCheckingNormalization()) { 414 return; 415 } else { 416 NormalizationChecker normalizationChecker = new NormalizationChecker( 417 this); 418 normalizationChecker.setErrorHandler(errorHandler); 419 420 } 421 } else { 422 if (isCheckingNormalization()) { 423 CharacterHandler[] newHandlers = new CharacterHandler[characterHandlers.length - 1]; 424 boolean skipped = false; 425 int j = 0; 426 for (int i = 0; i < characterHandlers.length; i++) { 427 CharacterHandler ch = characterHandlers[i]; 428 if (!(!skipped && (ch instanceof NormalizationChecker))) { 429 newHandlers[j] = ch; 430 j++; 431 } 432 } 433 characterHandlers = newHandlers; 434 } else { 435 return; 436 } 437 } 438 } 439 440 public void addCharacterHandler(CharacterHandler characterHandler) { 441 if (characterHandler == null) { 442 throw new IllegalArgumentException("Null argument."); 443 } 444 CharacterHandler[] newHandlers = new CharacterHandler[characterHandlers.length + 1]; 445 System.arraycopy(characterHandlers, 0, newHandlers, 0, 446 characterHandlers.length); 447 newHandlers[characterHandlers.length] = characterHandler; 448 characterHandlers = newHandlers; 449 } 450 451 /** 452 * Query if checking normalization. 453 * 454 * @return <code>true</code> if checking on 455 */ 456 public boolean isCheckingNormalization() { 457 for (int i = 0; i < characterHandlers.length; i++) { 458 CharacterHandler ch = characterHandlers[i]; 459 if (ch instanceof NormalizationChecker) { 460 return true; 461 } 462 } 463 return false; 464 } 465 466 /** 467 * Sets the error handler. 
468 * 469 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler) 470 */ 471 public void setErrorHandler(ErrorHandler eh) { 472 this.errorHandler = eh; 473 for (int i = 0; i < characterHandlers.length; i++) { 474 CharacterHandler ch = characterHandlers[i]; 475 if (ch instanceof NormalizationChecker) { 476 NormalizationChecker nc = (NormalizationChecker) ch; 477 nc.setErrorHandler(eh); 478 } 479 } 480 } 481 482 /** 483 * Returns the commentPolicy. 484 * 485 * @return the commentPolicy 486 */ 487 public XmlViolationPolicy getCommentPolicy() { 488 return commentPolicy; 489 } 490 491 /** 492 * Sets the commentPolicy. 493 * 494 * @param commentPolicy 495 * the commentPolicy to set 496 */ 497 public void setCommentPolicy(XmlViolationPolicy commentPolicy) { 498 this.commentPolicy = commentPolicy; 499 } 500 501 /** 502 * Returns the contentNonXmlCharPolicy. 503 * 504 * @return the contentNonXmlCharPolicy 505 */ 506 public XmlViolationPolicy getContentNonXmlCharPolicy() { 507 return contentNonXmlCharPolicy; 508 } 509 510 /** 511 * Sets the contentNonXmlCharPolicy. 512 * 513 * @param contentNonXmlCharPolicy 514 * the contentNonXmlCharPolicy to set 515 */ 516 public void setContentNonXmlCharPolicy( 517 XmlViolationPolicy contentNonXmlCharPolicy) { 518 this.contentNonXmlCharPolicy = contentNonXmlCharPolicy; 519 } 520 521 /** 522 * Returns the contentSpacePolicy. 523 * 524 * @return the contentSpacePolicy 525 */ 526 public XmlViolationPolicy getContentSpacePolicy() { 527 return contentSpacePolicy; 528 } 529 530 /** 531 * Sets the contentSpacePolicy. 532 * 533 * @param contentSpacePolicy 534 * the contentSpacePolicy to set 535 */ 536 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) { 537 this.contentSpacePolicy = contentSpacePolicy; 538 } 539 540 /** 541 * Sets the xmlnsPolicy. 
*
     * @param xmlnsPolicy
     *            the xmlnsPolicy to set; <code>FATAL</code> is not accepted
     * @throws IllegalArgumentException
     *             if <code>xmlnsPolicy</code> is
     *             <code>XmlViolationPolicy.FATAL</code>
     */
    public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
        if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
            throw new IllegalArgumentException("Can't use FATAL here.");
        }
        this.xmlnsPolicy = xmlnsPolicy;
    }

    /**
     * Sets the namePolicy.
     *
     * @param namePolicy
     *            the namePolicy to set
     */
    public void setNamePolicy(XmlViolationPolicy namePolicy) {
        this.namePolicy = namePolicy;
    }

    /**
     * Sets the bogusXmlnsPolicy.
     *
     * @param bogusXmlnsPolicy
     *            the bogusXmlnsPolicy to set
     */
    public void setBogusXmlnsPolicy(XmlViolationPolicy bogusXmlnsPolicy) {
        this.bogusXmlnsPolicy = bogusXmlnsPolicy;
    }

    /**
     * Sets the html4ModeCompatibleWithXhtml1Schemata.
     *
     * @param html4ModeCompatibleWithXhtml1Schemata
     *            the html4ModeCompatibleWithXhtml1Schemata to set
     */
    public void setHtml4ModeCompatibleWithXhtml1Schemata(
            boolean html4ModeCompatibleWithXhtml1Schemata) {
        this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
    }

    /**
     * Runs the tokenization. This is the main entry point.
580 * 581 * @param is 582 * the input source 583 * @throws SAXException 584 * on fatal error (if configured to treat XML violations as 585 * fatal) or if the token handler threw 586 * @throws IOException 587 * if the stream threw 588 */ 589 public void tokenize(InputSource is) throws SAXException, IOException { 590 if (is == null) { 591 throw new IllegalArgumentException("InputSource was null."); 592 } 593 swallowBom = true; 594 this.systemId = is.getSystemId(); 595 this.publicId = is.getPublicId(); 596 this.reader = is.getCharacterStream(); 597 CharsetDecoder decoder = decoderFromExternalDeclaration(is.getEncoding()); 598 if (this.reader == null) { 599 InputStream inputStream = is.getByteStream(); 600 if (inputStream == null) { 601 throw new SAXException("Both streams in InputSource were null."); 602 } 603 if (decoder == null) { 604 this.reader = new HtmlInputStreamReader(inputStream, 605 errorHandler, this, this); 606 } else { 607 this.reader = new HtmlInputStreamReader(inputStream, 608 errorHandler, this, this, decoder); 609 } 610 } 611 contentModelFlag = ContentModelFlag.PCDATA; 612 escapeFlag = false; 613 inContent = true; 614 pos = -1; 615 cstart = -1; 616 line = linePrev = 0; 617 col = colPrev = 1; 618 nextCharOnNewLine = true; 619 prev = '\u0000'; 620 bufLen = 0; 621 nonAsciiProhibited = false; 622 alreadyComplainedAboutNonAscii = false; 623 html4 = false; 624 alreadyWarnedAboutPrivateUseCharacters = false; 625 metaBoundaryPassed = false; 626 tokenHandler.start(this); 627 for (int i = 0; i < characterHandlers.length; i++) { 628 CharacterHandler ch = characterHandlers[i]; 629 ch.start(); 630 } 631 wantsComments = tokenHandler.wantsComments(); 632 try { 633 if (swallowBom) { 634 // Swallow the BOM 635 char c = read(); 636 if (c == '\uFEFF') { 637 line = linePrev = 0; 638 col = colPrev = 1; 639 nextCharOnNewLine = true; 640 } else { 641 unread(c); 642 } 643 } 644 dataState(); 645 } finally { 646 systemIdentifier = null; 647 publicIdentifier = null; 648 
doctypeName = null; 649 tagName = null; 650 attributeName = null; 651 tokenHandler.eof(); 652 for (int i = 0; i < characterHandlers.length; i++) { 653 CharacterHandler ch = characterHandlers[i]; 654 ch.end(); 655 } 656 reader.close(); 657 } 658 } 659 660 // For the token handler to call 661 /** 662 * Sets the content model flag and the associated element name. 663 * 664 * @param contentModelFlag 665 * the flag 666 * @param contentModelElement 667 * the element causing the flag to be set 668 */ 669 public void setContentModelFlag(ContentModelFlag contentModelFlag, 670 String contentModelElement) { 671 this.contentModelFlag = contentModelFlag; 672 this.contentModelElement = contentModelElement; 673 } 674 675 // start Locator impl 676 677 /** 678 * @see org.xml.sax.Locator#getPublicId() 679 */ 680 public String getPublicId() { 681 return publicId; 682 } 683 684 /** 685 * @see org.xml.sax.Locator#getSystemId() 686 */ 687 public String getSystemId() { 688 return systemId; 689 } 690 691 /** 692 * @see org.xml.sax.Locator#getLineNumber() 693 */ 694 public int getLineNumber() { 695 if (line > 0) { 696 return line; 697 } else { 698 return -1; 699 } 700 } 701 702 /** 703 * @see org.xml.sax.Locator#getColumnNumber() 704 */ 705 public int getColumnNumber() { 706 if (col > 0) { 707 return col; 708 } else { 709 return -1; 710 } 711 } 712 713 // end Locator impl 714 715 // end public API 716 717 void notifyAboutMetaBoundary() { 718 metaBoundaryPassed = true; 719 } 720 721 void turnOnAdditionalHtml4Errors() { 722 html4 = true; 723 } 724 725 void dontSwallowBom() { 726 swallowBom = false; 727 } 728 729 void noEncodingDeclared() { 730 nonAsciiProhibited = true; 731 } 732 733 AttributesImpl newAttributes() { 734 if (mappingLangToXmlLang) { 735 return new XmlLangAttributesImpl(); 736 } else { 737 return new AttributesImpl(); 738 } 739 } 740 741 /** 742 * Clears the smaller buffer. 
743 */ 744 private void clearStrBuf() { 745 strBufLen = 0; 746 } 747 748 /** 749 * Appends to the smaller buffer. 750 * 751 * @param c 752 * the UTF-16 code unit to append 753 */ 754 private void appendStrBuf(char c) { 755 if (strBufLen == strBuf.length) { 756 char[] newBuf = new char[strBuf.length + BUFFER_GROW_BY]; 757 System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length); 758 strBuf = newBuf; 759 } 760 strBuf[strBufLen++] = c; 761 } 762 763 /** 764 * The smaller buffer as a string. 765 * 766 * @return the smaller buffer as a string 767 */ 768 private String strBufToString() { 769 return new String(strBuf, 0, strBufLen); 770 } 771 772 /** 773 * Emits the smaller buffer as character tokens. 774 * 775 * @throws SAXException 776 * if the token handler threw 777 */ 778 private void emitStrBuf() throws SAXException { 779 if (strBufLen > 0) { 780 tokenHandler.characters(strBuf, 0, strBufLen); 781 } 782 } 783 784 private boolean isNcname(String str) { 785 Matcher m = NCNAME_PATTERN.matcher(str); 786 return m.matches(); 787 } 788 789 /** 790 * Clears the larger buffer. 791 */ 792 private void clearLongStrBuf() { 793 longStrBufLen = 0; 794 longStrBufPending = '\u0000'; 795 } 796 797 /** 798 * Appends to the larger buffer. 799 * 800 * @param c 801 * the UTF-16 code unit to append 802 */ 803 private void appendLongStrBuf(char c) { 804 if (longStrBufLen == longStrBuf.length) { 805 char[] newBuf = new char[longStrBuf.length + BUFFER_GROW_BY]; 806 System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length); 807 longStrBuf = newBuf; 808 } 809 longStrBuf[longStrBufLen++] = c; 810 } 811 812 /** 813 * Appends to the larger buffer when it is used to buffer a comment. Checks 814 * for two consecutive hyphens. 
815 * 816 * @param c 817 * the UTF-16 code unit to append 818 * @throws SAXException 819 */ 820 private void appendToComment(char c) throws SAXException { 821 if (longStrBufPending == '-' && c == '-') { 822 if (commentPolicy == XmlViolationPolicy.FATAL) { 823 fatal("This document is not mappable to XML 1.0 without data loss to \u201C--\u201D in a comment."); 824 } else { 825 warn("This document is not mappable to XML 1.0 without data loss to \u201C--\u201D in a comment."); 826 if (wantsComments) { 827 if (commentPolicy == XmlViolationPolicy.ALLOW) { 828 appendLongStrBuf('-'); 829 } else { 830 appendLongStrBuf('-'); 831 appendLongStrBuf(' '); 832 } 833 } 834 longStrBufPending = '-'; 835 } 836 } else { 837 if (longStrBufPending != '\u0000') { 838 if (wantsComments) { 839 appendLongStrBuf(longStrBufPending); 840 } 841 longStrBufPending = '\u0000'; 842 } 843 if (c == '-') { 844 longStrBufPending = '-'; 845 } else { 846 if (wantsComments) { 847 appendLongStrBuf(c); 848 } 849 } 850 } 851 } 852 853 /** 854 * Appends to the larger buffer. 855 * 856 * @param arr 857 * the UTF-16 code units to append 858 */ 859 private void appendLongStrBuf(char[] arr) { 860 for (int i = 0; i < arr.length; i++) { 861 appendLongStrBuf(arr[i]); 862 } 863 } 864 865 /** 866 * Append the contents of the smaller buffer to the larger one. 867 */ 868 private void appendStrBufToLongStrBuf() { 869 for (int i = 0; i < strBufLen; i++) { 870 appendLongStrBuf(strBuf[i]); 871 } 872 } 873 874 /** 875 * The larger buffer as a string. 876 * 877 * @return the larger buffer as a string 878 */ 879 private String longStrBufToString() { 880 if (longStrBufPending != '\u0000') { 881 appendLongStrBuf(longStrBufPending); 882 } 883 return new String(longStrBuf, 0, longStrBufLen); 884 } 885 886 /** 887 * Emits the current comment token. 
888 * 889 * @throws SAXException 890 */ 891 private void emitComment() throws SAXException { 892 if (wantsComments) { 893 if (longStrBufPending != '\u0000') { 894 appendLongStrBuf(longStrBufPending); 895 } 896 } 897 tokenHandler.comment(longStrBuf, longStrBufLen); 898 } 899 900 /** 901 * Unreads a code unit so that it is returned the next time 902 * <code>read()</code> is called. 903 * 904 * @param c 905 * the code unit to unread 906 */ 907 private void unread(char c) { 908 unreadBuffer = c; 909 } 910 911 /** 912 * Reads the next UTF-16 code unit. 913 * 914 * @return the next code unit 915 * @throws SAXException 916 * @throws IOException 917 */ 918 private char read() throws SAXException, IOException { 919 for (;;) { // the loop is here for the CRLF case 920 if (unreadBuffer != -1) { 921 char c = (char) unreadBuffer; 922 unreadBuffer = -1; 923 return c; 924 } 925 assert (bufLen > -1); 926 pos++; 927 assert pos <= bufLen; 928 linePrev = line; 929 colPrev = col; 930 if (nextCharOnNewLine) { 931 line++; 932 col = 1; 933 nextCharOnNewLine = false; 934 } else { 935 col++; 936 } 937 if (pos == bufLen) { 938 boolean charDataContinuation = false; 939 if (cstart > -1) { 940 flushChars(); 941 charDataContinuation = true; 942 } 943 bufLen = reader.read(buf); 944 assert bufLen <= buf.length; 945 if (bufLen == -1) { 946 return '\u0000'; 947 } else { 948 for (int i = 0; i < characterHandlers.length; i++) { 949 CharacterHandler ch = characterHandlers[i]; 950 ch.characters(buf, 0, bufLen); 951 } 952 } 953 if (charDataContinuation) { 954 cstart = 0; 955 } 956 pos = 0; 957 } 958 char c = buf[pos]; 959 if (c > '\u007F' && nonAsciiProhibited 960 && !alreadyComplainedAboutNonAscii) { 961 err("The character encoding of the document was not explicit but the document contains non-ASCII."); 962 } 963 switch (c) { 964 case '\n': 965 /* 966 * U+000D CARRIAGE RETURN (CR) characters, and U+000A LINE 967 * FEED (LF) characters, are treated specially. 
Any CR 968 * characters that are followed by LF characters must be 969 * removed, and any CR characters not followed by LF 970 * characters must be converted to LF characters. 971 */ 972 if (prev == '\r') { 973 // swallow the LF 974 if (cstart != -1) { 975 flushChars(); 976 cstart = pos + 1; 977 } 978 col = colPrev; 979 line = linePrev; 980 nextCharOnNewLine = true; 981 prev = c; 982 continue; 983 } else { 984 nextCharOnNewLine = true; 985 } 986 break; 987 case '\r': 988 c = buf[pos] = '\n'; 989 nextCharOnNewLine = true; 990 prev = '\r'; 991 if (contentModelFlag != ContentModelFlag.PCDATA) { 992 prevFourPtr++; 993 prevFourPtr %= 4; 994 prevFour[prevFourPtr] = c; 995 } 996 return c; 997 case '\u0000': 998 /* 999 * All U+0000 NULL characters in the input must be replaced 1000 * by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such 1001 * characters is a parse error. 1002 */ 1003 err("Found U+0000 in the character stream."); 1004 c = buf[pos] = '\uFFFD'; 1005 break; 1006 case '\u000B': 1007 case '\u000C': 1008 if (inContent) { 1009 if (contentNonXmlCharPolicy == XmlViolationPolicy.FATAL) { 1010 fatal("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character."); 1011 } else { 1012 if (contentNonXmlCharPolicy == XmlViolationPolicy.ALTER_INFOSET) { 1013 c = buf[pos] = ' '; 1014 } 1015 warn("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character."); 1016 } 1017 } 1018 break; 1019 default: 1020 if ((c & 0xFC00) == 0xDC00) { 1021 // Got a low surrogate. 
See if prev was high surrogate 1022 if ((prev & 0xFC00) == 0xD800) { 1023 int intVal = (prev << 10) + c + SURROGATE_OFFSET; 1024 if (isNonCharacter(intVal)) { 1025 warn("Astral non-character."); 1026 } 1027 if (isAstralPrivateUse(intVal)) { 1028 warnAboutPrivateUseChar(); 1029 } 1030 } else { 1031 // XXX figure out what to do about lone high 1032 // surrogates 1033 err("Found low surrogate without high surrogate."); 1034 c = buf[pos] = '\uFFFD'; 1035 } 1036 } else if (inContent && (c < ' ' || isNonCharacter(c)) 1037 && (c != '\t')) { 1038 if (contentNonXmlCharPolicy == XmlViolationPolicy.FATAL) { 1039 fatal("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character."); 1040 } else { 1041 if (contentNonXmlCharPolicy == XmlViolationPolicy.ALTER_INFOSET) { 1042 c = buf[pos] = '\uFFFD'; 1043 } 1044 warn("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character."); 1045 } 1046 } else if (isPrivateUse(c)) { 1047 warnAboutPrivateUseChar(); 1048 } 1049 } 1050 prev = c; 1051 if (contentModelFlag != ContentModelFlag.PCDATA) { 1052 prevFourPtr++; 1053 prevFourPtr %= 4; 1054 prevFour[prevFourPtr] = c; 1055 } 1056 return c; 1057 } 1058 } 1059 1060 /** 1061 * Emits a warning about private use characters if the warning has not been 1062 * emitted yet. 1063 * 1064 * @throws SAXException 1065 */ 1066 private void warnAboutPrivateUseChar() throws SAXException { 1067 if (!alreadyWarnedAboutPrivateUseCharacters) { 1068 warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. (Charmod C073)"); 1069 alreadyWarnedAboutPrivateUseCharacters = true; 1070 } 1071 } 1072 1073 /** 1074 * Tells if the argument is a BMP PUA character. 
1075 * 1076 * @param c 1077 * the UTF-16 code unit to check 1078 * @return <code>true</code> if PUA character 1079 */ 1080 private boolean isPrivateUse(char c) { 1081 return c >= '\uE000' && c <= '\uF8FF'; 1082 } 1083 1084 /** 1085 * Tells if the argument is an astral PUA character. 1086 * 1087 * @param c 1088 * the code point to check 1089 * @return <code>true</code> if astral private use 1090 */ 1091 private boolean isAstralPrivateUse(int c) { 1092 return (c >= 0xF0000 && c <= 0xFFFFD) 1093 || (c >= 0x100000 && c <= 0x10FFFD); 1094 } 1095 1096 /** 1097 * Tells if the argument is a non-character (works for BMP and astral). 1098 * 1099 * @param c 1100 * the code point to check 1101 * @return <code>true</code> if non-character 1102 */ 1103 private boolean isNonCharacter(int c) { 1104 return (c & 0xFFFE) == 0xFFFE; 1105 } 1106 1107 /** 1108 * Flushes coalesced character tokens. 1109 * 1110 * @throws SAXException 1111 */ 1112 private void flushChars() throws SAXException, IOException { 1113 if (cstart != -1) { 1114 if (pos > cstart) { 1115 int currLine = line; 1116 int currCol = col; 1117 line = linePrev; 1118 col = colPrev; 1119 try { 1120 tokenHandler.characters(buf, cstart, pos - cstart); 1121 } finally { 1122 line = currLine; 1123 col = currCol; 1124 } 1125 } 1126 } 1127 cstart = -1; 1128 } 1129 1130 /** 1131 * Reports an condition that would make the infoset incompatible with XML 1132 * 1.0 as fatal. 1133 * 1134 * @param message 1135 * the message 1136 * @throws SAXException 1137 * @throws SAXParseException 1138 */ 1139 private void fatal(String message) throws SAXException { 1140 SAXParseException spe = new SAXParseException(message, this); 1141 if (errorHandler != null) { 1142 errorHandler.fatalError(spe); 1143 } 1144 throw spe; 1145 } 1146 1147 /** 1148 * Reports a Parse Error. 
1149 * 1150 * @param message 1151 * the message 1152 * @throws SAXException 1153 */ 1154 private void err(String message) throws SAXException { 1155 if (errorHandler == null) { 1156 return; 1157 } 1158 SAXParseException spe = new SAXParseException(message, this); 1159 errorHandler.error(spe); 1160 } 1161 1162 /** 1163 * Reports a warning 1164 * 1165 * @param message 1166 * the message 1167 * @throws SAXException 1168 */ 1169 private void warn(String message) throws SAXException { 1170 if (errorHandler == null) { 1171 return; 1172 } 1173 SAXParseException spe = new SAXParseException(message, this); 1174 errorHandler.warning(spe); 1175 } 1176 1177 /** 1178 * Initializes a decoder from external decl. 1179 */ 1180 private CharsetDecoder decoderFromExternalDeclaration(String encoding) 1181 throws SAXException { 1182 if (encoding == null) { 1183 return null; 1184 } 1185 encoding = encoding.toUpperCase(); 1186 if ("ISO-8859-1".equals(encoding)) { 1187 encoding = "Windows-1252"; 1188 } 1189 if ("UTF-16".equals(encoding) || "UTF-32".equals(encoding)) { 1190 swallowBom = false; 1191 } 1192 try { 1193 Charset cs = Charset.forName(encoding); 1194 String canonName = cs.name(); 1195 if (canonName.startsWith("X-") || canonName.startsWith("x-") 1196 || canonName.startsWith("Mac")) { 1197 if (encoding.startsWith("X-")) { 1198 err("The encoding \u201C" 1199 + encoding 1200 + "\u201D is not an IANA-registered encoding. (Charmod C022)"); 1201 } else { 1202 err("The encoding \u201C" 1203 + encoding 1204 + "\u201D is not an IANA-registered encoding and did\u2019t start with \u201CX-\u201D. (Charmod C023)"); 1205 } 1206 } else if (!canonName.equalsIgnoreCase(encoding)) { 1207 err("The encoding \u201C" 1208 + encoding 1209 + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C" 1210 + canonName + "\u201D. 
(Charmod C024)"); 1211 } 1212 if (EncodingInfo.isObscure(canonName)) { 1213 warn("The character encoding \u201C" 1214 + encoding 1215 + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); 1216 } 1217 return cs.newDecoder(); 1218 } catch (IllegalCharsetNameException e) { 1219 err("Illegal character encoding name: \u201C" + encoding 1220 + "\u201D. Will sniff."); 1221 } catch (UnsupportedCharsetException e) { 1222 err("Unsupported character encoding name: \u201C" + encoding 1223 + "\u201D. Will sniff."); 1224 swallowBom = true; 1225 } 1226 return null; // keep the compiler happy 1227 } 1228 1229 private boolean currentIsVoid() { 1230 return Arrays.binarySearch(VOID_ELEMENTS, tagName) > -1; 1231 } 1232 1233 /** 1234 * Data state 1235 * 1236 * @throws IOException 1237 * @throws SAXException 1238 * 1239 */ 1240 private void dataState() throws SAXException, IOException { 1241 char c = '\u0000'; 1242 for (;;) { 1243 c = read(); 1244 if (c == '&' 1245 && (contentModelFlag == ContentModelFlag.PCDATA || (contentModelFlag == ContentModelFlag.RCDATA) 1246 && !escapeFlag)) { 1247 /* 1248 * U+0026 AMPERSAND (&) When the content model flag is set to 1249 * one of the PCDATA or RCDATA states: switch to the entity data 1250 * state. Otherwise: treat it as per the "anything else" entry 1251 * below. 1252 */ 1253 flushChars(); 1254 entityDataState(); 1255 continue; 1256 } else if (c == '<' 1257 && ((contentModelFlag == ContentModelFlag.PCDATA) || (escapeFlag == false && (contentModelFlag == ContentModelFlag.CDATA || contentModelFlag == ContentModelFlag.RCDATA)))) { 1258 /* 1259 * U+003C LESS-THAN SIGN (<) When the content model flag is set 1260 * to the PCDATA state: switch to the tag open state. When the 1261 * content model flag is set to either the RCDATA state or the 1262 * CDATA state and the escape flag is false: switch to the tag 1263 * open state. Otherwise: treat it as per the "anything else" 1264 * entry below. 
1265 */ 1266 flushChars(); 1267 resetAttributes(); 1268 inContent = false; 1269 tagOpenState(); 1270 inContent = true; 1271 continue; 1272 } else if (c == '\u0000') { 1273 /* 1274 * EOF Emit an end-of-file token. 1275 */ 1276 flushChars(); 1277 return; // eof() called in parent finally block 1278 } else { 1279 if (c == '-' 1280 && (escapeFlag == false) 1281 && (contentModelFlag == ContentModelFlag.RCDATA || contentModelFlag == ContentModelFlag.CDATA) 1282 && lastLtExclHyph()) { 1283 /* 1284 * U+002D HYPHEN-MINUS (-) If the content model flag is set 1285 * to either the RCDATA state or the CDATA state, and the 1286 * escape flag is false, and there are at least three 1287 * characters before this one in the input stream, and the 1288 * last four characters in the input stream, including this 1289 * one, are U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, 1290 * U+002D HYPHEN-MINUS, and U+002D HYPHEN-MINUS ("<!--"), 1291 * then set the escape flag to true. 1292 * 1293 * In any case, emit the input character as a character 1294 * token. Stay in the data state. 1295 */ 1296 escapeFlag = true; 1297 } else if (c == '>' && escapeFlag && lastHyphHyph()) { 1298 /* 1299 * U+003E GREATER-THAN SIGN (>) If the content model flag is 1300 * set to either the RCDATA state or the CDATA state, and 1301 * the escape flag is true, and the last three characters in 1302 * the input stream including this one are U+002D 1303 * HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN 1304 * SIGN ("-->"), set the escape flag to false. 1305 * 1306 * In any case, emit the input character as a character 1307 * token. Stay in the data state. 1308 */ 1309 escapeFlag = false; 1310 } 1311 /* 1312 * Anything else Emit the input character as a character token. 1313 */ 1314 if (cstart == -1) { 1315 // start coalescing character tokens 1316 cstart = pos; 1317 } 1318 /* 1319 * Stay in the data state. 
1320 */ 1321 continue; 1322 } 1323 } 1324 } 1325 1326 private boolean lastHyphHyph() { 1327 return prevFour[(prevFourPtr - 1 + 4) % 4] == '-' 1328 && prevFour[(prevFourPtr - 2 + 4) % 4] == '-'; 1329 } 1330 1331 private boolean lastLtExclHyph() { 1332 return prevFour[(prevFourPtr - 1 + 4) % 4] == '-' 1333 && prevFour[(prevFourPtr - 2 + 4) % 4] == '!' 1334 && prevFour[(prevFourPtr - 3 + 4) % 4] == '<'; 1335 } 1336 1337 /** 1338 * 1339 * Entity data state 1340 * 1341 * @throws IOException 1342 * @throws SAXException 1343 */ 1344 private void entityDataState() throws SAXException, IOException { 1345 /* 1346 * (This cannot happen if the content model flag is set to the CDATA 1347 * state.) 1348 * 1349 * Attempt to consume an entity. 1350 */ 1351 consumeEntity(false); 1352 /* 1353 * If nothing is returned, emit a U+0026 AMPERSAND character token. 1354 * 1355 * Otherwise, emit the character token that was returned. 1356 */ 1357 // Handled by consumeEntity() 1358 /* 1359 * Finally, switch to the data state. 1360 */ 1361 return; 1362 } 1363 1364 /** 1365 * Tag open state 1366 * 1367 * @throws IOException 1368 * @throws SAXException 1369 */ 1370 private void tagOpenState() throws SAXException, IOException { 1371 /* 1372 * The behaviour of this state depends on the content model flag. 1373 */ 1374 // this can't happen in PLAINTEXT, so using not PCDATA as the condition 1375 if (contentModelFlag != ContentModelFlag.PCDATA) { 1376 /* 1377 * If the content model flag is set to the RCDATA or CDATA states 1378 * Consume the next input character. 1379 */ 1380 char c = read(); 1381 if (c == '/') { 1382 /* 1383 * If it is a U+002F SOLIDUS (/) character, switch to the close 1384 * tag open state. 1385 */ 1386 closeTagOpenState(); 1387 return; 1388 } else { 1389 /* 1390 * Otherwise, emit a U+003C LESS-THAN SIGN character token 1391 */ 1392 tokenHandler.characters(LT_GT, 0, 1); 1393 /* 1394 * and reconsume the current input character in the data state. 
1395 */ 1396 unread(c); 1397 return; 1398 } 1399 } else { 1400 /* 1401 * If the content model flag is set to the PCDATA state Consume the 1402 * next input character: 1403 */ 1404 char c = read(); 1405 if (c == '!') { 1406 /* 1407 * U+0021 EXCLAMATION MARK (!) Switch to the markup declaration 1408 * open state. 1409 */ 1410 markupDeclarationOpenState(); 1411 return; 1412 } else if (c == '/') { 1413 /* U+002F SOLIDUS (/) Switch to the close tag open state. */ 1414 closeTagOpenState(); 1415 return; 1416 } else if (c >= 'A' && c <= 'Z') { 1417 /* 1418 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL 1419 * LETTER Z Create a new start tag token, 1420 */ 1421 endTag = false; 1422 /* 1423 * set its tag name to the lowercase version of the input 1424 * character (add 0x0020 to the character's code point), 1425 */ 1426 clearStrBuf(); 1427 appendStrBuf((char) (c + 0x20)); 1428 /* then switch to the tag name state. */ 1429 tagNameState(); 1430 /* 1431 * (Don't emit the token yet; further details will be filled in 1432 * before it is emitted.) 1433 */ 1434 return; 1435 } else if (c >= 'a' && c <= 'z') { 1436 /* 1437 * U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL 1438 * LETTER Z Create a new start tag token, 1439 */ 1440 endTag = false; 1441 /* 1442 * set its tag name to the input character, 1443 */ 1444 clearStrBuf(); 1445 appendStrBuf(c); 1446 /* then switch to the tag name state. */ 1447 tagNameState(); 1448 /* 1449 * (Don't emit the token yet; further details will be filled in 1450 * before it is emitted.) 1451 */ 1452 return; 1453 } else if (c == '>') { 1454 /* 1455 * U+003E GREATER-THAN SIGN (>) Parse error. 1456 */ 1457 err("Bad character \u201C>\u201D in the tag open state."); 1458 /* 1459 * Emit a U+003C LESS-THAN SIGN character token and a U+003E 1460 * GREATER-THAN SIGN character token. 1461 */ 1462 tokenHandler.characters(LT_GT, 0, 2); 1463 /* Switch to the data state. 
*/ 1464 return; 1465 } else if (c == '?') { 1466 /* 1467 * U+003F QUESTION MARK (?) Parse error. 1468 */ 1469 err("Bad character \u201C?\u201D in the tag open state."); 1470 /* 1471 * Switch to the bogus comment state. 1472 */ 1473 clearLongStrBuf(); 1474 appendLongStrBuf(c); 1475 bogusCommentState(); 1476 return; 1477 } else { 1478 /* 1479 * Anything else Parse error. 1480 */ 1481 err("Bad character \u201C" + c 1482 + "\u201D in the tag open state."); 1483 /* 1484 * Emit a U+003C LESS-THAN SIGN character token 1485 */ 1486 tokenHandler.characters(LT_GT, 0, 1); 1487 /* 1488 * and reconsume the current input character in the data state. 1489 */ 1490 unread(c); 1491 return; 1492 } 1493 } 1494 } 1495 1496 /** 1497 * Close tag open state 1498 * 1499 * @throws IOException 1500 * @throws SAXException 1501 */ 1502 private void closeTagOpenState() throws SAXException, IOException { 1503 // this can't happen in PLAINTEXT, so using not PCDATA as the condition 1504 if (contentModelFlag != ContentModelFlag.PCDATA 1505 && contentModelElement != null) { 1506 /* 1507 * If the content model flag is set to the RCDATA or CDATA states 1508 * but no start tag token has ever been emitted by this instance of 1509 * the tokeniser (fragment case), or, if the content model flag is 1510 * set to the RCDATA or CDATA states and the next few characters do 1511 * not match the tag name of the last start tag token emitted (case 1512 * insensitively), or if they do but they are not immediately 1513 * followed by one of the following characters: + U+0009 CHARACTER 1514 * TABULATION + U+000A LINE FEED (LF) + U+000B LINE TABULATION + 1515 * U+000C FORM FEED (FF) + U+0020 SPACE + U+003E GREATER-THAN SIGN 1516 * (>) + U+002F SOLIDUS (/) + EOF 1517 * 1518 * ...then emit a U+003C LESS-THAN SIGN character token, a U+002F 1519 * SOLIDUS character token, and switch to the data state to process 1520 * the next input character. 1521 */ 1522 // Let's implement the above without lookahead. 
strBuf holds 1523 // characters that need to be emitted if looking for an end tag 1524 // fails. 1525 // Duplicating the relevant part of tag name state here as well. 1526 clearStrBuf(); 1527 for (int i = 0; i < contentModelElement.length(); i++) { 1528 char e = contentModelElement.charAt(i); 1529 char c = read(); 1530 char folded = c; 1531 if (c >= 'A' && c <= 'Z') { 1532 folded += 0x20; 1533 } 1534 if (folded != e) { 1535 if (i > 0 || (folded >= 'a' && folded <= 'z')) { 1536 if (html4) { 1537 if (!"iframe".equals(contentModelElement)) { 1538 err((contentModelFlag == ContentModelFlag.CDATA ? "CDATA" 1539 : "RCDATA") 1540 + " element \u201C" 1541 + contentModelElement 1542 + "\u201D contained the string \u201C</\u201D, but it was not the start of the end tag. (HTML4-only error)"); 1543 } 1544 } else { 1545 warn((contentModelFlag == ContentModelFlag.CDATA ? "CDATA" 1546 : "RCDATA") 1547 + " element \u201C" 1548 + contentModelElement 1549 + "\u201D contained the string \u201C</\u201D, but this did not close the element."); 1550 } 1551 } 1552 tokenHandler.characters(LT_SOLIDUS, 0, 2); 1553 emitStrBuf(); 1554 unread(c); 1555 return; 1556 } 1557 appendStrBuf(c); 1558 } 1559 endTag = true; 1560 tagName = contentModelElement; 1561 char c = read(); 1562 switch (c) { 1563 case ' ': 1564 case '\t': 1565 case '\n': 1566 case '\u000B': 1567 case '\u000C': 1568 /* 1569 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B 1570 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch 1571 * to the before attribute name state. 1572 */ 1573 beforeAttributeNameState(); 1574 return; 1575 case '>': 1576 /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */ 1577 emitCurrentTagToken(); 1578 /* 1579 * Switch to the data state. 1580 */ 1581 return; 1582 case '\u0000': 1583 /* 1584 * EOF Parse error. 1585 */ 1586 err("Expected \u201C>\u201D but saw end of file instead."); 1587 /* 1588 * Emit the current tag token. 
1589 */ 1590 emitCurrentTagToken(); 1591 /* Reconsume the character in the data state. */ 1592 unread(c); 1593 return; 1594 case '/': 1595 /* 1596 * U+002F SOLIDUS (/) Parse error unless this is a permitted 1597 * slash. 1598 */ 1599 // never permitted here 1600 err("Stray \u201C/\u201D in end tag."); 1601 /* Switch to the before attribute name state. */ 1602 beforeAttributeNameState(); 1603 return; 1604 default: 1605 if (html4) { 1606 err((contentModelFlag == ContentModelFlag.CDATA ? "CDATA" 1607 : "RCDATA") 1608 + " element \u201C" 1609 + contentModelElement 1610 + "\u201D contained the string \u201C</\u201D, but it was not the start of the end tag. (HTML4-only error)"); 1611 } else { 1612 warn((contentModelFlag == ContentModelFlag.CDATA ? "CDATA" 1613 : "RCDATA") 1614 + " element \u201C" 1615 + contentModelElement 1616 + "\u201D contained the string \u201C</\u201D, but this did not close the element."); 1617 } 1618 tokenHandler.characters(LT_SOLIDUS, 0, 2); 1619 emitStrBuf(); 1620 cstart = pos; // don't drop the character 1621 return; 1622 } 1623 } else { 1624 /* 1625 * Otherwise, if the content model flag is set to the PCDATA state, 1626 * or if the next few characters do match that tag name, consume the 1627 * next input character: 1628 */ 1629 char c = read(); 1630 if (c >= 'A' && c <= 'Z') { 1631 /* 1632 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL 1633 * LETTER Z Create a new end tag token, 1634 */ 1635 endTag = true; 1636 clearStrBuf(); 1637 /* 1638 * set its tag name to the lowercase version of the input 1639 * character (add 0x0020 to the character's code point), 1640 */ 1641 appendStrBuf((char) (c + 0x20)); 1642 /* 1643 * then switch to the tag name state. (Don't emit the token yet; 1644 * further details will be filled in before it is emitted.) 
1645 */ 1646 tagNameState(); 1647 return; 1648 } else if (c >= 'a' && c <= 'z') { 1649 /* 1650 * U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL 1651 * LETTER Z Create a new end tag token, 1652 */ 1653 endTag = true; 1654 clearStrBuf(); 1655 /* 1656 * set its tag name to the input character, 1657 */ 1658 appendStrBuf(c); 1659 /* 1660 * then switch to the tag name state. (Don't emit the token yet; 1661 * further details will be filled in before it is emitted.) 1662 */ 1663 tagNameState(); 1664 return; 1665 } else if (c == '>') { 1666 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 1667 err("Saw \u201C</>\u201D."); 1668 /* 1669 * Switch to the data state. 1670 */ 1671 return; 1672 } else if (c == '\u0000') { 1673 /* EOF Parse error. */ 1674 err("Saw \u201C</\u201D immediately before end of file."); 1675 /* 1676 * Emit a U+003C LESS-THAN SIGN character token and a U+002F 1677 * SOLIDUS character token. 1678 */ 1679 tokenHandler.characters(LT_SOLIDUS, 0, 2); 1680 /* 1681 * Reconsume the EOF character in the data state. 1682 */ 1683 unread(c); 1684 return; 1685 } else { 1686 /* Anything else Parse error. */ 1687 err("Garbage after \u201C</\u201D."); 1688 /* 1689 * Switch to the bogus comment state. 1690 */ 1691 clearLongStrBuf(); 1692 appendToComment(c); 1693 bogusCommentState(); 1694 return; 1695 } 1696 } 1697 } 1698 1699 /** 1700 * Tag name state 1701 * 1702 * @throws IOException 1703 * @throws SAXException 1704 */ 1705 private void tagNameState() throws SAXException, IOException { 1706 for (;;) { 1707 /* 1708 * Consume the next input character: 1709 */ 1710 char c = read(); 1711 switch (c) { 1712 case ' ': 1713 case '\t': 1714 case '\n': 1715 case '\u000B': 1716 case '\u000C': 1717 /* 1718 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B 1719 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch 1720 * to the before attribute name state. 
1721 */ 1722 tagName = strBufToElementNameString(); 1723 beforeAttributeNameState(); 1724 return; 1725 case '>': 1726 /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */ 1727 tagName = strBufToElementNameString(); 1728 emitCurrentTagToken(); 1729 /* 1730 * Switch to the data state. 1731 */ 1732 return; 1733 case '\u0000': 1734 /* 1735 * EOF Parse error. 1736 */ 1737 err("End of file seen when looking for tag name"); 1738 /* 1739 * Emit the current tag token. 1740 */ 1741 tagName = strBufToElementNameString(); 1742 emitCurrentTagToken(); 1743 /* 1744 * Reconsume the EOF character in the data state. 1745 */ 1746 unread(c); 1747 return; 1748 case '/': 1749 /* 1750 * U+002F SOLIDUS (/) Parse error unless this is a permitted 1751 * slash. 1752 */ 1753 tagName = strBufToElementNameString(); 1754 parseErrorUnlessPermittedSlash(); 1755 /* 1756 * Switch to the before attribute name state. 1757 */ 1758 beforeAttributeNameState(); 1759 return; 1760 default: 1761 if (c >= 'A' && c <= 'Z') { 1762 /* 1763 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN 1764 * CAPITAL LETTER Z Append the lowercase version of the 1765 * current input character (add 0x0020 to the 1766 * character's code point) to the current tag token's 1767 * tag name. 1768 */ 1769 appendStrBuf((char) (c + 0x20)); 1770 } else { 1771 /* 1772 * Anything else Append the current input character to 1773 * the current tag token's tag name. 1774 */ 1775 appendStrBuf(c); 1776 } 1777 /* 1778 * Stay in the tag name state. 1779 */ 1780 continue; 1781 } 1782 } 1783 } 1784 1785 private String strBufToElementNameString() { 1786 // TODO Generate a better interning function 1787 return strBufToString().intern(); 1788 } 1789 1790 /** 1791 * This method implements a wrapper loop for the attribute-related states to 1792 * avoid recursion to an arbitrary depth. 
1793 * 1794 * @throws IOException 1795 * @throws SAXException 1796 */ 1797 private void beforeAttributeNameState() throws SAXException, IOException { 1798 while (beforeAttributeNameStateImpl()) { 1799 // Spin. 1800 } 1801 } 1802 1803 /** 1804 * 1805 */ 1806 private void resetAttributes() { 1807 attributes = null; // XXX figure out reuse 1808 } 1809 1810 /** 1811 * Before attribute name state 1812 * 1813 * @throws IOException 1814 * @throws SAXException 1815 */ 1816 private boolean beforeAttributeNameStateImpl() throws SAXException, 1817 IOException { 1818 /* 1819 * Consume the next input character: 1820 */ 1821 for (;;) { 1822 char c = read(); 1823 switch (c) { 1824 case ' ': 1825 case '\t': 1826 case '\n': 1827 case '\u000B': 1828 case '\u000C': 1829 /* 1830 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B 1831 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay 1832 * in the before attribute name state. 1833 */ 1834 continue; 1835 case '>': 1836 /* 1837 * U+003E GREATER-THAN SIGN (>) Emit the current tag token. 1838 */ 1839 emitCurrentTagToken(); 1840 /* 1841 * Switch to the data state. 1842 */ 1843 return false; 1844 case '/': 1845 /* 1846 * U+002F SOLIDUS (/) Parse error unless this is a permitted 1847 * slash. 1848 */ 1849 parseErrorUnlessPermittedSlash(); 1850 /* 1851 * Stay in the before attribute name state. 1852 */ 1853 continue; 1854 case '\u0000': 1855 /* EOF Parse error. */ 1856 err("Saw end of file without the previous tag ending with \u201C>\u201C."); 1857 /* 1858 * Emit the current tag token. 1859 */ 1860 emitCurrentTagToken(); 1861 /* 1862 * Reconsume the EOF character in the data state. 1863 */ 1864 unread(c); 1865 return false; 1866 default: 1867 /* 1868 * Anything else Start a new attribute in the current tag 1869 * token. 
1870 */ 1871 clearStrBuf(); 1872 1873 if (c >= 'A' && c <= 'Z') { 1874 /* 1875 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN 1876 * CAPITAL LETTER Z Set that attribute's name to the 1877 * lowercase version of the current input character (add 1878 * 0x0020 to the character's code point) 1879 */ 1880 appendStrBuf((char) (c + 0x20)); 1881 } else { 1882 /* 1883 * Set that attribute's name to the current input 1884 * character, 1885 */ 1886 appendStrBuf(c); 1887 } 1888 /* 1889 * and its value to the empty string. 1890 */ 1891 // Will do later. 1892 /* 1893 * Switch to the attribute name state. 1894 */ 1895 return attributeNameState(); 1896 } 1897 } 1898 } 1899 1900 private void parseErrorUnlessPermittedSlash() throws SAXException, 1901 IOException { 1902 /* 1903 * A permitted slash is a U+002F SOLIDUS character that is immediately 1904 * followed by a U+003E GREATER-THAN SIGN, if, and only if, the current 1905 * token being processed is a start tag token whose tag name is one of 1906 * the following: base, link, meta, hr, br, img, embed, param, area, 1907 * col, input 1908 */ 1909 if (endTag) { 1910 err("Stray \u201C/\u201D in an end tag."); 1911 return; 1912 } 1913 char c = read(); 1914 int saveLine = line; 1915 int saveCol = col; 1916 line = linePrev; 1917 col = colPrev; 1918 if (c == '>') { 1919 if (!currentIsVoid() && !html4) { 1920 if (html4) { 1921 err("Stray \u201C/\u201D in tag. The \u201C/>\u201D syntax is not permitted in HTML4."); 1922 } else { 1923 err("Stray \u201C/\u201D in tag. The \u201C/>\u201D syntax is only permitted on void elements."); 1924 } 1925 } else if (html4) { 1926 err("Stray \u201C/\u201D in tag. The \u201C/>\u201D syntax is not permitted in HTML4. 
(HTML4-only error)"); 1927 } 1928 } else { 1929 err("Stray \u201C/\u201D in tag."); 1930 } 1931 line = saveLine; 1932 col = saveCol; 1933 unread(c); 1934 } 1935 1936 private void emitCurrentTagToken() throws SAXException { 1937 if (namePolicy != XmlViolationPolicy.ALLOW) { 1938 if (!isNcname(tagName)) { 1939 if (namePolicy == XmlViolationPolicy.FATAL) { 1940 fatal((endTag ? "End" : "Start") + " tag \u201C" + tagName 1941 + "\u201D has a non-NCName name."); 1942 } else { 1943 warn((endTag ? "End" : "Start") + " tag \u201C" + tagName 1944 + "\u201D has a non-NCName name. Ignoring token."); 1945 return; 1946 } 1947 } 1948 } 1949 Attributes attrs = (attributes == null ? EmptyAttributes.EMPTY_ATTRIBUTES 1950 : attributes); 1951 if (endTag) { 1952 /* 1953 * When an end tag token is emitted, the content model flag must be 1954 * switched to the PCDATA state. 1955 */ 1956 escapeFlag = false; 1957 contentModelFlag = ContentModelFlag.PCDATA; 1958 if (attrs.getLength() != 0) { 1959 /* 1960 * When an end tag token is emitted with attributes, that is a 1961 * parse error. 1962 */ 1963 err("End tag had attributes."); 1964 } 1965 tokenHandler.endTag(tagName, attrs); 1966 } else { 1967 tokenHandler.startTag(tagName, attrs); 1968 } 1969 } 1970 1971 /** 1972 * Attribute name state 1973 * 1974 * @throws IOException 1975 * @throws SAXException 1976 */ 1977 private boolean attributeNameState() throws SAXException, IOException { 1978 for (;;) { 1979 /* 1980 * Consume the next input character: 1981 */ 1982 char c = read(); 1983 switch (c) { 1984 case ' ': 1985 case '\t': 1986 case '\n': 1987 case '\u000B': 1988 case '\u000C': 1989 /* 1990 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B 1991 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch 1992 * to the after attribute name state. 1993 */ 1994 attributeNameComplete(); 1995 return afterAttributeNameState(); 1996 case '=': 1997 /* 1998 * U+003D EQUALS SIGN (=) Switch to the before attribute 1999 * value state. 
2000 */ 2001 attributeNameComplete(); 2002 return beforeAttributeValueState(); 2003 case '>': 2004 /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */ 2005 attributeNameComplete(); 2006 addAttributeWithoutValue(); 2007 emitCurrentTagToken(); 2008 /* 2009 * Switch to the data state. 2010 */ 2011 return false; 2012 case '/': 2013 /* 2014 * U+002F SOLIDUS (/) Parse error unless this is a permitted 2015 * slash. 2016 */ 2017 parseErrorUnlessPermittedSlash(); 2018 /* Switch to the before attribute name state. */ 2019 attributeNameComplete(); 2020 addAttributeWithoutValue(); 2021 return true; 2022 case '\u0000': 2023 /* 2024 * EOF Parse error. 2025 */ 2026 err("End of file occurred in an attribute name."); 2027 /* 2028 * Emit the current tag token. 2029 */ 2030 attributeNameComplete(); 2031 addAttributeWithoutValue(); 2032 emitCurrentTagToken(); 2033 /* Reconsume the EOF character in the data state. */ 2034 unread(c); 2035 return false; 2036 default: 2037 if (c >= 'A' && c <= 'Z') { 2038 /* 2039 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN 2040 * CAPITAL LETTER Z Append the lowercase version of the 2041 * current input character (add 0x0020 to the 2042 * character's code point) to the current attribute's 2043 * name. 2044 */ 2045 appendStrBuf((char) (c + 0x20)); 2046 } else { 2047 /* 2048 * Anything else Append the current input character to 2049 * the current attribute's name. 2050 */ 2051 appendStrBuf(c); 2052 } 2053 } 2054 /* 2055 * Stay in the attribute name state. 
2056 */ 2057 continue; 2058 } 2059 } 2060 2061 private void attributeNameComplete() throws SAXException { 2062 attributeName = strBufToString(); 2063 if (attributes == null) { 2064 attributes = newAttributes(); 2065 } 2066 /* 2067 * When the user agent leaves the attribute name state (and before 2068 * emitting the tag token, if appropriate), the complete attribute's 2069 * name must be compared to the other attributes on the same token; if 2070 * there is already an attribute on the token with the exact same name, 2071 * then this is a parse error and the new attribute must be dropped, 2072 * along with the value that gets associated with it (if any). 2073 */ 2074 if (attributes.getIndex(attributeName) == -1) { 2075 if (namePolicy == XmlViolationPolicy.ALLOW) { 2076 shouldAddAttributes = true; 2077 } else { 2078 if (isNcname(attributeName)) { 2079 shouldAddAttributes = true; 2080 } else { 2081 if (namePolicy == XmlViolationPolicy.FATAL) { 2082 fatal("Attribute name \u201C" + attributeName 2083 + "\u201D is not an NCName."); 2084 } else { 2085 shouldAddAttributes = false; 2086 warn("Attribute name \u201C" 2087 + attributeName 2088 + "\u201D is not an NCName. Ignoring the attribute."); 2089 } 2090 } 2091 } 2092 } else { 2093 shouldAddAttributes = false; 2094 err("Duplicate attribute \u201C" + attributeName + "\u201D."); 2095 } 2096 } 2097 2098 private void addAttributeWithoutValue() throws SAXException { 2099 if (metaBoundaryPassed && "charset".equals(attributeName) 2100 && "meta".equals(tagName)) { 2101 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes."); 2102 } 2103 if (shouldAddAttributes) { 2104 if (html4) { 2105 if (AttributeInfo.isBoolean(attributeName)) { 2106 if (html4ModeCompatibleWithXhtml1Schemata) { 2107 attributes.addAttribute(attributeName, attributeName); 2108 } else { 2109 attributes.addAttribute(attributeName, ""); 2110 } 2111 } else { 2112 err("Attribute value omitted for a non-boolean attribute. 
(HTML4-only error.)"); 2113 attributes.addAttribute(attributeName, ""); 2114 } 2115 } else { 2116 if ("src".equals(attributeName) || "href".equals(attributeName)) { 2117 warn("Attribute \u201C" 2118 + attributeName 2119 + "\u201D without an explicit value seen. The attribute may be dropped by IE7."); 2120 } 2121 attributes.addAttribute(attributeName, ""); 2122 } 2123 } 2124 } 2125 2126 private void addAttributeWithValue() throws SAXException { 2127 if (metaBoundaryPassed && "meta" == tagName 2128 && "charset".equals(attributeName)) { 2129 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes."); 2130 } 2131 if (shouldAddAttributes) { 2132 String value = longStrBufToString(); 2133 if (!endTag) { 2134 if ("xmlns".equals(attributeName)) { 2135 if ("html" == tagName 2136 && "http://www.w3.org/1999/xhtml".equals(value)) { 2137 if (xmlnsPolicy == XmlViolationPolicy.ALTER_INFOSET) { 2138 return; 2139 } 2140 } else { 2141 if (bogusXmlnsPolicy == XmlViolationPolicy.FATAL) { 2142 fatal("Forbidden attribute \u201C" 2143 + attributeName 2144 + "\u201D is not mappable to namespace-aware XML 1.0."); 2145 } else { 2146 warn("Forbidden attribute \u201C" 2147 + attributeName 2148 + "\u201D is not mappable to namespace-aware XML 1.0."); 2149 if (bogusXmlnsPolicy == XmlViolationPolicy.ALTER_INFOSET) { 2150 return; 2151 } 2152 } 2153 } 2154 } else if (attributeName.startsWith("xmlns:")) { 2155 if (bogusXmlnsPolicy == XmlViolationPolicy.FATAL) { 2156 fatal("Forbidden attribute \u201C" 2157 + attributeName 2158 + "\u201D is not mappable to namespace-aware XML 1.0."); 2159 } else { 2160 warn("Forbidden attribute \u201C" 2161 + attributeName 2162 + "\u201D is not mappable to namespace-aware XML 1.0."); 2163 if (bogusXmlnsPolicy == XmlViolationPolicy.ALTER_INFOSET) { 2164 return; 2165 } 2166 } 2167 } else if (html4 && html4ModeCompatibleWithXhtml1Schemata && AttributeInfo.isCaseFolded(attributeName)) { 2168 value = toAsciiLowerCase(value); 2169 
} 2170 } 2171 attributes.addAttribute(attributeName, value); 2172 } 2173 } 2174 2175 private String toAsciiLowerCase(String str) { 2176 if (str == null) { 2177 return null; 2178 } 2179 char[] b = new char[str.length()]; 2180 for (int i = 0; i < str.length(); i++) { 2181 char c = str.charAt(i); 2182 if (c >= 'A' && c <= 'Z') { 2183 c += 0x20; 2184 } 2185 b[i] = c; 2186 } 2187 return new String(b); 2188 } 2189 2190 /** 2191 * After attribute name state 2192 * 2193 * @throws IOException 2194 * @throws SAXException 2195 */ 2196 private boolean afterAttributeNameState() throws SAXException, IOException { 2197 for (;;) { 2198 /* 2199 * Consume the next input character: 2200 */ 2201 char c = read(); 2202 switch (c) { 2203 case ' ': 2204 case '\t': 2205 case '\n': 2206 case '\u000B': 2207 case '\u000C': 2208 /* 2209 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B 2210 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay 2211 * in the after attribute name state. 2212 */ 2213 continue; 2214 case '=': 2215 /* 2216 * U+003D EQUALS SIGN (=) Switch to the before attribute 2217 * value state. 2218 */ 2219 return beforeAttributeValueState(); 2220 case '>': 2221 /* 2222 * U+003E GREATER-THAN SIGN (>) Emit the current tag token. 2223 */ 2224 addAttributeWithoutValue(); 2225 emitCurrentTagToken(); 2226 /* 2227 * Switch to the data state. 2228 */ 2229 return false; 2230 case '/': 2231 /* 2232 * U+002F SOLIDUS (/) Parse error unless this is a permitted 2233 * slash. 2234 */ 2235 addAttributeWithoutValue(); 2236 parseErrorUnlessPermittedSlash(); 2237 /* Switch to the before attribute name state. */ 2238 return true; 2239 case '\u0000': 2240 /* EOF Parse error. */ 2241 err("Saw end of file without the previous tag ending with \u201C>\u201C."); 2242 /* 2243 * Emit the current tag token. 2244 */ 2245 addAttributeWithoutValue(); 2246 emitCurrentTagToken(); 2247 /* 2248 * Reconsume the character in the data state. 
2249 */ 2250 unread(c); 2251 return false; 2252 default: 2253 /* 2254 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN 2255 * CAPITAL LETTER Z Start a new attribute in the current tag 2256 * token. Set that attribute's name to the lowercase version 2257 * of the current input character (add 0x0020 to the 2258 * character's code point), and its value to the empty 2259 * string. Switch to the attribute name state. 2260 * 2261 * Anything else Start a new attribute in the current tag 2262 * token. Set that attribute's name to the current input 2263 * character, and its value to the empty string. Switch to 2264 * the attribute name state. 2265 */ 2266 // let's do this by respinning through the attribute loop 2267 addAttributeWithoutValue(); 2268 unread(c); 2269 return true; 2270 } 2271 } 2272 } 2273 2274 /** 2275 * Before attribute value state 2276 * 2277 * @throws IOException 2278 * @throws SAXException 2279 */ 2280 private boolean beforeAttributeValueState() throws SAXException, 2281 IOException { 2282 clearLongStrBuf(); 2283 for (;;) { 2284 /* 2285 * Consume the next input character: 2286 */ 2287 char c = read(); 2288 switch (c) { 2289 case ' ': 2290 case '\t': 2291 case '\n': 2292 case '\u000B': 2293 case '\u000C': 2294 /* 2295 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B 2296 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay 2297 * in the before attribute value state. 2298 */ 2299 continue; 2300 case '"': 2301 /* 2302 * U+0022 QUOTATION MARK (") Switch to the attribute value 2303 * (double-quoted) state. 2304 */ 2305 return attributeValueDoubleQuotedState(); 2306 case '&': 2307 /* 2308 * U+0026 AMPERSAND (&) Switch to the attribute value 2309 * (unquoted) state and reconsume this input character. 2310 */ 2311 unread(c); 2312 return attributeValueUnquotedState(); 2313 case '\'': 2314 /* 2315 * U+0027 APOSTROPHE (') Switch to the attribute value 2316 * (single-quoted) state. 
2317 */ 2318 return attributeValueSingleQuotedState(); 2319 case '>': 2320 /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */ 2321 addAttributeWithoutValue(); 2322 emitCurrentTagToken(); 2323 /* 2324 * Switch to the data state. 2325 */ 2326 return false; 2327 case '\u0000': 2328 /* EOF Parse error. */ 2329 err("Saw end of file without the previous tag ending with \u201C>\u201C."); 2330 /* 2331 * Emit the current tag token. 2332 */ 2333 addAttributeWithoutValue(); 2334 emitCurrentTagToken(); 2335 /* 2336 * Reconsume the character in the data state. 2337 */ 2338 unread(c); 2339 return false; 2340 default: 2341 if (html4 2342 && !((c >= 'a' && c <= 'z') 2343 || (c >= 'A' && c <= 'Z') 2344 || (c >= '0' && c <= '9') || c == '.' 2345 || c == '-' || c == '_' || c == ':')) { 2346 err("Non-name character in an unquoted attribute value. (This is an HTML4-only error.)"); 2347 } 2348 /* 2349 * Anything else Append the current input character to the 2350 * current attribute's value. 2351 */ 2352 appendLongStrBuf(c); 2353 /* 2354 * Switch to the attribute value (unquoted) state. 2355 */ 2356 return attributeValueUnquotedState(); 2357 } 2358 } 2359 } 2360 2361 /** 2362 * Attribute value (double-quoted) state 2363 * 2364 * @throws IOException 2365 * @throws SAXException 2366 */ 2367 private boolean attributeValueDoubleQuotedState() throws SAXException, 2368 IOException { 2369 inContent = true; 2370 for (;;) { 2371 /* 2372 * Consume the next input character: 2373 */ 2374 char c = read(); 2375 switch (c) { 2376 case '"': 2377 /* 2378 * U+0022 QUOTATION MARK (") Switch to the before attribute 2379 * name state. 2380 */ 2381 addAttributeWithValue(); 2382 inContent = false; 2383 return true; 2384 case '&': 2385 /* 2386 * U+0026 AMPERSAND (&) Switch to the entity in attribute 2387 * value state. 2388 */ 2389 entityInAttributeValueState(); 2390 continue; 2391 case '\u0000': 2392 /* EOF Parse error. 
*/ 2393 err("End of file reached when inside a quoted attribute value."); 2394 /* Emit the current tag token. */ 2395 addAttributeWithValue(); 2396 emitCurrentTagToken(); 2397 /* 2398 * Reconsume the character in the data state. 2399 */ 2400 unread(c); 2401 inContent = false; 2402 return false; 2403 default: 2404 /* 2405 * Anything else Append the current input character to the 2406 * current attribute's value. 2407 */ 2408 appendLongStrBuf(c); 2409 /* 2410 * Stay in the attribute value (double-quoted) state. 2411 */ 2412 continue; 2413 } 2414 } 2415 } 2416 2417 /** 2418 * Attribute value (single-quoted) state 2419 * 2420 * @throws SAXException 2421 * @throws IOException 2422 */ 2423 private boolean attributeValueSingleQuotedState() throws SAXException, 2424 IOException { 2425 inContent = true; 2426 for (;;) { 2427 /* 2428 * Consume the next input character: 2429 */ 2430 char c = read(); 2431 switch (c) { 2432 case '\'': 2433 /* 2434 * U+0027 APOSTROPHE (') Switch to the before attribute name 2435 * state. 2436 */ 2437 addAttributeWithValue(); 2438 inContent = false; 2439 return true; 2440 case '&': 2441 /* 2442 * U+0026 AMPERSAND (&) Switch to the entity in attribute 2443 * value state. 2444 */ 2445 entityInAttributeValueState(); 2446 continue; 2447 case '\u0000': 2448 /* EOF Parse error. */ 2449 err("End of file reached when inside a quoted attribute value."); 2450 /* Emit the current tag token. */ 2451 addAttributeWithValue(); 2452 emitCurrentTagToken(); 2453 /* 2454 * Reconsume the character in the data state. 2455 */ 2456 unread(c); 2457 inContent = false; 2458 return false; 2459 default: 2460 /* 2461 * Anything else Append the current input character to the 2462 * current attribute's value. 2463 */ 2464 appendLongStrBuf(c); 2465 /* 2466 * Stay in the attribute value (double-quoted) state. 
2467 */ 2468 continue; 2469 } 2470 } 2471 } 2472 2473 /** 2474 * Attribute value (unquoted) state 2475 * 2476 * @throws IOException 2477 * @throws SAXException 2478 */ 2479 private boolean attributeValueUnquotedState() throws SAXException, 2480 IOException { 2481 inContent = true; 2482 for (;;) { 2483 /* 2484 * Consume the next input character: 2485 */ 2486 char c = read(); 2487 switch (c) { 2488 case ' ': 2489 case '\t': 2490 case '\n': 2491 case '\u000B': 2492 case '\u000C': 2493 /* 2494 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B 2495 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch 2496 * to the before attribute name state. 2497 */ 2498 addAttributeWithValue(); 2499 inContent = false; 2500 return true; 2501 case '&': 2502 /* 2503 * U+0026 AMPERSAND (&) Switch to the entity in attribute 2504 * value state. 2505 */ 2506 entityInAttributeValueState(); 2507 continue; 2508 case '>': 2509 /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */ 2510 addAttributeWithValue(); 2511 emitCurrentTagToken(); 2512 /* 2513 * Switch to the data state. 2514 */ 2515 inContent = false; 2516 return false; 2517 case '\u0000': 2518 /* EOF Parse error. */ 2519 err("Saw end of file without the previous tag ending with \u201C>\u201C."); 2520 /* 2521 * Emit the current tag token. 2522 */ 2523 addAttributeWithValue(); 2524 emitCurrentTagToken(); 2525 /* 2526 * Reconsume the character in the data state. 2527 */ 2528 unread(c); 2529 inContent = false; 2530 return false; 2531 case '<': 2532 warn("\u201C<\u201D in an unquoted attribute value. This does not end the tag."); 2533 // fall through 2534 default: 2535 if (html4 2536 && !((c >= 'a' && c <= 'z') 2537 || (c >= 'A' && c <= 'Z') 2538 || (c >= '0' && c <= '9') || c == '.' 2539 || c == '-' || c == '_' || c == ':')) { 2540 err("Non-name character in an unquoted attribute value. 
(This is an HTML4-only error.)"); 2541 } 2542 /* 2543 * Anything else Append the current input character to the 2544 * current attribute's value. 2545 */ 2546 appendLongStrBuf(c); 2547 /* 2548 * Stay in the attribute value (unquoted) state. 2549 */ 2550 continue; 2551 } 2552 } 2553 } 2554 2555 /** 2556 * Entity in attribute value state 2557 * 2558 * @throws IOException 2559 * @throws SAXException 2560 */ 2561 private void entityInAttributeValueState() throws SAXException, IOException { 2562 /* 2563 * Attempt to consume an entity. 2564 */ 2565 consumeEntity(true); 2566 /* 2567 * If nothing is returned, append a U+0026 AMPERSAND character to the 2568 * current attribute's value. 2569 * 2570 * Otherwise, append the returned character token to the current 2571 * attribute's value. 2572 */ 2573 // handled in consumeEntity(); 2574 /* 2575 * Finally, switch back to the attribute value state that you were in 2576 * when were switched into this state. 2577 */ 2578 return; 2579 } 2580 2581 /** 2582 * Bogus comment state 2583 * 2584 * @throws IOException 2585 * @throws SAXException 2586 */ 2587 private void bogusCommentState() throws SAXException, IOException { 2588 /* 2589 * (This can only happen if the content model flag is set to the PCDATA 2590 * state.) 2591 * 2592 * Consume every character up to the first U+003E GREATER-THAN SIGN 2593 * character (>) or the end of the file (EOF), whichever comes first. 2594 * Emit a comment token whose data is the concatenation of all the 2595 * characters starting from and including the character that caused the 2596 * state machine to switch into the bogus comment state, up to and 2597 * including the last consumed character before the U+003E character, if 2598 * any, or up to the end of the file otherwise. (If the comment was 2599 * started by the end of the file (EOF), the token is empty.) 2600 * 2601 * Switch to the data state. 2602 * 2603 * If the end of the file was reached, reconsume the EOF character. 
2604 */ 2605 for (;;) { 2606 char c = read(); 2607 switch (c) { 2608 case '>': 2609 emitComment(); 2610 return; 2611 case '\u0000': 2612 emitComment(); 2613 unread(c); 2614 return; 2615 default: 2616 appendToComment(c); 2617 } 2618 } 2619 } 2620 2621 /** 2622 * Markup declaration open state 2623 * 2624 * @throws IOException 2625 * @throws SAXException 2626 */ 2627 private void markupDeclarationOpenState() throws SAXException, IOException { 2628 /* 2629 * (This can only happen if the content model flag is set to the PCDATA 2630 * state.) 2631 */ 2632 clearLongStrBuf(); 2633 /* 2634 * If the next two characters are both U+002D HYPHEN-MINUS (-) 2635 * characters, consume those two characters, create a comment token 2636 * whose data is the empty string, and switch to the comment start 2637 * state. 2638 * 2639 * Otherwise if the next seven characters are a case-insensitive match 2640 * for the word "DOCTYPE", then consume those characters and switch to 2641 * the DOCTYPE state. 2642 * 2643 * Otherwise, is is a parse error. Switch to the bogus comment state. 2644 * The next character that is consumed, if any, is the first character 2645 * that will be in the comment. 
2646 */ 2647 char c = read(); 2648 switch (c) { 2649 case '-': 2650 c = read(); 2651 if (c == '-') { 2652 commentStates(); 2653 return; 2654 } else { 2655 err("Bogus comment."); 2656 appendToComment('-'); 2657 unread(c); 2658 bogusCommentState(); 2659 return; 2660 } 2661 case 'd': 2662 case 'D': 2663 appendToComment(c); 2664 for (int i = 0; i < OCTYPE.length; i++) { 2665 c = read(); 2666 char folded = c; 2667 if (c >= 'A' && c <= 'Z') { 2668 folded += 0x20; 2669 } 2670 if (folded == OCTYPE[i]) { 2671 appendToComment(c); 2672 } else { 2673 err("Bogus comment."); 2674 unread(c); 2675 bogusCommentState(); 2676 return; 2677 } 2678 } 2679 doctypeState(); 2680 return; 2681 default: 2682 err("Bogus comment."); 2683 unread(c); 2684 bogusCommentState(); 2685 return; 2686 } 2687 } 2688 2689 private enum CommentState { 2690 COMMENT_START_STATE, COMMENT_START_DASH_STATE, COMMENT_STATE, COMMENT_END_DASH_STATE, COMMENT_END_STATE 2691 } 2692 2693 /** 2694 * Comment start state, Comment start dash state, Comment state, Comment end 2695 * dash state and Comment end state 2696 * 2697 * @throws IOException 2698 * @throws SAXException 2699 */ 2700 private void commentStates() throws SAXException, IOException { 2701 CommentState state = CommentState.COMMENT_START_STATE; 2702 for (;;) { 2703 char c = read(); 2704 switch (state) { 2705 case COMMENT_START_STATE: 2706 /* 2707 * Comment start state 2708 * 2709 * 2710 * Consume the next input character: 2711 */ 2712 switch (c) { 2713 case '-': 2714 /* 2715 * U+002D HYPHEN-MINUS (-) Switch to the comment 2716 * start dash state. 2717 */ 2718 state = CommentState.COMMENT_START_DASH_STATE; 2719 continue; 2720 case '>': 2721 /* 2722 * U+003E GREATER-THAN SIGN (>) Parse error. 2723 */ 2724 err("Premature end of comment."); 2725 /* Emit the comment token. */ 2726 emitComment(); 2727 /* 2728 * Switch to the data state. 2729 */ 2730 return; 2731 case '\u0000': 2732 /* 2733 * EOF Parse error. 
2734 */ 2735 err("End of file inside comment."); 2736 /* Emit the comment token. */ 2737 emitComment(); 2738 /* 2739 * Reconsume the EOF character in the data state. 2740 */ 2741 unread(c); 2742 return; 2743 default: 2744 /* 2745 * Anything else Append the input character to the 2746 * comment token's data. 2747 */ 2748 appendToComment(c); 2749 /* 2750 * Switch to the comment state. 2751 */ 2752 state = CommentState.COMMENT_STATE; 2753 continue; 2754 } 2755 case COMMENT_START_DASH_STATE: 2756 /* 2757 * Comment start dash state 2758 * 2759 * Consume the next input character: 2760 */ 2761 switch (c) { 2762 case '-': 2763 /* 2764 * U+002D HYPHEN-MINUS (-) Switch to the comment end 2765 * state 2766 */ 2767 state = CommentState.COMMENT_END_STATE; 2768 continue; 2769 case '>': 2770 /* 2771 * U+003E GREATER-THAN SIGN (>) Parse error. 2772 */ 2773 err("Premature end of comment."); 2774 /* Emit the comment token. */ 2775 emitComment(); 2776 /* 2777 * Switch to the data state. 2778 */ 2779 return; 2780 case '\u0000': 2781 /* 2782 * EOF Parse error. 2783 */ 2784 err("End of file inside comment."); 2785 /* Emit the comment token. */ 2786 emitComment(); 2787 /* 2788 * Reconsume the EOF character in the data state. 2789 */ 2790 unread(c); 2791 return; 2792 default: 2793 /* 2794 * Anything else Append a U+002D HYPHEN-MINUS (-) 2795 * character and the input character to the comment 2796 * token's data. 2797 */ 2798 appendToComment('-'); 2799 appendToComment(c); 2800 /* 2801 * Switch to the comment state. 2802 */ 2803 state = CommentState.COMMENT_STATE; 2804 continue; 2805 } 2806 case COMMENT_STATE: 2807 /* 2808 * Comment state Consume the next input character: 2809 */ 2810 switch (c) { 2811 case '-': 2812 /* 2813 * U+002D HYPHEN-MINUS (-) Switch to the comment end 2814 * dash state 2815 */ 2816 state = CommentState.COMMENT_END_DASH_STATE; 2817 continue; 2818 case '\u0000': 2819 /* 2820 * EOF Parse error. 
2821 */ 2822 err("End of file inside comment."); 2823 /* Emit the comment token. */ 2824 emitComment(); 2825 /* 2826 * Reconsume the EOF character in the data state. 2827 */ 2828 unread(c); 2829 return; 2830 default: 2831 /* 2832 * Anything else Append the input character to the 2833 * comment token's data. 2834 */ 2835 appendToComment(c); 2836 /* 2837 * Stay in the comment state. 2838 */ 2839 continue; 2840 } 2841 case COMMENT_END_DASH_STATE: 2842 /* 2843 * Comment end dash state Consume the next input character: 2844 */ 2845 switch (c) { 2846 case '-': 2847 /* 2848 * U+002D HYPHEN-MINUS (-) Switch to the comment end 2849 * state 2850 */ 2851 state = CommentState.COMMENT_END_STATE; 2852 continue; 2853 case '\u0000': 2854 /* 2855 * EOF Parse error. 2856 */ 2857 err("End of file inside comment."); 2858 /* Emit the comment token. */ 2859 emitComment(); 2860 /* 2861 * Reconsume the EOF character in the data state. 2862 */ 2863 unread(c); 2864 return; 2865 default: 2866 /* 2867 * Anything else Append a U+002D HYPHEN-MINUS (-) 2868 * character and the input character to the comment 2869 * token's data. 2870 */ 2871 appendToComment('-'); 2872 appendToComment(c); 2873 /* 2874 * Switch to the comment state. 2875 */ 2876 state = CommentState.COMMENT_STATE; 2877 continue; 2878 } 2879 case COMMENT_END_STATE: 2880 /* 2881 * Comment end dash state Consume the next input character: 2882 */ 2883 switch (c) { 2884 case '>': 2885 /* 2886 * U+003E GREATER-THAN SIGN (>) Emit the comment 2887 * token. 2888 */ 2889 emitComment(); 2890 /* 2891 * Switch to the data state. 2892 */ 2893 return; 2894 case '-': 2895 /* U+002D HYPHEN-MINUS (-) Parse error. */ 2896 err("Consecutive hyphens did not terminate a comment."); 2897 /* 2898 * Append a U+002D HYPHEN-MINUS (-) character to the 2899 * comment token's data. 2900 */ 2901 appendToComment('-'); 2902 /* 2903 * Stay in the comment end state. 2904 */ 2905 continue; 2906 case '\u0000': 2907 /* 2908 * EOF Parse error. 
2909 */ 2910 err("End of file inside comment."); 2911 /* Emit the comment token. */ 2912 emitComment(); 2913 /* 2914 * Reconsume the EOF character in the data state. 2915 */ 2916 unread(c); 2917 return; 2918 default: 2919 /* 2920 * Anything else Parse error. 2921 */ 2922 err("Consecutive hyphens did not terminate a comment."); 2923 /* 2924 * Append two U+002D HYPHEN-MINUS (-) characters and 2925 * the input character to the comment token's data. 2926 */ 2927 appendToComment('-'); 2928 appendToComment('-'); 2929 appendToComment(c); 2930 /* 2931 * Switch to the comment state. 2932 */ 2933 state = CommentState.COMMENT_STATE; 2934 continue; 2935 } 2936 } 2937 } 2938 } 2939 2940 /** 2941 * DOCTYPE state 2942 * 2943 * @throws IOException 2944 * @throws SAXException 2945 */ 2946 private void doctypeState() throws SAXException, IOException { 2947 systemIdentifier = null; 2948 publicIdentifier = null; 2949 doctypeName = null; 2950 /* 2951 * Consume the next input character: 2952 */ 2953 char c = read(); 2954 switch (c) { 2955 case ' ': 2956 case '\t': 2957 case '\n': 2958 case '\u000B': 2959 case '\u000C': 2960 /* 2961 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B LINE 2962 * TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch to the 2963 * before DOCTYPE name state. 2964 */ 2965 beforeDoctypeNameState(); 2966 return; 2967 default: 2968 /* 2969 * Anything else Parse error. 2970 */ 2971 err("Missing space before doctype name."); 2972 /* 2973 * Reconsume the current character in the before DOCTYPE name 2974 * state. 
2975 */ 2976 unread(c); 2977 beforeDoctypeNameState(); 2978 return; 2979 } 2980 } 2981 2982 /** 2983 * Before DOCTYPE name state 2984 * 2985 * @throws IOException 2986 * @throws SAXException 2987 */ 2988 private void beforeDoctypeNameState() throws SAXException, IOException { 2989 for (;;) { 2990 /* 2991 * Consume the next input character: 2992 */ 2993 char c = read(); 2994 switch (c) { 2995 case ' ': 2996 case '\t': 2997 case '\n': 2998 case '\u000B': 2999 case '\u000C': 3000 /* 3001 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B 3002 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay 3003 * in the before DOCTYPE name state. 3004 */ 3005 continue; 3006 case '>': 3007 /* 3008 * U+003E GREATER-THAN SIGN (>) Parse error. 3009 */ 3010 err("Nameless doctype."); 3011 /* 3012 * Create a new DOCTYPE token. Set its correctness flag to 3013 * incorrect. Emit the token. 3014 */ 3015 tokenHandler.doctype("", null, null, false); 3016 /* 3017 * Switch to the data state. 3018 */ 3019 return; 3020 case '\u0000': 3021 /* EOF Parse error. */ 3022 err("End of file inside doctype."); 3023 /* 3024 * Create a new DOCTYPE token. Set its correctness flag to 3025 * incorrect. Emit the token. 3026 */ 3027 tokenHandler.doctype("", null, null, false); 3028 /* 3029 * Reconsume the EOF character in the data state. 3030 */ 3031 unread(c); 3032 return; 3033 default: 3034 /* Anything else Create a new DOCTYPE token. */ 3035 clearStrBuf(); 3036 /* 3037 * Set the token's name name to the current input character. 3038 */ 3039 appendStrBuf(c); 3040 /* 3041 * Switch to the DOCTYPE name state. 
3042 */ 3043 doctypeNameState(); 3044 return; 3045 } 3046 } 3047 } 3048 3049 /** 3050 * DOCTYPE name state 3051 * 3052 * @throws IOException 3053 * @throws SAXException 3054 */ 3055 private void doctypeNameState() throws SAXException, IOException { 3056 for (;;) { 3057 /* 3058 * First, consume the next input character: 3059 */ 3060 char c = read(); 3061 switch (c) { 3062 case ' ': 3063 case '\t': 3064 case '\n': 3065 case '\u000B': 3066 case '\u000C': 3067 /* 3068 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B 3069 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch 3070 * to the after DOCTYPE name state. 3071 */ 3072 doctypeName = strBufToString(); 3073 afterDoctypeNameState(); 3074 return; 3075 case '>': 3076 /* 3077 * U+003E GREATER-THAN SIGN (>) Emit the current DOCTYPE 3078 * token. 3079 */ 3080 tokenHandler.doctype(strBufToString(), null, null, true); 3081 /* 3082 * Switch to the data state. 3083 */ 3084 return; 3085 case '\u0000': 3086 /* EOF Parse error. */ 3087 err("End of file inside doctype."); 3088 /* 3089 * Set the DOCTYPE token's correctness flag to incorrect. 3090 * Emit that DOCTYPE token. 3091 */ 3092 tokenHandler.doctype(strBufToString(), null, null, false); 3093 /* 3094 * Reconsume the EOF character in the data state. 3095 */ 3096 unread(c); 3097 return; 3098 default: 3099 /* 3100 * Anything else Append the current input character to the 3101 * current DOCTYPE token's name. 3102 */ 3103 appendStrBuf(c); 3104 /* 3105 * Stay in the DOCTYPE name state. 
3106 */ 3107 continue; 3108 } 3109 } 3110 } 3111 3112 /** 3113 * After DOCTYPE name state 3114 * 3115 * @throws IOException 3116 * @throws SAXException 3117 */ 3118 private void afterDoctypeNameState() throws SAXException, IOException { 3119 for (;;) { 3120 /* 3121 * Consume the next input character: 3122 */ 3123 char c = read(); 3124 switch (c) { 3125 case ' ': 3126 case '\t': 3127 case '\n': 3128 case '\u000B': 3129 case '\u000C': 3130 /* 3131 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B 3132 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay 3133 * in the after DOCTYPE name state. 3134 */ 3135 continue; 3136 case '>': 3137 /* 3138 * U+003E GREATER-THAN SIGN (>) Emit the current DOCTYPE 3139 * token. 3140 */ 3141 tokenHandler.doctype(doctypeName, null, null, true); 3142 /* 3143 * Switch to the data state. 3144 */ 3145 return; 3146 case '\u0000': 3147 /* EOF Parse error. */ 3148 err("End of file inside doctype."); 3149 /* 3150 * Set the DOCTYPE token's correctness flag to incorrect. 3151 * Emit that DOCTYPE token. 3152 */ 3153 tokenHandler.doctype(doctypeName, null, null, false); 3154 /* 3155 * Reconsume the EOF character in the data state. 3156 */ 3157 unread(c); 3158 return; 3159 case 'p': 3160 case 'P': 3161 /* 3162 * If the next six characters are a case-insensitive match 3163 * for the word "PUBLIC", then consume those characters and 3164 * switch to the before DOCTYPE public identifier state. 
3165 */ 3166 for (int i = 0; i < UBLIC.length; i++) { 3167 c = read(); 3168 char folded = c; 3169 if (c >= 'A' && c <= 'Z') { 3170 folded += 0x20; 3171 } 3172 if (folded != UBLIC[i]) { 3173 err("Bogus doctype."); 3174 unread(c); 3175 bogusDoctypeState(); 3176 return; 3177 } 3178 } 3179 beforeDoctypePublicIdentifierState(); 3180 return; 3181 case 's': 3182 case 'S': 3183 /* 3184 * Otherwise, if the next six characters are a 3185 * case-insensitive match for the word "SYSTEM", then 3186 * consume those characters and switch to the before DOCTYPE 3187 * system identifier state. 3188 */ 3189 for (int i = 0; i < YSTEM.length; i++) { 3190 c = read(); 3191 char folded = c; 3192 if (c >= 'A' && c <= 'Z') { 3193 folded += 0x20; 3194 } 3195 if (folded != YSTEM[i]) { 3196 err("Bogus doctype."); 3197 unread(c); 3198 bogusDoctypeState(); 3199 return; 3200 } 3201 } 3202 beforeDoctypeSystemIdentifierState(); 3203 return; 3204 default: 3205 /* 3206 * Otherwise, this is the parse error. 3207 */ 3208 err("Bogus doctype."); 3209 /* 3210 * Switch to the bogus DOCTYPE state. 3211 */ 3212 bogusDoctypeState(); 3213 return; 3214 } 3215 } 3216 } 3217 3218 /** 3219 * Before DOCTYPE public identifier state 3220 * 3221 * @throws IOException 3222 * @throws SAXException 3223 */ 3224 private void beforeDoctypePublicIdentifierState() throws SAXException, 3225 IOException { 3226 for (;;) { 3227 /* 3228 * Consume the next input character: 3229 */ 3230 char c = read(); 3231 switch (c) { 3232 case ' ': 3233 case '\t': 3234 case '\n': 3235 case '\u000B': 3236 case '\u000C': 3237 /* 3238 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B 3239 * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay 3240 * in the before DOCTYPE public identifier state. 
3241 */ 3242 continue; 3243 case '"': 3244 /* 3245 * U+0022 QUOTATION MARK (") Set the DOCTYPE token's public 3246 * identifier to the empty string, 3247 */ 3248 clearLongStrBuf(); 3249 /* 3250 * then switch to the DOCTYPE public identifier 3251 * (double-quoted) state. 3252 */ 3253 doctypePublicIdentifierDoubleQuotedState(); 3254 return; 3255 case '\'': 3256 /* 3257 * U+0027 APOSTROPHE (') Set the DOCTYPE token's public 3258 * identifier to the empty string, 3259 */ 3260 clearLongStrBuf(); 3261 /* 3262 * then switch to the DOCTYPE public identifier 3263 * (single-quoted) state. 3264 */ 3265 doctypePublicIdentifierSingleQuotedState(); 3266 return; 3267 case '>': 3268 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 3269 err("Expected a public identifier but the doctype ended."); 3270 /* 3271 * Set the DOCTYPE token's correctness flag to incorrect. 3272 * Emit that DOCTYPE token. 3273 */ 3274 tokenHandler.doctype(doctypeName, null, null, false); 3275 /* 3276 * Switch to the data state. 3277 */ 3278 return; 3279 case '\u0000': 3280 /* EOF Parse error. */ 3281 err("End of file inside a doctype."); 3282 /* 3283 * Set the DOCTYPE token's correctness flag to incorrect. 3284 * Emit that DOCTYPE token. 3285 */ 3286 tokenHandler.doctype(doctypeName, null, null, false); 3287 /* 3288 * Reconsume the EOF character in the data state. 3289 */ 3290 unread(c); 3291 return; 3292 default: 3293 /* Anything else Parse error. */ 3294 err("Bogus doctype."); 3295 /* 3296 * Switch to the bogus DOCTYPE state. 
*/
                    bogusDoctypeState();
                    return;
            }
        }
    }

    /**
     * DOCTYPE public identifier (double-quoted) state.
     *
     * Appends characters to the long string buffer until a closing quotation
     * mark is read, then stores the buffer as the public identifier and
     * continues in the after DOCTYPE public identifier state. On EOF (read as
     * U+0000) the doctype is reported to the token handler with the
     * correctness flag set to false and the character is unread.
     *
     * @throws IOException
     *             propagated from the helpers this state calls
     * @throws SAXException
     *             propagated from the helpers this state calls
     */
    private void doctypePublicIdentifierDoubleQuotedState()
            throws SAXException, IOException {
        for (;;) {
            /*
             * Consume the next input character:
             */
            char c = read();
            switch (c) {
                case '"':
                    /*
                     * U+0022 QUOTATION MARK (") Switch to the after DOCTYPE
                     * public identifier state.
                     */
                    publicIdentifier = longStrBufToString();
                    afterDoctypePublicIdentifierState();
                    return;
                case '\u0000':
                    /* EOF Parse error. */
                    err("End of file inside public identifier.");
                    /*
                     * Set the DOCTYPE token's correctness flag to incorrect.
                     * Emit that DOCTYPE token.
                     */
                    tokenHandler.doctype(doctypeName, longStrBufToString(),
                            null, false);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    unread(c);
                    return;
                default:
                    /*
                     * Anything else Append the current input character to the
                     * current DOCTYPE token's public identifier.
                     */
                    appendLongStrBuf(c);
                    /*
                     * Stay in the DOCTYPE public identifier (double-quoted)
                     * state.
                     */
                    continue;
            }
        }
    }

    /**
     * DOCTYPE public identifier (single-quoted) state.
     *
     * Same as the double-quoted variant except that the identifier is
     * terminated by an apostrophe.
     *
     * @throws IOException
     *             propagated from the helpers this state calls
     * @throws SAXException
     *             propagated from the helpers this state calls
     */
    private void doctypePublicIdentifierSingleQuotedState()
            throws SAXException, IOException {
        for (;;) {
            /*
             * Consume the next input character:
             */
            char c = read();
            switch (c) {
                case '\'':
                    /*
                     * U+0027 APOSTROPHE (') Switch to the after DOCTYPE public
                     * identifier state.
                     */
                    publicIdentifier = longStrBufToString();
                    afterDoctypePublicIdentifierState();
                    return;
                case '\u0000':
                    /* EOF Parse error. */
                    err("End of file inside public identifier.");
                    /*
                     * Set the DOCTYPE token's correctness flag to incorrect.
                     * Emit that DOCTYPE token.
                     */
                    tokenHandler.doctype(doctypeName, longStrBufToString(),
                            null, false);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    unread(c);
                    return;
                default:
                    /*
                     * Anything else Append the current input character to the
                     * current DOCTYPE token's public identifier.
                     */
                    appendLongStrBuf(c);
                    /*
                     * Stay in the DOCTYPE public identifier (single-quoted)
                     * state.
                     */
                    continue;
            }
        }
    }

    /**
     * After DOCTYPE public identifier state.
     *
     * Skips white space, then either starts a quoted system identifier,
     * emits the doctype on "&gt;", reports EOF, or falls into the bogus
     * DOCTYPE state for any other character.
     *
     * @throws IOException
     *             propagated from the helpers this state calls
     * @throws SAXException
     *             propagated from the helpers this state calls
     */
    private void afterDoctypePublicIdentifierState() throws SAXException,
            IOException {
        for (;;) {
            /*
             * Consume the next input character:
             */
            char c = read();
            switch (c) {
                case ' ':
                case '\t':
                case '\n':
                case '\u000B':
                case '\u000C':
                    /*
                     * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
                     * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
                     * in the after DOCTYPE public identifier state.
                     */
                    continue;
                case '"':
                    /*
                     * U+0022 QUOTATION MARK (") Set the DOCTYPE token's system
                     * identifier to the empty string,
                     */
                    clearLongStrBuf();
                    /*
                     * then switch to the DOCTYPE system identifier
                     * (double-quoted) state.
                     */
                    doctypeSystemIdentifierDoubleQuotedState();
                    return;
                case '\'':
                    /*
                     * U+0027 APOSTROPHE (') Set the DOCTYPE token's system
                     * identifier to the empty string,
                     */
                    clearLongStrBuf();
                    /*
                     * then switch to the DOCTYPE system identifier
                     * (single-quoted) state.
                     */
                    doctypeSystemIdentifierSingleQuotedState();
                    return;
                case '>':
                    /*
                     * U+003E GREATER-THAN SIGN (>) Emit the current DOCTYPE
                     * token.
                     */
                    tokenHandler.doctype(doctypeName, publicIdentifier, null,
                            true);
                    /*
                     * Switch to the data state.
                     */
                    return;
                case '\u0000':
                    /* EOF Parse error. */
                    err("End of file inside doctype.");
                    /*
                     * Set the DOCTYPE token's correctness flag to incorrect.
                     * Emit that DOCTYPE token.
                     */
                    tokenHandler.doctype(doctypeName, publicIdentifier, null,
                            false);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    unread(c);
                    return;
                default:
                    /* Anything else Parse error. */
                    err("Bogus doctype.");
                    /*
                     * Switch to the bogus DOCTYPE state.
                     */
                    bogusDoctypeState();
                    return;
            }
        }
    }

    /**
     * Before DOCTYPE system identifier state.
     *
     * Skips white space and expects an opening quote of a system identifier.
     * On "&gt;" or EOF the doctype is emitted with null public and system
     * identifiers and the correctness flag set to false.
     *
     * @throws IOException
     *             propagated from the helpers this state calls
     * @throws SAXException
     *             propagated from the helpers this state calls
     */
    private void beforeDoctypeSystemIdentifierState() throws SAXException,
            IOException {
        for (;;) {
            /*
             * Consume the next input character:
             */
            char c = read();
            switch (c) {
                case ' ':
                case '\t':
                case '\n':
                case '\u000B':
                case '\u000C':
                    /*
                     * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
                     * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
                     * in the before DOCTYPE system identifier state.
                     */
                    continue;
                case '"':
                    /*
                     * U+0022 QUOTATION MARK (") Set the DOCTYPE token's system
                     * identifier to the empty string,
                     */
                    clearLongStrBuf();
                    /*
                     * then switch to the DOCTYPE system identifier
                     * (double-quoted) state.
                     */
                    doctypeSystemIdentifierDoubleQuotedState();
                    return;
                case '\'':
                    /*
                     * U+0027 APOSTROPHE (') Set the DOCTYPE token's system
                     * identifier to the empty string,
                     */
                    clearLongStrBuf();
                    /*
                     * then switch to the DOCTYPE system identifier
                     * (single-quoted) state.
                     */
                    doctypeSystemIdentifierSingleQuotedState();
                    return;
                case '>':
                    /* U+003E GREATER-THAN SIGN (>) Parse error. */
                    err("Expected a system identifier but the doctype ended.");
                    /*
                     * Set the DOCTYPE token's correctness flag to incorrect.
                     * Emit that DOCTYPE token.
                     */
                    tokenHandler.doctype(doctypeName, null, null, false);
                    /*
                     * Switch to the data state.
                     */
                    return;
                case '\u0000':
                    /* EOF Parse error. */
                    err("End of file inside a doctype.");
                    /*
                     * Set the DOCTYPE token's correctness flag to incorrect.
                     * Emit that DOCTYPE token.
                     */
                    tokenHandler.doctype(doctypeName, null, null, false);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    unread(c);
                    return;
                default:
                    /* Anything else Parse error. */
                    err("Bogus doctype.");
                    /*
                     * Switch to the bogus DOCTYPE state.
                     */
                    bogusDoctypeState();
                    return;
            }
        }
    }

    /**
     * DOCTYPE system identifier (double-quoted) state.
     *
     * Appends characters to the long string buffer until a closing quotation
     * mark is read, then stores the buffer as the system identifier and
     * continues in the after DOCTYPE system identifier state. On EOF the
     * doctype is emitted with the partial identifier and the correctness
     * flag set to false.
     *
     * @throws IOException
     *             propagated from the helpers this state calls
     * @throws SAXException
     *             propagated from the helpers this state calls
     */
    private void doctypeSystemIdentifierDoubleQuotedState()
            throws SAXException, IOException {
        for (;;) {
            /*
             * Consume the next input character:
             */
            char c = read();
            switch (c) {
                case '"':
                    /*
                     * U+0022 QUOTATION MARK (") Switch to the after DOCTYPE
                     * system identifier state.
                     */
                    systemIdentifier = longStrBufToString();
                    afterDoctypeSystemIdentifierState();
                    return;
                case '\u0000':
                    /* EOF Parse error. */
                    err("End of file inside system identifier.");
                    /*
                     * Set the DOCTYPE token's correctness flag to incorrect.
                     * Emit that DOCTYPE token.
                     */
                    tokenHandler.doctype(doctypeName, publicIdentifier,
                            longStrBufToString(), false);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    unread(c);
                    return;
                default:
                    /*
                     * Anything else Append the current input character to the
                     * current DOCTYPE token's system identifier.
                     */
                    appendLongStrBuf(c);
                    /*
                     * Stay in the DOCTYPE system identifier (double-quoted)
                     * state.
                     */
                    continue;
            }
        }
    }

    /**
     * DOCTYPE system identifier (single-quoted) state.
     *
     * Same as the double-quoted variant except that the identifier is
     * terminated by an apostrophe.
     *
     * @throws IOException
     *             propagated from the helpers this state calls
     * @throws SAXException
     *             propagated from the helpers this state calls
     */
    private void doctypeSystemIdentifierSingleQuotedState()
            throws SAXException, IOException {
        for (;;) {
            /*
             * Consume the next input character:
             */
            char c = read();
            switch (c) {
                case '\'':
                    /*
                     * U+0027 APOSTROPHE (') Switch to the after DOCTYPE system
                     * identifier state.
                     */
                    systemIdentifier = longStrBufToString();
                    afterDoctypeSystemIdentifierState();
                    return;
                case '\u0000':
                    /* EOF Parse error. */
                    err("End of file inside system identifier.");
                    /*
                     * Set the DOCTYPE token's correctness flag to incorrect.
                     * Emit that DOCTYPE token.
                     */
                    tokenHandler.doctype(doctypeName, publicIdentifier,
                            longStrBufToString(), false);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    unread(c);
                    return;
                default:
                    /*
                     * Anything else Append the current input character to the
                     * current DOCTYPE token's system identifier.
                     */
                    appendLongStrBuf(c);
                    /*
                     * Stay in the DOCTYPE system identifier (single-quoted)
                     * state.
                     */
                    continue;
            }
        }
    }

    /**
     * After DOCTYPE system identifier state.
     *
     * Skips white space, emits the complete doctype on "&gt;", reports EOF,
     * and falls into the bogus DOCTYPE state for any other character.
     *
     * @throws IOException
     *             propagated from the helpers this state calls
     * @throws SAXException
     *             propagated from the helpers this state calls
     */
    private void afterDoctypeSystemIdentifierState() throws SAXException,
            IOException {
        for (;;) {
            /*
             * Consume the next input character:
             */
            char c = read();
            switch (c) {
                case ' ':
                case '\t':
                case '\n':
                case '\u000B':
                case '\u000C':
                    /*
                     * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
                     * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
                     * in the after DOCTYPE system identifier state.
                     */
                    continue;
                case '>':
                    /*
                     * U+003E GREATER-THAN SIGN (>) Emit the current DOCTYPE
                     * token.
                     */
                    tokenHandler.doctype(doctypeName, publicIdentifier,
                            systemIdentifier, true);
                    /*
                     * Switch to the data state.
                     */
                    return;
                case '\u0000':
                    /* EOF Parse error. */
                    err("End of file inside doctype.");
                    /*
                     * Set the DOCTYPE token's correctness flag to incorrect.
                     * Emit that DOCTYPE token.
                     */
                    tokenHandler.doctype(doctypeName, publicIdentifier,
                            systemIdentifier, false);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    unread(c);
                    return;
                default:
                    /* Anything else Parse error. */
                    err("Bogus doctype.");
                    /*
                     * Switch to the bogus DOCTYPE state.
                     */
                    bogusDoctypeState();
                    return;
            }
        }
    }

    /**
     * Bogus DOCTYPE state.
     *
     * Discards input up to and including the next "&gt;" (or up to EOF) and
     * then emits the doctype with the correctness flag set to false, using
     * whatever public and system identifier fields currently hold.
     *
     * @throws IOException
     *             propagated from the helpers this state calls
     * @throws SAXException
     *             propagated from the helpers this state calls
     */
    private void bogusDoctypeState() throws SAXException, IOException {
        for (;;) {
            /*
             * Consume the next input character:
             */
            char c = read();
            switch (c) {
                case '>':
                    /*
                     * U+003E GREATER-THAN SIGN (>) Set the DOCTYPE token's
                     * correctness flag to incorrect. Emit that DOCTYPE token.
                     */
                    tokenHandler.doctype(doctypeName, publicIdentifier,
                            systemIdentifier, false);
                    /*
                     * Switch to the data state.
                     */
                    return;
                case '\u0000':
                    /* EOF Parse error. */
                    err("End of file inside doctype.");
                    /*
                     * Set the DOCTYPE token's correctness flag to incorrect.
                     * Emit that DOCTYPE token.
                     */
                    tokenHandler.doctype(doctypeName, publicIdentifier,
                            systemIdentifier, false);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    unread(c);
                    return;
                default:
                    /*
                     * Anything else Stay in the bogus DOCTYPE state.
                     */
                    continue;
            }
        }
    }

    /**
     * Consume entity
     *
     * Unlike the definition in the spec, this method does not return a value
     * and never requires the caller to backtrack. This method takes care of
     * emitting characters or appending to the current attribute value. It also
     * takes care of that in the case when consuming the entity fails.
     *
     * @param inAttribute
     *            <code>true</code> to append the result to the long string
     *            buffer (attribute value), <code>false</code> to emit it
     *            through the token handler
     * @throws IOException
     *             propagated from the helpers this method calls
     * @throws SAXException
     *             propagated from the helpers this method calls
     */
    private void consumeEntity(boolean inAttribute) throws SAXException,
            IOException {
        clearStrBuf();
        appendStrBuf('&');
        /*
         * This section defines how to consume an entity. This definition is
         * used when parsing entities in text and in attributes.
         *
         * The behaviour depends on the identity of the next character (the one
         * immediately after the U+0026 AMPERSAND character):
         */
        char c = read();
        switch (c) {
            case ' ':
            case '\t':
            case '\n':
            case '\u000B':
            case '\u000C':
            case '<':
            case '&':
            case '\u0000':
                /*
                 * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B LINE
                 * TABULATION U+000C FORM FEED (FF) U+0020 SPACE U+003C
                 * LESS-THAN SIGN U+0026 AMPERSAND EOF Not an entity. No
                 * characters are consumed, and nothing is returned. (This is
                 * not an error, either.)
                 */
                if (inAttribute) {
                    appendStrBufToLongStrBuf();
                } else {
                    emitStrBuf();
                }
                unread(c);
                return;
            case '#':
                /*
                 * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER SIGN.
                 */
                appendStrBuf('#');
                consumeNCR(inAttribute);
                return;
            default:
                unread(c);
                // entCol: index of the character being matched within the
                // entity name. lo/hi: inclusive bounds of the Entities.NAMES
                // entries still compatible with the characters read so far
                // (NOTE(review): this narrowing presumably relies on NAMES
                // being sorted -- confirm in Entities). candidate: index of
                // the longest fully matched name, or -1. strBufMark: strBufLen
                // at the time of that match, so strBuf[strBufMark..strBufLen)
                // holds the characters read past the matched name.
                int entCol = -1;
                int lo = 0;
                int hi = (Entities.NAMES.length - 1);
                int candidate = -1;
                int strBufMark = 0;
                outer: for (;;) {
                    entCol++;
                    c = read();
                    /*
                     * Anything else Consume the maximum number of characters
                     * possible, with the consumed characters case-sensitively
                     * matching one of the identifiers in the first column of
                     * the entities table.
                     */
                    hiloop: for (;;) {
                        if (hi == -1) {
                            break;
                        }
                        if (entCol == Entities.NAMES[hi].length()) {
                            break hiloop;
                        }
                        if (entCol > Entities.NAMES[hi].length()) {
                            break outer;
                        } else if (c < Entities.NAMES[hi].charAt(entCol)) {
                            hi--;
                        } else {
                            break hiloop;
                        }
                    }

                    loloop: for (;;) {
                        if (hi < lo) {
                            break outer;
                        }
                        if (entCol == Entities.NAMES[lo].length()) {
                            candidate = lo;
                            strBufMark = strBufLen;
                            lo++;
                        } else if (entCol > Entities.NAMES[lo].length()) {
                            break outer;
                        } else if (c > Entities.NAMES[lo].charAt(entCol)) {
                            lo++;
                        } else {
                            break loloop;
                        }
                    }
                    if (hi < lo) {
                        break outer;
                    }
                    appendStrBuf(c);
                }
                unread(c);
                // TODO warn about apos (IE) and TRADE (Opera)
                if (candidate == -1) {
                    /* If no match can be made, then this is a parse error. */
                    err("Text after \u201C&\u201D did not match an entity name.");
                    /*
                     * No characters are consumed, and nothing is returned.
                     */
                    if (inAttribute) {
                        appendStrBufToLongStrBuf();
                    } else {
                        emitStrBuf();
                    }
                    return;
                } else {
                    if (!Entities.NAMES[candidate].endsWith(";")) {
                        /*
                         * If the last character matched is not a U+003B
                         * SEMICOLON (;), there is a parse error.
                         */
                        err("Entity reference was not terminated by a semicolon.");
                        if (inAttribute) {
                            /*
                             * If the entity is being consumed as part of an
                             * attribute, and the last character matched is not
                             * a U+003B SEMICOLON (;),
                             */
                            if (strBufMark == strBufLen) {
                                // Nothing was buffered past the match, so
                                // peek at the next input character.
                                c = read();
                                unread(c);
                            } else {
                                c = strBuf[strBufMark];
                            }
                            if ((c >= '0' && c <= '9')
                                    || (c >= 'A' && c <= 'Z')
                                    || (c >= 'a' && c <= 'z')) {
                                /*
                                 * and the next character is in the range U+0030
                                 * DIGIT ZERO to U+0039 DIGIT NINE, U+0041 LATIN
                                 * CAPITAL LETTER A to U+005A LATIN CAPITAL
                                 * LETTER Z, or U+0061 LATIN SMALL LETTER A to
                                 * U+007A LATIN SMALL LETTER Z, then, for
                                 * historical reasons, all the characters that
                                 * were matched after the U+0026 AMPERSAND (&)
                                 * must be unconsumed, and nothing is returned.
                                 */
                                appendStrBufToLongStrBuf();
                                return;
                            }
                        }
                    }

                    /*
                     * Otherwise, return a character token for the character
                     * corresponding to the entity name (as given by the second
                     * column of the entities table).
                     */
                    char[] val = Entities.VALUES[candidate];
                    emitOrAppend(val, inAttribute);
                    // this is so complicated!
                    // Output the characters that were buffered after the
                    // matched name; they are ordinary text.
                    if (strBufMark < strBufLen) {
                        if (inAttribute) {
                            for (int i = strBufMark; i < strBufLen; i++) {
                                appendLongStrBuf(strBuf[i]);
                            }
                        } else {
                            tokenHandler.characters(strBuf, strBufMark,
                                    strBufLen - strBufMark);
                        }
                    }
                    return;
                    /*
                     * If the markup contains I'm &notit; I tell you, the entity
                     * is parsed as "not", as in, I'm ¬it; I tell you. But if
                     * the markup was I'm &notin; I tell you, the entity would
                     * be parsed as "notin;", resulting in I'm ∉ I tell you.
                     */
                }

        }
    }

    /**
     * Consumes a numeric character reference. Called by
     * <code>consumeEntity</code> after "&amp;#" has been appended to the
     * string buffer.
     *
     * Reads an optional x/X prefix and then digits, accumulating their value.
     * If at least one digit was seen, the value is passed to
     * <code>handleNCRValue</code> (with a parse error when the terminating
     * semicolon is missing). If no digit was seen, an error is reported and
     * the buffered text is output unchanged.
     *
     * @param inAttribute
     *            <code>true</code> to append output to the long string buffer,
     *            <code>false</code> to emit it
     * @throws IOException
     *             propagated from the helpers this method calls
     * @throws SAXException
     *             propagated from the helpers this method calls
     */
    private void consumeNCR(boolean inAttribute) throws SAXException,
            IOException {
        int prevValue = -1;
        int value = 0;
        boolean seenDigits = false;
        boolean hex = false;
        /*
         * The behaviour further depends on the character after the U+0023
         * NUMBER SIGN:
         */
        char c = read();
        if (c == 'x' || c == 'X') {
            /*
             * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL LETTER X Consume
             * the X.
             *
             * Follow the steps below, but using the range of characters U+0030
             * DIGIT ZERO through to U+0039 DIGIT NINE, U+0061 LATIN SMALL
             * LETTER A through to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
             * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL LETTER F (in
             * other words, 0-9, A-F, a-f).
             *
             * When it comes to interpreting the number, interpret it as a
             * hexadecimal number.
             */
            appendStrBuf(c);
            hex = true;
        } else {
            unread(c);
            /*
             * Anything else Follow the steps below, but using the range of
             * characters U+0030 DIGIT ZERO through to U+0039 DIGIT NINE (i.e.
             * just 0-9).
             *
             * When it comes to interpreting the number, interpret it as a
             * decimal number.
             */
        }
        for (;;) {
            // Deal with overflow gracefully: a value that shrank has wrapped
            // around. A wrap that goes unnoticed can only yield a value that
            // is still far above 0x10FFFF, so the result is out of range
            // either way.
            if (value < prevValue) {
                // Value above Unicode range but within int range
                value = 0x110000;
            }
            prevValue = value;
            /*
             * Consume as many characters as match the range of characters given
             * above.
             */
            c = read();
            if (c >= '0' && c <= '9') {
                seenDigits = true;
                if (hex) {
                    value *= 16;
                } else {
                    value *= 10;
                }
                value += c - '0';
            } else if (hex && c >= 'A' && c <= 'F') {
                seenDigits = true;
                value *= 16;
                value += c - 'A' + 10;
            } else if (hex && c >= 'a' && c <= 'f') {
                seenDigits = true;
                value *= 16;
                value += c - 'a' + 10;
            } else if (c == ';') {
                if (seenDigits) {
                    handleNCRValue(value, inAttribute);
                    return;
                } else {
                    err("No digits after \u201C" + strBufToString() + "\u201D.");
                    appendStrBuf(';');
                    if (inAttribute) {
                        appendStrBufToLongStrBuf();
                    } else {
                        emitStrBuf();
                    }
                    return;
                }
            } else {
                /*
                 * If no characters match the range, then don't consume any
                 * characters (and unconsume the U+0023 NUMBER SIGN character
                 * and, if appropriate, the X character). This is a parse error;
                 * nothing is returned.
                 *
                 * Otherwise, if the next character is a U+003B SEMICOLON,
                 * consume that too. If it isn't, there is a parse error.
                 */
                unread(c);
                if (seenDigits) {
                    err("Character reference was not terminated by a semicolon.");
                    handleNCRValue(value, inAttribute);
                    return;
                } else {
                    err("No digits after \u201C" + strBufToString() + "\u201D.");
                    if (inAttribute) {
                        appendStrBufToLongStrBuf();
                    } else {
                        emitStrBuf();
                    }
                    return;
                }
            }
        }
    }

    /**
     * Outputs the character(s) for the numeric value of a character
     * reference.
     *
     * Values 0x80-0x9F are mapped through
     * <code>Entities.WINDOWS_1252</code>, carriage return becomes LF, and
     * zero, surrogates (0xD800-0xDFFF) and values above 0x10FFFF become the
     * replacement character, each with an error. U+000B and U+000C are
     * subject to <code>contentSpacePolicy</code>; other BMP characters that
     * fail the XML 1.0 check in this method are subject to
     * <code>contentNonXmlCharPolicy</code>. Astral values are output as a
     * surrogate pair.
     *
     * @param value
     *            the number parsed from the character reference
     * @param inAttribute
     *            <code>true</code> to append output to the long string buffer,
     *            <code>false</code> to emit it
     * @throws IOException
     *             propagated from the helpers this method calls
     * @throws SAXException
     *             propagated from the helpers this method calls
     */
    private void handleNCRValue(int value, boolean inAttribute)
            throws SAXException, IOException {
        /*
         * If one or more characters match the range, then take them all and
         * interpret the string of characters as a number (either hexadecimal or
         * decimal as appropriate).
         */
        if (value >= 0x80 && value <= 0x9f) {
            /*
             * If that number is one of the numbers in the first column of the
             * following table, then this is a parse error.
             */
            err("A numeric character reference expanded to the C1 controls range.");
            /*
             * Find the row with that number in the first column, and return a
             * character token for the Unicode character given in the second
             * column of that row.
             */
            char[] val = Entities.WINDOWS_1252[value - 0x80];
            emitOrAppend(val, inAttribute);
            return;
        } else if (value == 0x0D) {
            err("A numeric character reference expanded to carriage return.");
            emitOrAppend(LF, inAttribute);
            return;
        } else if (value == 0) {
            /*
             * Otherwise, if the number is zero, if the number is higher than
             * 0x10FFFF, or if it's one of the surrogate characters (characters
             * in the range 0xD800 to 0xDFFF), then this is a parse error;
             * return a character token for the U+FFFD REPLACEMENT CHARACTER
             * character instead.
             */
            err("Character reference expands to U+0000.");
            emitOrAppend(REPLACEMENT_CHARACTER, inAttribute);
            return;
        } else if ((contentSpacePolicy != XmlViolationPolicy.ALLOW)
                && (value == 0xB || value == 0xC)) {
            if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
                emitOrAppend(SPACE, inAttribute);
            } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
                fatal("A character reference expanded to a space character that is not legal XML 1.0 white space.");
            }
        } else if (value >= 0xD800 && value <= 0xDFFF) {
            // Explicit range check: masking only bits 11-15 of the int would
            // also match valid astral code points such as U+1D800.
            err("Character reference expands to a surrogate.");
            emitOrAppend(REPLACEMENT_CHARACTER, inAttribute);
            return;
        } else if (value <= 0xFFFF) {
            /*
             * Otherwise, return a character token for the Unicode character
             * whose code point is that number.
             */
            char c = (char) value;
            if (c < '\t' || (c > '\r' && c < ' ') || isNonCharacter(c)) {
                if (contentNonXmlCharPolicy != XmlViolationPolicy.FATAL) {
                    if (contentNonXmlCharPolicy == XmlViolationPolicy.ALTER_INFOSET) {
                        c = '\uFFFD';
                    }
                    warn("Character reference expanded to a character that is not a legal XML 1.0 character.");
                } else {
                    fatal("Character reference expanded to a character that is not a legal XML 1.0 character.");
                }
            }
            if (isPrivateUse(c)) {
                warnAboutPrivateUseChar();
            }
            bmpChar[0] = c;
            emitOrAppend(bmpChar, inAttribute);
            return;
        } else if (value <= 0x10FFFF) {
            if (isNonCharacter(value)) {
                warn("Character reference expands to an astral non-character.");
            }
            if (isAstralPrivateUse(value)) {
                warnAboutPrivateUseChar();
            }
            // Encode the code point as a lead/trail surrogate pair.
            astralChar[0] = (char) (LEAD_OFFSET + (value >> 10));
            astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
            emitOrAppend(astralChar, inAttribute);
            return;
        } else {
            err("Character reference outside the permissible Unicode range.");
            emitOrAppend(REPLACEMENT_CHARACTER, inAttribute);
            return;
        }
    }

    /**
     * Outputs characters either into the attribute value being built or to
     * the token handler.
     *
     * @param val
     *            the characters to output (the whole array is used)
     * @param inAttribute
     *            <code>true</code> to append <code>val</code> to the long
     *            string buffer, <code>false</code> to pass it to
     *            <code>tokenHandler.characters</code>
     * @throws SAXException
     *             propagated from the helpers this method calls
     * @throws IOException
     *             propagated from the helpers this method calls
     */
    private void emitOrAppend(char[] val, boolean inAttribute)
            throws SAXException, IOException {
        if (inAttribute) {
            appendLongStrBuf(val);
        } else {
            tokenHandler.characters(val, 0, val.length);
        }
    }

    /**
     * Returns the current value of the <code>mappingLangToXmlLang</code>
     * flag.
     *
     * @return the mappingLangToXmlLang
     */
    public boolean isMappingLangToXmlLang() {
        return mappingLangToXmlLang;
    }

    /**
     * Sets the <code>mappingLangToXmlLang</code> flag.
     *
     * @param mappingLangToXmlLang
     *            the mappingLangToXmlLang to set
     */
    public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
        this.mappingLangToXmlLang = mappingLangToXmlLang;
    }
}