001 /*
002 * Copyright (c) 2009 Mozilla Foundation
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package nu.validator.htmlparser.io;
024
025 import java.io.IOException;
026 import java.nio.charset.UnsupportedCharsetException;
027
028 import nu.validator.htmlparser.common.ByteReadable;
029 import nu.validator.htmlparser.impl.MetaScanner;
030
031 import org.xml.sax.ErrorHandler;
032 import org.xml.sax.Locator;
033 import org.xml.sax.SAXException;
034 import org.xml.sax.SAXParseException;
035
036 public class MetaSniffer extends MetaScanner implements Locator {
037
038 private Encoding characterEncoding = null;
039
040 private final ErrorHandler errorHandler;
041
042 private final Locator locator;
043
044 private int line = 1;
045
046 private int col = 0;
047
048 private boolean prevWasCR = false;
049
050 public MetaSniffer(ErrorHandler eh, Locator locator) {
051 this.errorHandler = eh;
052 this.locator = locator;
053 this.characterEncoding = null;
054 }
055
056 /**
057 * -1 means end.
058 * @return
059 * @throws IOException
060 */
061 protected int read() throws IOException {
062 int b = readable.readByte();
063 // [NOCPP[
064 switch (b) {
065 case '\n':
066 if (!prevWasCR) {
067 line++;
068 col = 0;
069 }
070 prevWasCR = false;
071 break;
072 case '\r':
073 line++;
074 col = 0;
075 prevWasCR = true;
076 break;
077 default:
078 col++;
079 prevWasCR = false;
080 break;
081 }
082 // ]NOCPP]
083 return b;
084 }
085
086 /**
087 * Main loop.
088 *
089 * @return
090 *
091 * @throws SAXException
092 * @throws IOException
093 * @throws
094 */
095 public Encoding sniff(ByteReadable readable) throws SAXException, IOException {
096 this.readable = readable;
097 stateLoop(stateSave);
098 return characterEncoding;
099 }
100
101
102 /**
103 * @param string
104 * @throws SAXException
105 */
106 private void err(String message) throws SAXException {
107 if (errorHandler != null) {
108 SAXParseException spe = new SAXParseException(message, this);
109 errorHandler.error(spe);
110 }
111 }
112
113 /**
114 * @param string
115 * @throws SAXException
116 */
117 private void warn(String message) throws SAXException {
118 if (errorHandler != null) {
119 SAXParseException spe = new SAXParseException(message, this);
120 errorHandler.warning(spe);
121 }
122 }
123
124 public int getColumnNumber() {
125 return col;
126 }
127
128 public int getLineNumber() {
129 return line;
130 }
131
132 public String getPublicId() {
133 if (locator != null) {
134 return locator.getPublicId();
135 }
136 return null;
137 }
138
139 public String getSystemId() {
140 if (locator != null) {
141 return locator.getSystemId();
142 }
143 return null;
144 }
145
146 protected boolean tryCharset(String encoding) throws SAXException {
147 encoding = Encoding.toAsciiLowerCase(encoding);
148 try {
149 // XXX spec says only UTF-16
150 if ("utf-16".equals(encoding) || "utf-16be".equals(encoding) || "utf-16le".equals(encoding) || "utf-32".equals(encoding) || "utf-32be".equals(encoding) || "utf-32le".equals(encoding)) {
151 this.characterEncoding = Encoding.UTF8;
152 err("The internal character encoding declaration specified \u201C" + encoding + "\u201D which is not a rough superset of ASCII. Using \u201CUTF-8\u201D instead.");
153 return true;
154 } else {
155 Encoding cs = Encoding.forName(encoding);
156 String canonName = cs.getCanonName();
157 if (!cs.isAsciiSuperset()) {
158 err("The encoding \u201C"
159 + encoding
160 + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm.");
161 return false;
162 }
163 if (!cs.isRegistered()) {
164 if (encoding.startsWith("x-")) {
165 err("The encoding \u201C"
166 + encoding
167 + "\u201D is not an IANA-registered encoding. (Charmod C022)");
168 } else {
169 err("The encoding \u201C"
170 + encoding
171 + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)");
172 }
173 } else if (!cs.getCanonName().equals(encoding)) {
174 err("The encoding \u201C" + encoding
175 + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
176 + canonName + "\u201D. (Charmod C024)");
177 }
178 if (cs.isShouldNot()) {
179 warn("Authors should not use the character encoding \u201C"
180 + encoding
181 + "\u201D. It is recommended to use \u201CUTF-8\u201D.");
182 } else if (cs.isObscure()) {
183 warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
184 }
185 Encoding actual = cs.getActualHtmlEncoding();
186 if (actual == null) {
187 this.characterEncoding = cs;
188 } else {
189 warn("Using \u201C" + actual.getCanonName() + "\u201D instead of the declared encoding \u201C" + encoding + "\u201D.");
190 this.characterEncoding = actual;
191 }
192 return true;
193 }
194 } catch (UnsupportedCharsetException e) {
195 err("Unsupported character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing.");
196 }
197 return false;
198 }
199 }