001 /*
002 * Copyright (c) 2007 Henri Sivonen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package nu.validator.htmlparser.impl;
024
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.xml.sax.ErrorHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
038
039 public final class MetaSniffer implements Locator {
040
041 private class StopSniffingException extends Exception {
042
043 }
044
045 private static final Pattern CONTENT = Pattern.compile("^[^;]*;[\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*[cC][hH][aA][rR][sS][eE][tT][\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*=[\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*(?:(?:([^'\"\\x09\\x0A\\x0B\\x0C\\x0D\\x20][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*)(?:[\\x09\\x0A\\x0B\\x0C\\x0D\\x20].*)?)|(?:\"([^\"]*)\".*)|(?:'([^']*)'.*))$", Pattern.DOTALL);
046
047 private enum MetaState {
048 NO, M, E, T, A
049 }
050
051 private final ByteReadable source;
052
053 private final ErrorHandler errorHandler;
054
055 private CharsetDecoder charsetDecoder = null;
056
057 private StringBuilder attributeName = new StringBuilder();
058
059 private StringBuilder attributeValue = new StringBuilder();
060
061 private MetaState metaState = MetaState.NO;
062
063 private int unread = -1;
064
065 private int line = 1;
066
067 private int col = 0;
068
069 private boolean prevWasCR = false;
070
071 private final Locator locator;
072
073 /**
074 * @param source
075 * @param errorHandler
076 * @param publicId
077 * @param systemId
078 */
079 public MetaSniffer(ByteReadable source, ErrorHandler eh, Locator locator) {
080 this.source = source;
081 this.errorHandler = eh;
082 this.locator = locator;
083 }
084
085 // Making this method return an int instead of a char was
086 // probably a mistake :-(
087 private int read() throws IOException, StopSniffingException {
088 if (unread == -1) {
089 int b = source.readByte();
090 switch (b) {
091 case -1: // end
092 throw new StopSniffingException();
093 case 0x0A: // LF
094 if (!prevWasCR) {
095 line++;
096 col = 0;
097 }
098 prevWasCR = false;
099 break;
100 case 0x0D: // CR
101 line++;
102 col = 0;
103 prevWasCR = true;
104 break;
105 default:
106 col++;
107 prevWasCR = false;
108 break;
109 }
110 return b;
111 } else {
112 int b = unread;
113 unread = -1;
114 return b;
115 }
116 }
117
118 private void unread(int b) {
119 this.unread = b;
120 }
121
122 /**
123 * Main loop.
124 *
125 * @return
126 *
127 * @throws SAXException
128 * @throws IOException
129 * @throws
130 */
131 public CharsetDecoder sniff() throws SAXException, IOException {
132 try {
133 for (;;) {
134 if (read() == 0x3C) { // <
135 markup();
136 }
137 }
138 } catch (StopSniffingException e) {
139 return charsetDecoder;
140 }
141 }
142
143 /**
144 * <
145 *
146 * @throws SAXException
147 * @throws StopSniffingException
148 * @throws IOException
149 */
150 private void markup() throws SAXException, StopSniffingException, IOException {
151 int b = read();
152 if (b == 0x21) { // !
153 markupDecl();
154 } else if (b == 0x2F) { // /
155 endTag();
156 } else if (b == 0x3F) { // ?
157 consumeUntilAndIncludingGt();
158 } else if (b == 0x4D || b == 0x6D) { // m or M
159 metaState = MetaState.M;
160 tag();
161 } else if ((b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A)) { // ASCII
162 // letter
163 metaState = MetaState.NO;
164 tag();
165 }
166 }
167
168 /**
169 * < , x
170 *
171 * @throws SAXException
172 * @throws StopSniffingException
173 * @throws IOException
174 */
175 private void tag() throws SAXException, StopSniffingException, IOException {
176 int b;
177 loop: for (;;) {
178 b = read();
179 switch (b) {
180 case 0x09: // tab
181 case 0x0A: // LF
182 case 0x0B: // VT
183 case 0x0C: // FF
184 case 0x0D: // CR
185 case 0x20: // space
186 case 0x3E: // >
187 case 0x3C: // <
188 break loop;
189 case 0x45: // E
190 case 0x65: // e
191 if (metaState == MetaState.M) {
192 metaState = MetaState.E;
193 } else {
194 metaState = MetaState.NO;
195 }
196 continue loop;
197 case 0x54: // T
198 case 0x74: // t
199 if (metaState == MetaState.E) {
200 metaState = MetaState.T;
201 } else {
202 metaState = MetaState.NO;
203 }
204 continue loop;
205 case 0x41: // A
206 case 0x61: // a
207 if (metaState == MetaState.T) {
208 metaState = MetaState.A;
209 } else {
210 metaState = MetaState.NO;
211 }
212 continue loop;
213 default:
214 metaState = MetaState.NO;
215 continue loop;
216 }
217 }
218 unread(b);
219 if (b != 0x3C) {
220 while (attribute())
221 ;
222 }
223 }
224
225 /**
226 * The "get an attribute" subalgorithm.
227 *
228 * @return <code>false</code> when to stop
229 * @throws SAXException
230 * @throws StopSniffingException
231 * @throws IOException
232 */
233 private boolean attribute() throws SAXException, StopSniffingException, IOException {
234 int b;
235 loop: for (;;) {
236 b = read();
237 switch (b) {
238 case 0x09: // tab
239 case 0x0A: // LF
240 case 0x0B: // VT
241 case 0x0C: // FF
242 case 0x0D: // CR
243 case 0x20: // space
244 case 0x2F: // /
245 continue loop;
246 default:
247 break loop;
248 }
249 }
250 if (b == 0x3C) { // <
251 unread(b);
252 return false;
253 }
254 if (b == 0x3E) { // >
255 return false;
256 }
257 attributeName.setLength(0);
258 attributeValue.setLength(0);
259 unread(b); // this is a bit ugly
260 name: for (;;) {
261 b = read();
262 switch (b) {
263 case 0x3D: // =
264 // not actually advancing here yet
265 break name;
266 case 0x09: // tab
267 case 0x0A: // LF
268 case 0x0B: // VT
269 case 0x0C: // FF
270 case 0x0D: // CR
271 case 0x20: // space
272 spaces: for (;;) {
273 b = read();
274 switch (b) {
275 case 0x09: // tab
276 case 0x0A: // LF
277 case 0x0B: // VT
278 case 0x0C: // FF
279 case 0x0D: // CR
280 case 0x20: // space
281 continue spaces;
282 default:
283 break name;
284 }
285 }
286 case 0x2f: // /
287 return true;
288 case 0x3C: // <
289 unread(b);
290 return false;
291 case 0x3E: // >
292 return false;
293 default:
294 if (metaState == MetaState.A) {
295 // could use a highly-efficient state machine
296 // here instead of a buffer...
297 if (b >= 0x41 && b <= 0x5A) {
298 attributeName.append((char) (b + 0x20));
299 } else {
300 attributeName.append((char) b);
301 }
302 }
303 continue name;
304 }
305 }
306 if (b != 0x3D) {
307 // "If the byte at position is not 0x3D (ASCII '='), stop looking
308 // for
309 // an attribute. Move position back to the previous byte."
310 unread(b);
311 return true;
312 }
313 value: for (;;) {
314 b = read();
315 switch (b) {
316 case 0x09: // tab
317 case 0x0A: // LF
318 case 0x0B: // VT
319 case 0x0C: // FF
320 case 0x0D: // CR
321 case 0x20: // space
322 continue value;
323 default:
324 break value;
325 }
326 }
327 switch (b) {
328 case 0x22: // "
329 quotedAttribute(0x22);
330 return true;
331 case 0x27: // '
332 quotedAttribute(0x27);
333 return true;
334 case 0x3C: // <
335 unread(b);
336 return false;
337 case 0x3E: // >
338 return false;
339 default:
340 unread(b);
341 return unquotedAttribute();
342 }
343 }
344
345 private boolean unquotedAttribute() throws SAXException, StopSniffingException, IOException {
346 int b;
347 for (;;) {
348 b = read();
349 switch (b) {
350 case 0x09: // tab
351 case 0x0A: // LF
352 case 0x0B: // VT
353 case 0x0C: // FF
354 case 0x0D: // CR
355 case 0x20: // space
356 checkAttribute();
357 return true;
358 case 0x3E: // >
359 checkAttribute();
360 return false;
361 case 0x3C: // <
362 checkAttribute();
363 unread(b);
364 return false;
365 default:
366 // omitting uppercasing
367 if (metaState == MetaState.A) {
368 attributeValue.append((char) b);
369 }
370 break;
371 }
372 }
373 }
374
375 private void checkAttribute() throws SAXException, StopSniffingException {
376 if (metaState == MetaState.A) {
377 String name = attributeName.toString();
378 if ("charset".equals(name)) {
379 // XXX revisit trim() to trime only space characters
380 tryCharset(attributeValue.toString().trim());
381 } else if ("content".equals(name)) {
382 Matcher m = CONTENT.matcher(attributeValue);
383 if (m.matches()) {
384 String value = null;
385 for (int i = 1; i < 4; i++) {
386 value = m.group(i);
387 if (value != null) {
388 tryCharset(value);
389 break;
390 }
391 }
392 }
393 }
394 }
395 }
396
397 private void tryCharset(String encoding) throws SAXException, StopSniffingException {
398 encoding = encoding.toUpperCase();
399 try {
400 // XXX deviating from the spec as per mjs on IRC.
401 if ("UTF-16".equals(encoding) || "UTF-16BE".equals(encoding) || "UTF-16LE".equals(encoding) || "UTF-32".equals(encoding) || "UTF-32BE".equals(encoding) || "UTF-32LE".equals(encoding)) {
402 this.charsetDecoder = Charset.forName("UTF-8").newDecoder();
403 err("The internal character encoding declaration specified \u201C" + encoding + "\u201D which is not a rough superset of ASCII. Using \u201CUTF-8\u201D instead.");
404 throw new StopSniffingException();
405 } else {
406 Charset cs = Charset.forName(encoding);
407 String canonName = cs.name();
408 if (!EncodingInfo.isAsciiSuperset(canonName)) {
409 err("The encoding \u201C"
410 + encoding
411 + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm.");
412 return;
413 }
414 if (canonName.startsWith("X-") || canonName.startsWith("x-")
415 || canonName.startsWith("Mac")) {
416 if (encoding.startsWith("X-")) {
417 err("The encoding \u201C" + encoding
418 + "\u201D is not an IANA-registered encoding. (Charmod C022)");
419 } else {
420 err("The encoding \u201C" + encoding
421 + "\u201D is not an IANA-registered encoding and did\u2019t start with \u201CX-\u201D. (Charmod C023)");
422 }
423 } else if (!canonName.equalsIgnoreCase(encoding)) {
424 err("The encoding \u201C" + encoding
425 + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
426 + canonName + "\u201D. (Charmod C024)");
427 }
428 if (EncodingInfo.isObscure(canonName)) {
429 warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
430 }
431 this.charsetDecoder = cs.newDecoder();
432 throw new StopSniffingException();
433 }
434 } catch (IllegalCharsetNameException e) {
435 err("Illegal character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing.");
436 } catch (UnsupportedCharsetException e) {
437 err("Unsupported character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing.");
438 }
439 }
440
441 /**
442 * @param string
443 * @throws SAXException
444 */
445 private void err(String message) throws SAXException {
446 if (errorHandler != null) {
447 SAXParseException spe = new SAXParseException(message, this);
448 errorHandler.error(spe);
449 }
450 }
451
452 /**
453 * @param string
454 * @throws SAXException
455 */
456 private void warn(String message) throws SAXException {
457 if (errorHandler != null) {
458 SAXParseException spe = new SAXParseException(message, this);
459 errorHandler.warning(spe);
460 }
461 }
462
463 private void quotedAttribute(int delim) throws SAXException, StopSniffingException, IOException {
464 int b;
465 for (;;) {
466 b = read();
467 if (b == delim) {
468 checkAttribute();
469 return;
470 } else {
471 if (metaState == MetaState.A) {
472 attributeValue.append((char) b);
473 }
474 }
475 }
476 }
477
478 private void consumeUntilAndIncludingGt() throws IOException, StopSniffingException {
479 for (;;) {
480 if (read() == 0x3E) { // >
481 return;
482 }
483 }
484 }
485
486 /**
487 * Seen < , /
488 *
489 * @throws SAXException
490 * @throws StopSniffingException
491 * @throws IOException
492 */
493 private void endTag() throws SAXException, StopSniffingException, IOException {
494 int b = read();
495 if ((b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A)) { // ASCII
496 // letter
497 metaState = MetaState.NO;
498 tag();
499 } else {
500 consumeUntilAndIncludingGt();
501 }
502 }
503
504 /**
505 * Seen < , !
506 * @throws IOException
507 * @throws StopSniffingException
508 */
509 private void markupDecl() throws IOException, StopSniffingException {
510 if (read() == 0x2D) { // -
511 comment();
512 } else {
513 consumeUntilAndIncludingGt();
514 }
515 }
516
517 /**
518 * Seen < , ! , -
519 * @throws IOException
520 * @throws StopSniffingException
521 */
522 private void comment() throws IOException, StopSniffingException {
523 if (read() == 0x2D) { // -
524 int hyphensSeen = 2;
525 for (;;) {
526 int b = read();
527 if (b == 0x2D) { // -
528 hyphensSeen++;
529 } else if (b == 0x3E) { // >
530 if (hyphensSeen >= 2) {
531 return;
532 } else {
533 hyphensSeen = 0;
534 }
535 } else {
536 hyphensSeen = 0;
537 }
538 }
539 } else {
540 consumeUntilAndIncludingGt();
541 }
542 }
543
544 public int getColumnNumber() {
545 return col;
546 }
547
548 public int getLineNumber() {
549 return line;
550 }
551
552 public String getPublicId() {
553 if (locator != null) {
554 return locator.getPublicId();
555 }
556 return null;
557 }
558
559 public String getSystemId() {
560 if (locator != null) {
561 return locator.getSystemId();
562 }
563 return null;
564 }
565
566 }