001 /* 002 * Copyright (c) 2007 Henri Sivonen 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package nu.validator.htmlparser.impl; 024 025 import java.io.IOException; 026 import java.nio.charset.Charset; 027 import java.nio.charset.CharsetDecoder; 028 import java.nio.charset.IllegalCharsetNameException; 029 import java.nio.charset.UnsupportedCharsetException; 030 import java.util.regex.Matcher; 031 import java.util.regex.Pattern; 032 033 034 import org.xml.sax.ErrorHandler; 035 import org.xml.sax.Locator; 036 import org.xml.sax.SAXException; 037 import org.xml.sax.SAXParseException; 038 039 public final class MetaSniffer implements Locator { 040 041 private class StopSniffingException extends Exception { 042 043 } 044 045 private static final Pattern CONTENT = Pattern.compile("^[^;]*;[\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*[cC][hH][aA][rR][sS][eE][tT][\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*=[\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*(?:(?:([^'\"\\x09\\x0A\\x0B\\x0C\\x0D\\x20][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*)(?:[\\x09\\x0A\\x0B\\x0C\\x0D\\x20].*)?)|(?:\"([^\"]*)\".*)|(?:'([^']*)'.*))$", Pattern.DOTALL); 046 047 private enum MetaState { 048 NO, M, E, T, A 049 } 050 051 private final ByteReadable source; 052 053 private final ErrorHandler errorHandler; 054 055 private CharsetDecoder charsetDecoder = null; 056 057 private StringBuilder attributeName = new StringBuilder(); 058 059 private StringBuilder attributeValue = new StringBuilder(); 060 061 private MetaState metaState = MetaState.NO; 062 063 private int unread = -1; 064 065 private int line = 1; 066 067 private int col = 0; 068 069 private boolean prevWasCR = false; 070 071 private final Locator locator; 072 073 /** 074 * @param source 075 * @param errorHandler 076 * @param publicId 077 * @param systemId 078 */ 079 public MetaSniffer(ByteReadable source, ErrorHandler eh, Locator locator) { 080 this.source = source; 081 this.errorHandler = eh; 082 this.locator = locator; 083 } 084 085 // Making this method return an int instead of a char was 086 // probably a mistake :-( 087 private int read() throws IOException, StopSniffingException { 088 if (unread == -1) { 089 int b = source.readByte(); 090 switch (b) { 091 case -1: // end 092 throw new StopSniffingException(); 093 case 0x0A: // LF 094 if (!prevWasCR) { 095 line++; 096 col = 0; 097 } 098 prevWasCR = false; 099 break; 100 case 0x0D: // CR 101 line++; 102 col = 0; 103 prevWasCR = true; 104 break; 105 default: 106 col++; 107 prevWasCR = false; 108 break; 109 } 110 return b; 111 } else { 112 int b = unread; 113 unread = -1; 114 return b; 115 } 116 } 117 118 private void unread(int b) { 119 this.unread = b; 120 } 121 122 /** 123 * Main loop. 124 * 125 * @return 126 * 127 * @throws SAXException 128 * @throws IOException 129 * @throws 130 */ 131 public CharsetDecoder sniff() throws SAXException, IOException { 132 try { 133 for (;;) { 134 if (read() == 0x3C) { // < 135 markup(); 136 } 137 } 138 } catch (StopSniffingException e) { 139 return charsetDecoder; 140 } 141 } 142 143 /** 144 * < 145 * 146 * @throws SAXException 147 * @throws StopSniffingException 148 * @throws IOException 149 */ 150 private void markup() throws SAXException, StopSniffingException, IOException { 151 int b = read(); 152 if (b == 0x21) { // ! 153 markupDecl(); 154 } else if (b == 0x2F) { // / 155 endTag(); 156 } else if (b == 0x3F) { // ? 157 consumeUntilAndIncludingGt(); 158 } else if (b == 0x4D || b == 0x6D) { // m or M 159 metaState = MetaState.M; 160 tag(); 161 } else if ((b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A)) { // ASCII 162 // letter 163 metaState = MetaState.NO; 164 tag(); 165 } 166 } 167 168 /** 169 * < , x 170 * 171 * @throws SAXException 172 * @throws StopSniffingException 173 * @throws IOException 174 */ 175 private void tag() throws SAXException, StopSniffingException, IOException { 176 int b; 177 loop: for (;;) { 178 b = read(); 179 switch (b) { 180 case 0x09: // tab 181 case 0x0A: // LF 182 case 0x0B: // VT 183 case 0x0C: // FF 184 case 0x0D: // CR 185 case 0x20: // space 186 case 0x3E: // > 187 case 0x3C: // < 188 break loop; 189 case 0x45: // E 190 case 0x65: // e 191 if (metaState == MetaState.M) { 192 metaState = MetaState.E; 193 } else { 194 metaState = MetaState.NO; 195 } 196 continue loop; 197 case 0x54: // T 198 case 0x74: // t 199 if (metaState == MetaState.E) { 200 metaState = MetaState.T; 201 } else { 202 metaState = MetaState.NO; 203 } 204 continue loop; 205 case 0x41: // A 206 case 0x61: // a 207 if (metaState == MetaState.T) { 208 metaState = MetaState.A; 209 } else { 210 metaState = MetaState.NO; 211 } 212 continue loop; 213 default: 214 metaState = MetaState.NO; 215 continue loop; 216 } 217 } 218 unread(b); 219 if (b != 0x3C) { 220 while (attribute()) 221 ; 222 } 223 } 224 225 /** 226 * The "get an attribute" subalgorithm. 227 * 228 * @return <code>false</code> when to stop 229 * @throws SAXException 230 * @throws StopSniffingException 231 * @throws IOException 232 */ 233 private boolean attribute() throws SAXException, StopSniffingException, IOException { 234 int b; 235 loop: for (;;) { 236 b = read(); 237 switch (b) { 238 case 0x09: // tab 239 case 0x0A: // LF 240 case 0x0B: // VT 241 case 0x0C: // FF 242 case 0x0D: // CR 243 case 0x20: // space 244 case 0x2F: // / 245 continue loop; 246 default: 247 break loop; 248 } 249 } 250 if (b == 0x3C) { // < 251 unread(b); 252 return false; 253 } 254 if (b == 0x3E) { // > 255 return false; 256 } 257 attributeName.setLength(0); 258 attributeValue.setLength(0); 259 unread(b); // this is a bit ugly 260 name: for (;;) { 261 b = read(); 262 switch (b) { 263 case 0x3D: // = 264 // not actually advancing here yet 265 break name; 266 case 0x09: // tab 267 case 0x0A: // LF 268 case 0x0B: // VT 269 case 0x0C: // FF 270 case 0x0D: // CR 271 case 0x20: // space 272 spaces: for (;;) { 273 b = read(); 274 switch (b) { 275 case 0x09: // tab 276 case 0x0A: // LF 277 case 0x0B: // VT 278 case 0x0C: // FF 279 case 0x0D: // CR 280 case 0x20: // space 281 continue spaces; 282 default: 283 break name; 284 } 285 } 286 case 0x2f: // / 287 return true; 288 case 0x3C: // < 289 unread(b); 290 return false; 291 case 0x3E: // > 292 return false; 293 default: 294 if (metaState == MetaState.A) { 295 // could use a highly-efficient state machine 296 // here instead of a buffer... 297 if (b >= 0x41 && b <= 0x5A) { 298 attributeName.append((char) (b + 0x20)); 299 } else { 300 attributeName.append((char) b); 301 } 302 } 303 continue name; 304 } 305 } 306 if (b != 0x3D) { 307 // "If the byte at position is not 0x3D (ASCII '='), stop looking 308 // for 309 // an attribute. Move position back to the previous byte." 310 unread(b); 311 return true; 312 } 313 value: for (;;) { 314 b = read(); 315 switch (b) { 316 case 0x09: // tab 317 case 0x0A: // LF 318 case 0x0B: // VT 319 case 0x0C: // FF 320 case 0x0D: // CR 321 case 0x20: // space 322 continue value; 323 default: 324 break value; 325 } 326 } 327 switch (b) { 328 case 0x22: // " 329 quotedAttribute(0x22); 330 return true; 331 case 0x27: // ' 332 quotedAttribute(0x27); 333 return true; 334 case 0x3C: // < 335 unread(b); 336 return false; 337 case 0x3E: // > 338 return false; 339 default: 340 unread(b); 341 return unquotedAttribute(); 342 } 343 } 344 345 private boolean unquotedAttribute() throws SAXException, StopSniffingException, IOException { 346 int b; 347 for (;;) { 348 b = read(); 349 switch (b) { 350 case 0x09: // tab 351 case 0x0A: // LF 352 case 0x0B: // VT 353 case 0x0C: // FF 354 case 0x0D: // CR 355 case 0x20: // space 356 checkAttribute(); 357 return true; 358 case 0x3E: // > 359 checkAttribute(); 360 return false; 361 case 0x3C: // < 362 checkAttribute(); 363 unread(b); 364 return false; 365 default: 366 // omitting uppercasing 367 if (metaState == MetaState.A) { 368 attributeValue.append((char) b); 369 } 370 break; 371 } 372 } 373 } 374 375 private void checkAttribute() throws SAXException, StopSniffingException { 376 if (metaState == MetaState.A) { 377 String name = attributeName.toString(); 378 if ("charset".equals(name)) { 379 // XXX revisit trim() to trime only space characters 380 tryCharset(attributeValue.toString().trim()); 381 } else if ("content".equals(name)) { 382 Matcher m = CONTENT.matcher(attributeValue); 383 if (m.matches()) { 384 String value = null; 385 for (int i = 1; i < 4; i++) { 386 value = m.group(i); 387 if (value != null) { 388 tryCharset(value); 389 break; 390 } 391 } 392 } 393 } 394 } 395 } 396 397 private void tryCharset(String encoding) throws SAXException, StopSniffingException { 398 encoding = encoding.toUpperCase(); 399 try { 400 // XXX deviating from the spec as per mjs on IRC. 401 if ("UTF-16".equals(encoding) || "UTF-16BE".equals(encoding) || "UTF-16LE".equals(encoding) || "UTF-32".equals(encoding) || "UTF-32BE".equals(encoding) || "UTF-32LE".equals(encoding)) { 402 this.charsetDecoder = Charset.forName("UTF-8").newDecoder(); 403 err("The internal character encoding declaration specified \u201C" + encoding + "\u201D which is not a rough superset of ASCII. Using \u201CUTF-8\u201D instead."); 404 throw new StopSniffingException(); 405 } else { 406 Charset cs = Charset.forName(encoding); 407 String canonName = cs.name(); 408 if (!EncodingInfo.isAsciiSuperset(canonName)) { 409 err("The encoding \u201C" 410 + encoding 411 + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm."); 412 return; 413 } 414 if (canonName.startsWith("X-") || canonName.startsWith("x-") 415 || canonName.startsWith("Mac")) { 416 if (encoding.startsWith("X-")) { 417 err("The encoding \u201C" + encoding 418 + "\u201D is not an IANA-registered encoding. (Charmod C022)"); 419 } else { 420 err("The encoding \u201C" + encoding 421 + "\u201D is not an IANA-registered encoding and did\u2019t start with \u201CX-\u201D. (Charmod C023)"); 422 } 423 } else if (!canonName.equalsIgnoreCase(encoding)) { 424 err("The encoding \u201C" + encoding 425 + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C" 426 + canonName + "\u201D. (Charmod C024)"); 427 } 428 if (EncodingInfo.isObscure(canonName)) { 429 warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); 430 } 431 this.charsetDecoder = cs.newDecoder(); 432 throw new StopSniffingException(); 433 } 434 } catch (IllegalCharsetNameException e) { 435 err("Illegal character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing."); 436 } catch (UnsupportedCharsetException e) { 437 err("Unsupported character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing."); 438 } 439 } 440 441 /** 442 * @param string 443 * @throws SAXException 444 */ 445 private void err(String message) throws SAXException { 446 if (errorHandler != null) { 447 SAXParseException spe = new SAXParseException(message, this); 448 errorHandler.error(spe); 449 } 450 } 451 452 /** 453 * @param string 454 * @throws SAXException 455 */ 456 private void warn(String message) throws SAXException { 457 if (errorHandler != null) { 458 SAXParseException spe = new SAXParseException(message, this); 459 errorHandler.warning(spe); 460 } 461 } 462 463 private void quotedAttribute(int delim) throws SAXException, StopSniffingException, IOException { 464 int b; 465 for (;;) { 466 b = read(); 467 if (b == delim) { 468 checkAttribute(); 469 return; 470 } else { 471 if (metaState == MetaState.A) { 472 attributeValue.append((char) b); 473 } 474 } 475 } 476 } 477 478 private void consumeUntilAndIncludingGt() throws IOException, StopSniffingException { 479 for (;;) { 480 if (read() == 0x3E) { // > 481 return; 482 } 483 } 484 } 485 486 /** 487 * Seen < , / 488 * 489 * @throws SAXException 490 * @throws StopSniffingException 491 * @throws IOException 492 */ 493 private void endTag() throws SAXException, StopSniffingException, IOException { 494 int b = read(); 495 if ((b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A)) { // ASCII 496 // letter 497 metaState = MetaState.NO; 498 tag(); 499 } else { 500 consumeUntilAndIncludingGt(); 501 } 502 } 503 504 /** 505 * Seen < , ! 506 * @throws IOException 507 * @throws StopSniffingException 508 */ 509 private void markupDecl() throws IOException, StopSniffingException { 510 if (read() == 0x2D) { // - 511 comment(); 512 } else { 513 consumeUntilAndIncludingGt(); 514 } 515 } 516 517 /** 518 * Seen < , ! , - 519 * @throws IOException 520 * @throws StopSniffingException 521 */ 522 private void comment() throws IOException, StopSniffingException { 523 if (read() == 0x2D) { // - 524 int hyphensSeen = 2; 525 for (;;) { 526 int b = read(); 527 if (b == 0x2D) { // - 528 hyphensSeen++; 529 } else if (b == 0x3E) { // > 530 if (hyphensSeen >= 2) { 531 return; 532 } else { 533 hyphensSeen = 0; 534 } 535 } else { 536 hyphensSeen = 0; 537 } 538 } 539 } else { 540 consumeUntilAndIncludingGt(); 541 } 542 } 543 544 public int getColumnNumber() { 545 return col; 546 } 547 548 public int getLineNumber() { 549 return line; 550 } 551 552 public String getPublicId() { 553 if (locator != null) { 554 return locator.getPublicId(); 555 } 556 return null; 557 } 558 559 public String getSystemId() { 560 if (locator != null) { 561 return locator.getSystemId(); 562 } 563 return null; 564 } 565 566 }