001 /* 002 * Copyright (c) 2007 Henri Sivonen 003 * Copyright (c) 2008-2010 Mozilla Foundation 004 * 005 * Permission is hereby granted, free of charge, to any person obtaining a 006 * copy of this software and associated documentation files (the "Software"), 007 * to deal in the Software without restriction, including without limitation 008 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 009 * and/or sell copies of the Software, and to permit persons to whom the 010 * Software is furnished to do so, subject to the following conditions: 011 * 012 * The above copyright notice and this permission notice shall be included in 013 * all copies or substantial portions of the Software. 014 * 015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 021 * DEALINGS IN THE SOFTWARE. 022 */ 023 024 package nu.validator.htmlparser.impl; 025 026 import java.io.IOException; 027 028 import nu.validator.htmlparser.annotation.Auto; 029 import nu.validator.htmlparser.annotation.Inline; 030 import nu.validator.htmlparser.common.ByteReadable; 031 032 import org.xml.sax.SAXException; 033 034 public abstract class MetaScanner { 035 036 /** 037 * Constant for "charset". 038 */ 039 private static final char[] CHARSET = "harset".toCharArray(); 040 041 /** 042 * Constant for "content". 043 */ 044 private static final char[] CONTENT = "ontent".toCharArray(); 045 046 /** 047 * Constant for "http-equiv". 048 */ 049 private static final char[] HTTP_EQUIV = "ttp-equiv".toCharArray(); 050 051 /** 052 * Constant for "content-type". 053 */ 054 private static final char[] CONTENT_TYPE = "content-type".toCharArray(); 055 056 private static final int NO = 0; 057 058 private static final int M = 1; 059 060 private static final int E = 2; 061 062 private static final int T = 3; 063 064 private static final int A = 4; 065 066 private static final int DATA = 0; 067 068 private static final int TAG_OPEN = 1; 069 070 private static final int SCAN_UNTIL_GT = 2; 071 072 private static final int TAG_NAME = 3; 073 074 private static final int BEFORE_ATTRIBUTE_NAME = 4; 075 076 private static final int ATTRIBUTE_NAME = 5; 077 078 private static final int AFTER_ATTRIBUTE_NAME = 6; 079 080 private static final int BEFORE_ATTRIBUTE_VALUE = 7; 081 082 private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8; 083 084 private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9; 085 086 private static final int ATTRIBUTE_VALUE_UNQUOTED = 10; 087 088 private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11; 089 090 private static final int MARKUP_DECLARATION_OPEN = 13; 091 092 private static final int MARKUP_DECLARATION_HYPHEN = 14; 093 094 private static final int COMMENT_START = 15; 095 096 private static final int COMMENT_START_DASH = 16; 097 098 private static final int COMMENT = 17; 099 100 private static final int COMMENT_END_DASH = 18; 101 102 private static final int COMMENT_END = 19; 103 104 private static final int SELF_CLOSING_START_TAG = 20; 105 106 private static final int HTTP_EQUIV_NOT_SEEN = 0; 107 108 private static final int HTTP_EQUIV_CONTENT_TYPE = 1; 109 110 private static final int HTTP_EQUIV_OTHER = 2; 111 112 /** 113 * The data source. 114 */ 115 protected ByteReadable readable; 116 117 /** 118 * The state of the state machine that recognizes the tag name "meta". 119 */ 120 private int metaState = NO; 121 122 /** 123 * The current position in recognizing the attribute name "content". 124 */ 125 private int contentIndex = Integer.MAX_VALUE; 126 127 /** 128 * The current position in recognizing the attribute name "charset". 129 */ 130 private int charsetIndex = Integer.MAX_VALUE; 131 132 /** 133 * The current position in recognizing the attribute name "http-equive". 134 */ 135 private int httpEquivIndex = Integer.MAX_VALUE; 136 137 /** 138 * The current position in recognizing the attribute value "content-type". 139 */ 140 private int contentTypeIndex = Integer.MAX_VALUE; 141 142 /** 143 * The tokenizer state. 144 */ 145 protected int stateSave = DATA; 146 147 /** 148 * The currently filled length of strBuf. 149 */ 150 private int strBufLen; 151 152 /** 153 * Accumulation buffer for attribute values. 154 */ 155 private @Auto char[] strBuf; 156 157 private String content; 158 159 private String charset; 160 161 private int httpEquivState; 162 163 public MetaScanner() { 164 this.readable = null; 165 this.metaState = NO; 166 this.contentIndex = Integer.MAX_VALUE; 167 this.charsetIndex = Integer.MAX_VALUE; 168 this.httpEquivIndex = Integer.MAX_VALUE; 169 this.contentTypeIndex = Integer.MAX_VALUE; 170 this.stateSave = DATA; 171 this.strBufLen = 0; 172 this.strBuf = new char[36]; 173 this.content = null; 174 this.charset = null; 175 this.httpEquivState = HTTP_EQUIV_NOT_SEEN; 176 } 177 178 @SuppressWarnings("unused") private void destructor() { 179 Portability.releaseString(content); 180 Portability.releaseString(charset); 181 } 182 183 // [NOCPP[ 184 185 /** 186 * Reads a byte from the data source. 187 * 188 * -1 means end. 189 * @return 190 * @throws IOException 191 */ 192 protected int read() throws IOException { 193 return readable.readByte(); 194 } 195 196 // ]NOCPP] 197 198 // WARNING When editing this, makes sure the bytecode length shown by javap 199 // stays under 8000 bytes! 200 /** 201 * The runs the meta scanning algorithm. 202 */ 203 protected final void stateLoop(int state) 204 throws SAXException, IOException { 205 int c = -1; 206 boolean reconsume = false; 207 stateloop: for (;;) { 208 switch (state) { 209 case DATA: 210 dataloop: for (;;) { 211 if (reconsume) { 212 reconsume = false; 213 } else { 214 c = read(); 215 } 216 switch (c) { 217 case -1: 218 break stateloop; 219 case '<': 220 state = MetaScanner.TAG_OPEN; 221 break dataloop; // FALL THROUGH continue 222 // stateloop; 223 default: 224 continue; 225 } 226 } 227 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 228 case TAG_OPEN: 229 tagopenloop: for (;;) { 230 c = read(); 231 switch (c) { 232 case -1: 233 break stateloop; 234 case 'm': 235 case 'M': 236 metaState = M; 237 state = MetaScanner.TAG_NAME; 238 break tagopenloop; 239 // continue stateloop; 240 case '!': 241 state = MetaScanner.MARKUP_DECLARATION_OPEN; 242 continue stateloop; 243 case '?': 244 case '/': 245 state = MetaScanner.SCAN_UNTIL_GT; 246 continue stateloop; 247 case '>': 248 state = MetaScanner.DATA; 249 continue stateloop; 250 default: 251 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { 252 metaState = NO; 253 state = MetaScanner.TAG_NAME; 254 break tagopenloop; 255 // continue stateloop; 256 } 257 state = MetaScanner.DATA; 258 reconsume = true; 259 continue stateloop; 260 } 261 } 262 // FALL THROUGH DON'T REORDER 263 case TAG_NAME: 264 tagnameloop: for (;;) { 265 c = read(); 266 switch (c) { 267 case -1: 268 break stateloop; 269 case ' ': 270 case '\t': 271 case '\n': 272 case '\u000C': 273 state = MetaScanner.BEFORE_ATTRIBUTE_NAME; 274 break tagnameloop; 275 // continue stateloop; 276 case '/': 277 state = MetaScanner.SELF_CLOSING_START_TAG; 278 continue stateloop; 279 case '>': 280 state = MetaScanner.DATA; 281 continue stateloop; 282 case 'e': 283 case 'E': 284 if (metaState == M) { 285 metaState = E; 286 } else { 287 metaState = NO; 288 } 289 continue; 290 case 't': 291 case 'T': 292 if (metaState == E) { 293 metaState = T; 294 } else { 295 metaState = NO; 296 } 297 continue; 298 case 'a': 299 case 'A': 300 if (metaState == T) { 301 metaState = A; 302 } else { 303 metaState = NO; 304 } 305 continue; 306 default: 307 metaState = NO; 308 continue; 309 } 310 } 311 // FALLTHRU DON'T REORDER 312 case BEFORE_ATTRIBUTE_NAME: 313 beforeattributenameloop: for (;;) { 314 if (reconsume) { 315 reconsume = false; 316 } else { 317 c = read(); 318 } 319 /* 320 * Consume the next input character: 321 */ 322 switch (c) { 323 case -1: 324 break stateloop; 325 case ' ': 326 case '\t': 327 case '\n': 328 case '\u000C': 329 continue; 330 case '/': 331 state = MetaScanner.SELF_CLOSING_START_TAG; 332 continue stateloop; 333 case '>': 334 if (handleTag()) { 335 break stateloop; 336 } 337 state = DATA; 338 continue stateloop; 339 case 'c': 340 case 'C': 341 contentIndex = 0; 342 charsetIndex = 0; 343 httpEquivIndex = Integer.MAX_VALUE; 344 contentTypeIndex = Integer.MAX_VALUE; 345 state = MetaScanner.ATTRIBUTE_NAME; 346 break beforeattributenameloop; 347 case 'h': 348 case 'H': 349 contentIndex = Integer.MAX_VALUE; 350 charsetIndex = Integer.MAX_VALUE; 351 httpEquivIndex = 0; 352 contentTypeIndex = Integer.MAX_VALUE; 353 state = MetaScanner.ATTRIBUTE_NAME; 354 break beforeattributenameloop; 355 default: 356 contentIndex = Integer.MAX_VALUE; 357 charsetIndex = Integer.MAX_VALUE; 358 httpEquivIndex = Integer.MAX_VALUE; 359 contentTypeIndex = Integer.MAX_VALUE; 360 state = MetaScanner.ATTRIBUTE_NAME; 361 break beforeattributenameloop; 362 // continue stateloop; 363 } 364 } 365 // FALLTHRU DON'T REORDER 366 case ATTRIBUTE_NAME: 367 attributenameloop: for (;;) { 368 c = read(); 369 switch (c) { 370 case -1: 371 break stateloop; 372 case ' ': 373 case '\t': 374 case '\n': 375 case '\u000C': 376 state = MetaScanner.AFTER_ATTRIBUTE_NAME; 377 continue stateloop; 378 case '/': 379 state = MetaScanner.SELF_CLOSING_START_TAG; 380 continue stateloop; 381 case '=': 382 strBufLen = 0; 383 contentTypeIndex = 0; 384 state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; 385 break attributenameloop; 386 // continue stateloop; 387 case '>': 388 if (handleTag()) { 389 break stateloop; 390 } 391 state = MetaScanner.DATA; 392 continue stateloop; 393 default: 394 if (metaState == A) { 395 if (c >= 'A' && c <= 'Z') { 396 c += 0x20; 397 } 398 if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) { 399 ++contentIndex; 400 } else { 401 contentIndex = Integer.MAX_VALUE; 402 } 403 if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) { 404 ++charsetIndex; 405 } else { 406 charsetIndex = Integer.MAX_VALUE; 407 } 408 if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) { 409 ++httpEquivIndex; 410 } else { 411 httpEquivIndex = Integer.MAX_VALUE; 412 } 413 } 414 continue; 415 } 416 } 417 // FALLTHRU DON'T REORDER 418 case BEFORE_ATTRIBUTE_VALUE: 419 beforeattributevalueloop: for (;;) { 420 c = read(); 421 switch (c) { 422 case -1: 423 break stateloop; 424 case ' ': 425 case '\t': 426 case '\n': 427 case '\u000C': 428 continue; 429 case '"': 430 state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED; 431 break beforeattributevalueloop; 432 // continue stateloop; 433 case '\'': 434 state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED; 435 continue stateloop; 436 case '>': 437 if (handleTag()) { 438 break stateloop; 439 } 440 state = MetaScanner.DATA; 441 continue stateloop; 442 default: 443 handleCharInAttributeValue(c); 444 state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED; 445 continue stateloop; 446 } 447 } 448 // FALLTHRU DON'T REORDER 449 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 450 attributevaluedoublequotedloop: for (;;) { 451 if (reconsume) { 452 reconsume = false; 453 } else { 454 c = read(); 455 } 456 switch (c) { 457 case -1: 458 break stateloop; 459 case '"': 460 handleAttributeValue(); 461 state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; 462 break attributevaluedoublequotedloop; 463 // continue stateloop; 464 default: 465 handleCharInAttributeValue(c); 466 continue; 467 } 468 } 469 // FALLTHRU DON'T REORDER 470 case AFTER_ATTRIBUTE_VALUE_QUOTED: 471 afterattributevaluequotedloop: for (;;) { 472 c = read(); 473 switch (c) { 474 case -1: 475 break stateloop; 476 case ' ': 477 case '\t': 478 case '\n': 479 case '\u000C': 480 state = MetaScanner.BEFORE_ATTRIBUTE_NAME; 481 continue stateloop; 482 case '/': 483 state = MetaScanner.SELF_CLOSING_START_TAG; 484 break afterattributevaluequotedloop; 485 // continue stateloop; 486 case '>': 487 if (handleTag()) { 488 break stateloop; 489 } 490 state = MetaScanner.DATA; 491 continue stateloop; 492 default: 493 state = MetaScanner.BEFORE_ATTRIBUTE_NAME; 494 reconsume = true; 495 continue stateloop; 496 } 497 } 498 // FALLTHRU DON'T REORDER 499 case SELF_CLOSING_START_TAG: 500 c = read(); 501 switch (c) { 502 case -1: 503 break stateloop; 504 case '>': 505 if (handleTag()) { 506 break stateloop; 507 } 508 state = MetaScanner.DATA; 509 continue stateloop; 510 default: 511 state = MetaScanner.BEFORE_ATTRIBUTE_NAME; 512 reconsume = true; 513 continue stateloop; 514 } 515 // XXX reorder point 516 case ATTRIBUTE_VALUE_UNQUOTED: 517 for (;;) { 518 if (reconsume) { 519 reconsume = false; 520 } else { 521 c = read(); 522 } 523 switch (c) { 524 case -1: 525 break stateloop; 526 case ' ': 527 case '\t': 528 case '\n': 529 530 case '\u000C': 531 handleAttributeValue(); 532 state = MetaScanner.BEFORE_ATTRIBUTE_NAME; 533 continue stateloop; 534 case '>': 535 handleAttributeValue(); 536 if (handleTag()) { 537 break stateloop; 538 } 539 state = MetaScanner.DATA; 540 continue stateloop; 541 default: 542 handleCharInAttributeValue(c); 543 continue; 544 } 545 } 546 // XXX reorder point 547 case AFTER_ATTRIBUTE_NAME: 548 for (;;) { 549 c = read(); 550 switch (c) { 551 case -1: 552 break stateloop; 553 case ' ': 554 case '\t': 555 case '\n': 556 case '\u000C': 557 continue; 558 case '/': 559 handleAttributeValue(); 560 state = MetaScanner.SELF_CLOSING_START_TAG; 561 continue stateloop; 562 case '=': 563 strBufLen = 0; 564 contentTypeIndex = 0; 565 state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; 566 continue stateloop; 567 case '>': 568 handleAttributeValue(); 569 if (handleTag()) { 570 break stateloop; 571 } 572 state = MetaScanner.DATA; 573 continue stateloop; 574 case 'c': 575 case 'C': 576 contentIndex = 0; 577 charsetIndex = 0; 578 state = MetaScanner.ATTRIBUTE_NAME; 579 continue stateloop; 580 default: 581 contentIndex = -1; 582 charsetIndex = -1; 583 state = MetaScanner.ATTRIBUTE_NAME; 584 continue stateloop; 585 } 586 } 587 // XXX reorder point 588 case MARKUP_DECLARATION_OPEN: 589 markupdeclarationopenloop: for (;;) { 590 c = read(); 591 switch (c) { 592 case -1: 593 break stateloop; 594 case '-': 595 state = MetaScanner.MARKUP_DECLARATION_HYPHEN; 596 break markupdeclarationopenloop; 597 // continue stateloop; 598 default: 599 state = MetaScanner.SCAN_UNTIL_GT; 600 reconsume = true; 601 continue stateloop; 602 } 603 } 604 // FALLTHRU DON'T REORDER 605 case MARKUP_DECLARATION_HYPHEN: 606 markupdeclarationhyphenloop: for (;;) { 607 c = read(); 608 switch (c) { 609 case -1: 610 break stateloop; 611 case '-': 612 state = MetaScanner.COMMENT_START; 613 break markupdeclarationhyphenloop; 614 // continue stateloop; 615 default: 616 state = MetaScanner.SCAN_UNTIL_GT; 617 reconsume = true; 618 continue stateloop; 619 } 620 } 621 // FALLTHRU DON'T REORDER 622 case COMMENT_START: 623 commentstartloop: for (;;) { 624 c = read(); 625 switch (c) { 626 case -1: 627 break stateloop; 628 case '-': 629 state = MetaScanner.COMMENT_START_DASH; 630 continue stateloop; 631 case '>': 632 state = MetaScanner.DATA; 633 continue stateloop; 634 default: 635 state = MetaScanner.COMMENT; 636 break commentstartloop; 637 // continue stateloop; 638 } 639 } 640 // FALLTHRU DON'T REORDER 641 case COMMENT: 642 commentloop: for (;;) { 643 c = read(); 644 switch (c) { 645 case -1: 646 break stateloop; 647 case '-': 648 state = MetaScanner.COMMENT_END_DASH; 649 break commentloop; 650 // continue stateloop; 651 default: 652 continue; 653 } 654 } 655 // FALLTHRU DON'T REORDER 656 case COMMENT_END_DASH: 657 commentenddashloop: for (;;) { 658 c = read(); 659 switch (c) { 660 case -1: 661 break stateloop; 662 case '-': 663 state = MetaScanner.COMMENT_END; 664 break commentenddashloop; 665 // continue stateloop; 666 default: 667 state = MetaScanner.COMMENT; 668 continue stateloop; 669 } 670 } 671 // FALLTHRU DON'T REORDER 672 case COMMENT_END: 673 for (;;) { 674 c = read(); 675 switch (c) { 676 case -1: 677 break stateloop; 678 case '>': 679 state = MetaScanner.DATA; 680 continue stateloop; 681 case '-': 682 continue; 683 default: 684 state = MetaScanner.COMMENT; 685 continue stateloop; 686 } 687 } 688 // XXX reorder point 689 case COMMENT_START_DASH: 690 c = read(); 691 switch (c) { 692 case -1: 693 break stateloop; 694 case '-': 695 state = MetaScanner.COMMENT_END; 696 continue stateloop; 697 case '>': 698 state = MetaScanner.DATA; 699 continue stateloop; 700 default: 701 state = MetaScanner.COMMENT; 702 continue stateloop; 703 } 704 // XXX reorder point 705 case ATTRIBUTE_VALUE_SINGLE_QUOTED: 706 for (;;) { 707 if (reconsume) { 708 reconsume = false; 709 } else { 710 c = read(); 711 } 712 switch (c) { 713 case -1: 714 break stateloop; 715 case '\'': 716 handleAttributeValue(); 717 state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; 718 continue stateloop; 719 default: 720 handleCharInAttributeValue(c); 721 continue; 722 } 723 } 724 // XXX reorder point 725 case SCAN_UNTIL_GT: 726 for (;;) { 727 if (reconsume) { 728 reconsume = false; 729 } else { 730 c = read(); 731 } 732 switch (c) { 733 case -1: 734 break stateloop; 735 case '>': 736 state = MetaScanner.DATA; 737 continue stateloop; 738 default: 739 continue; 740 } 741 } 742 } 743 } 744 stateSave = state; 745 } 746 747 private void handleCharInAttributeValue(int c) { 748 if (metaState == A) { 749 if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) { 750 addToBuffer(c); 751 } else if (httpEquivIndex == HTTP_EQUIV.length) { 752 if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) { 753 ++contentTypeIndex; 754 } else { 755 contentTypeIndex = Integer.MAX_VALUE; 756 } 757 } 758 } 759 } 760 761 @Inline private int toAsciiLowerCase(int c) { 762 if (c >= 'A' && c <= 'Z') { 763 return c + 0x20; 764 } 765 return c; 766 } 767 768 /** 769 * Adds a character to the accumulation buffer. 770 * @param c the character to add 771 */ 772 private void addToBuffer(int c) { 773 if (strBufLen == strBuf.length) { 774 char[] newBuf = new char[strBuf.length + (strBuf.length << 1)]; 775 System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length); 776 strBuf = newBuf; 777 } 778 strBuf[strBufLen++] = (char)c; 779 } 780 781 /** 782 * Attempts to extract a charset name from the accumulation buffer. 783 * @return <code>true</code> if successful 784 * @throws SAXException 785 */ 786 private void handleAttributeValue() throws SAXException { 787 if (metaState != A) { 788 return; 789 } 790 if (contentIndex == CONTENT.length && content == null) { 791 content = Portability.newStringFromBuffer(strBuf, 0, strBufLen); 792 return; 793 } 794 if (charsetIndex == CHARSET.length && charset == null) { 795 charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen); 796 return; 797 } 798 if (httpEquivIndex == HTTP_EQUIV.length 799 && httpEquivState == HTTP_EQUIV_NOT_SEEN) { 800 httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE 801 : HTTP_EQUIV_OTHER; 802 return; 803 } 804 } 805 806 private boolean handleTag() throws SAXException { 807 boolean stop = handleTagInner(); 808 Portability.releaseString(content); 809 content = null; 810 Portability.releaseString(charset); 811 charset = null; 812 httpEquivState = HTTP_EQUIV_NOT_SEEN; 813 return stop; 814 } 815 816 private boolean handleTagInner() throws SAXException { 817 if (charset != null && tryCharset(charset)) { 818 return true; 819 } 820 if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) { 821 String extract = TreeBuilder.extractCharsetFromContent(content); 822 if (extract == null) { 823 return false; 824 } 825 boolean success = tryCharset(extract); 826 Portability.releaseString(extract); 827 return success; 828 } 829 return false; 830 } 831 832 /** 833 * Tries to switch to an encoding. 834 * 835 * @param encoding 836 * @return <code>true</code> if successful 837 * @throws SAXException 838 */ 839 protected abstract boolean tryCharset(String encoding) throws SAXException; 840 841 }