001 /* 002 * Copyright (c) 2005-2007 Henri Sivonen 003 * Copyright (c) 2007-2010 Mozilla Foundation 004 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 005 * Foundation, and Opera Software ASA. 006 * 007 * Permission is hereby granted, free of charge, to any person obtaining a 008 * copy of this software and associated documentation files (the "Software"), 009 * to deal in the Software without restriction, including without limitation 010 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 011 * and/or sell copies of the Software, and to permit persons to whom the 012 * Software is furnished to do so, subject to the following conditions: 013 * 014 * The above copyright notice and this permission notice shall be included in 015 * all copies or substantial portions of the Software. 016 * 017 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 018 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 019 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 020 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 021 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 022 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 023 * DEALINGS IN THE SOFTWARE. 024 */ 025 026 /* 027 * The comments following this one that use the same comment syntax as this 028 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 029 * amended as of June 18 2008 and May 31 2010. 030 * That document came with this statement: 031 * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and 032 * Opera Software ASA. You are granted a license to use, reproduce and 033 * create derivative works of this document." 
034 */ 035 036 package nu.validator.htmlparser.impl; 037 038 import nu.validator.htmlparser.annotation.Auto; 039 import nu.validator.htmlparser.annotation.CharacterName; 040 import nu.validator.htmlparser.annotation.Const; 041 import nu.validator.htmlparser.annotation.Inline; 042 import nu.validator.htmlparser.annotation.Local; 043 import nu.validator.htmlparser.annotation.NoLength; 044 import nu.validator.htmlparser.common.EncodingDeclarationHandler; 045 import nu.validator.htmlparser.common.Interner; 046 import nu.validator.htmlparser.common.TokenHandler; 047 import nu.validator.htmlparser.common.XmlViolationPolicy; 048 049 import org.xml.sax.ErrorHandler; 050 import org.xml.sax.Locator; 051 import org.xml.sax.SAXException; 052 import org.xml.sax.SAXParseException; 053 054 /** 055 * An implementation of 056 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html 057 * 058 * This class implements the <code>Locator</code> interface. This is not an 059 * incidental implementation detail: Users of this class are encouraged to make 060 * use of the <code>Locator</code> nature. 061 * 062 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer 063 * can be configured to treat these conditions as fatal or to coerce the infoset 064 * to something that XML 1.0 allows. 
065 * 066 * @version $Id$ 067 * @author hsivonen 068 */ 069 public class Tokenizer implements Locator { 070 071 private static final int DATA_AND_RCDATA_MASK = ~1; 072 073 public static final int DATA = 0; 074 075 public static final int RCDATA = 1; 076 077 public static final int SCRIPT_DATA = 2; 078 079 public static final int RAWTEXT = 3; 080 081 public static final int SCRIPT_DATA_ESCAPED = 4; 082 083 public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5; 084 085 public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6; 086 087 public static final int ATTRIBUTE_VALUE_UNQUOTED = 7; 088 089 public static final int PLAINTEXT = 8; 090 091 public static final int TAG_OPEN = 9; 092 093 public static final int CLOSE_TAG_OPEN = 10; 094 095 public static final int TAG_NAME = 11; 096 097 public static final int BEFORE_ATTRIBUTE_NAME = 12; 098 099 public static final int ATTRIBUTE_NAME = 13; 100 101 public static final int AFTER_ATTRIBUTE_NAME = 14; 102 103 public static final int BEFORE_ATTRIBUTE_VALUE = 15; 104 105 public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16; 106 107 public static final int BOGUS_COMMENT = 17; 108 109 public static final int MARKUP_DECLARATION_OPEN = 18; 110 111 public static final int DOCTYPE = 19; 112 113 public static final int BEFORE_DOCTYPE_NAME = 20; 114 115 public static final int DOCTYPE_NAME = 21; 116 117 public static final int AFTER_DOCTYPE_NAME = 22; 118 119 public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23; 120 121 public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24; 122 123 public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25; 124 125 public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26; 126 127 public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27; 128 129 public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28; 130 131 public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29; 132 133 public static final int 
AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30; 134 135 public static final int BOGUS_DOCTYPE = 31; 136 137 public static final int COMMENT_START = 32; 138 139 public static final int COMMENT_START_DASH = 33; 140 141 public static final int COMMENT = 34; 142 143 public static final int COMMENT_END_DASH = 35; 144 145 public static final int COMMENT_END = 36; 146 147 public static final int COMMENT_END_BANG = 37; 148 149 public static final int NON_DATA_END_TAG_NAME = 38; 150 151 public static final int MARKUP_DECLARATION_HYPHEN = 39; 152 153 public static final int MARKUP_DECLARATION_OCTYPE = 40; 154 155 public static final int DOCTYPE_UBLIC = 41; 156 157 public static final int DOCTYPE_YSTEM = 42; 158 159 public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43; 160 161 public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44; 162 163 public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45; 164 165 public static final int CONSUME_CHARACTER_REFERENCE = 46; 166 167 public static final int CONSUME_NCR = 47; 168 169 public static final int CHARACTER_REFERENCE_TAIL = 48; 170 171 public static final int HEX_NCR_LOOP = 49; 172 173 public static final int DECIMAL_NRC_LOOP = 50; 174 175 public static final int HANDLE_NCR_VALUE = 51; 176 177 public static final int HANDLE_NCR_VALUE_RECONSUME = 52; 178 179 public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53; 180 181 public static final int SELF_CLOSING_START_TAG = 54; 182 183 public static final int CDATA_START = 55; 184 185 public static final int CDATA_SECTION = 56; 186 187 public static final int CDATA_RSQB = 57; 188 189 public static final int CDATA_RSQB_RSQB = 58; 190 191 public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59; 192 193 public static final int SCRIPT_DATA_ESCAPE_START = 60; 194 195 public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61; 196 197 public static final int SCRIPT_DATA_ESCAPED_DASH = 62; 198 199 public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63; 200 201 public 
static final int BOGUS_COMMENT_HYPHEN = 64; 202 203 public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65; 204 205 public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66; 206 207 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67; 208 209 public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68; 210 211 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69; 212 213 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70; 214 215 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71; 216 217 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72; 218 219 /** 220 * Magic value for UTF-16 operations. 221 */ 222 private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10)); 223 224 /** 225 * UTF-16 code unit array containing less than and greater than for emitting 226 * those characters on certain parse errors. 227 */ 228 private static final @NoLength char[] LT_GT = { '<', '>' }; 229 230 /** 231 * UTF-16 code unit array containing less than and solidus for emitting 232 * those characters on certain parse errors. 233 */ 234 private static final @NoLength char[] LT_SOLIDUS = { '<', '/' }; 235 236 /** 237 * UTF-16 code unit array containing ]] for emitting those characters on 238 * state transitions. 239 */ 240 private static final @NoLength char[] RSQB_RSQB = { ']', ']' }; 241 242 /** 243 * Array version of U+FFFD. 244 */ 245 private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' }; 246 247 // [NOCPP[ 248 249 /** 250 * Array version of space. 251 */ 252 private static final @NoLength char[] SPACE = { ' ' }; 253 254 // ]NOCPP] 255 256 /** 257 * Array version of line feed. 258 */ 259 private static final @NoLength char[] LF = { '\n' }; 260 261 /** 262 * Buffer growth parameter. 
263 */ 264 private static final int BUFFER_GROW_BY = 1024; 265 266 /** 267 * "CDATA[" as <code>char[]</code> 268 */ 269 private static final @NoLength char[] CDATA_LSQB = "CDATA[".toCharArray(); 270 271 /** 272 * "octype" as <code>char[]</code> 273 */ 274 private static final @NoLength char[] OCTYPE = "octype".toCharArray(); 275 276 /** 277 * "ublic" as <code>char[]</code> 278 */ 279 private static final @NoLength char[] UBLIC = "ublic".toCharArray(); 280 281 /** 282 * "ystem" as <code>char[]</code> 283 */ 284 private static final @NoLength char[] YSTEM = "ystem".toCharArray(); 285 286 private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' }; 287 288 private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' }; 289 290 private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' }; 291 292 private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't', 293 'e', 'x', 't' }; 294 295 private static final char[] XMP_ARR = { 'x', 'm', 'p' }; 296 297 private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r', 298 'e', 'a' }; 299 300 private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' }; 301 302 private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e', 303 'd' }; 304 305 private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i', 306 'p', 't' }; 307 308 private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm', 309 'e', 's' }; 310 311 /** 312 * The token handler. 313 */ 314 protected final TokenHandler tokenHandler; 315 316 protected EncodingDeclarationHandler encodingDeclarationHandler; 317 318 // [NOCPP[ 319 320 /** 321 * The error handler. 322 */ 323 protected ErrorHandler errorHandler; 324 325 // ]NOCPP] 326 327 /** 328 * Whether the previous char read was CR. 
329 */ 330 protected boolean lastCR; 331 332 protected int stateSave; 333 334 private int returnStateSave; 335 336 protected int index; 337 338 private boolean forceQuirks; 339 340 private char additional; 341 342 private int entCol; 343 344 private int firstCharKey; 345 346 private int lo; 347 348 private int hi; 349 350 private int candidate; 351 352 private int strBufMark; 353 354 private int prevValue; 355 356 protected int value; 357 358 private boolean seenDigits; 359 360 protected int cstart; 361 362 /** 363 * The SAX public id for the resource being tokenized. (Only passed to back 364 * as part of locator data.) 365 */ 366 private String publicId; 367 368 /** 369 * The SAX system id for the resource being tokenized. (Only passed to back 370 * as part of locator data.) 371 */ 372 private String systemId; 373 374 /** 375 * Buffer for short identifiers. 376 */ 377 private @Auto char[] strBuf; 378 379 /** 380 * Number of significant <code>char</code>s in <code>strBuf</code>. 381 */ 382 private int strBufLen; 383 384 /** 385 * <code>-1</code> to indicate that <code>strBuf</code> is used or otherwise 386 * an offset to the main buffer. 387 */ 388 // private int strBufOffset = -1; 389 /** 390 * Buffer for long strings. 391 */ 392 private @Auto char[] longStrBuf; 393 394 /** 395 * Number of significant <code>char</code>s in <code>longStrBuf</code>. 396 */ 397 private int longStrBufLen; 398 399 /** 400 * <code>-1</code> to indicate that <code>longStrBuf</code> is used or 401 * otherwise an offset to the main buffer. 402 */ 403 // private int longStrBufOffset = -1; 404 405 /** 406 * Buffer for expanding NCRs falling into the Basic Multilingual Plane. 407 */ 408 private final @Auto char[] bmpChar; 409 410 /** 411 * Buffer for expanding astral NCRs. 412 */ 413 private final @Auto char[] astralChar; 414 415 /** 416 * The element whose end tag closes the current CDATA or RCDATA element. 
417 */ 418 protected ElementName endTagExpectation = null; 419 420 private char[] endTagExpectationAsArray; // not @Auto! 421 422 /** 423 * <code>true</code> if tokenizing an end tag 424 */ 425 protected boolean endTag; 426 427 /** 428 * The current tag token name. 429 */ 430 private ElementName tagName = null; 431 432 /** 433 * The current attribute name. 434 */ 435 protected AttributeName attributeName = null; 436 437 // [NOCPP[ 438 439 /** 440 * Whether comment tokens are emitted. 441 */ 442 private boolean wantsComments = false; 443 444 /** 445 * <code>true</code> when HTML4-specific additional errors are requested. 446 */ 447 protected boolean html4; 448 449 /** 450 * Whether the stream is past the first 512 bytes. 451 */ 452 private boolean metaBoundaryPassed; 453 454 // ]NOCPP] 455 456 /** 457 * The name of the current doctype token. 458 */ 459 private @Local String doctypeName; 460 461 /** 462 * The public id of the current doctype token. 463 */ 464 private String publicIdentifier; 465 466 /** 467 * The system id of the current doctype token. 468 */ 469 private String systemIdentifier; 470 471 /** 472 * The attribute holder. 473 */ 474 private HtmlAttributes attributes; 475 476 // [NOCPP[ 477 478 /** 479 * The policy for vertical tab and form feed. 480 */ 481 private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET; 482 483 /** 484 * The policy for comments. 
485 */ 486 private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET; 487 488 private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET; 489 490 private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET; 491 492 private boolean html4ModeCompatibleWithXhtml1Schemata; 493 494 private final boolean newAttributesEachTime; 495 496 // ]NOCPP] 497 498 private int mappingLangToXmlLang; 499 500 private boolean shouldSuspend; 501 502 protected boolean confident; 503 504 private int line; 505 506 private Interner interner; 507 508 // [NOCPP[ 509 510 protected LocatorImpl ampersandLocation; 511 512 public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) { 513 this.tokenHandler = tokenHandler; 514 this.encodingDeclarationHandler = null; 515 this.newAttributesEachTime = newAttributesEachTime; 516 this.bmpChar = new char[1]; 517 this.astralChar = new char[2]; 518 this.tagName = null; 519 this.attributeName = null; 520 this.doctypeName = null; 521 this.publicIdentifier = null; 522 this.systemIdentifier = null; 523 this.attributes = null; 524 } 525 526 // ]NOCPP] 527 528 /** 529 * The constructor. 
530 * 531 * @param tokenHandler 532 * the handler for receiving tokens 533 */ 534 public Tokenizer(TokenHandler tokenHandler) { 535 this.tokenHandler = tokenHandler; 536 this.encodingDeclarationHandler = null; 537 // [NOCPP[ 538 this.newAttributesEachTime = false; 539 // ]NOCPP] 540 this.bmpChar = new char[1]; 541 this.astralChar = new char[2]; 542 this.tagName = null; 543 this.attributeName = null; 544 this.doctypeName = null; 545 this.publicIdentifier = null; 546 this.systemIdentifier = null; 547 this.attributes = null; 548 } 549 550 public void setInterner(Interner interner) { 551 this.interner = interner; 552 } 553 554 public void initLocation(String newPublicId, String newSystemId) { 555 this.systemId = newSystemId; 556 this.publicId = newPublicId; 557 558 } 559 560 // [NOCPP[ 561 562 /** 563 * Returns the mappingLangToXmlLang. 564 * 565 * @return the mappingLangToXmlLang 566 */ 567 public boolean isMappingLangToXmlLang() { 568 return mappingLangToXmlLang == AttributeName.HTML_LANG; 569 } 570 571 /** 572 * Sets the mappingLangToXmlLang. 573 * 574 * @param mappingLangToXmlLang 575 * the mappingLangToXmlLang to set 576 */ 577 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) { 578 this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG 579 : AttributeName.HTML; 580 } 581 582 /** 583 * Sets the error handler. 584 * 585 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler) 586 */ 587 public void setErrorHandler(ErrorHandler eh) { 588 this.errorHandler = eh; 589 } 590 591 public ErrorHandler getErrorHandler() { 592 return this.errorHandler; 593 } 594 595 /** 596 * Sets the commentPolicy. 597 * 598 * @param commentPolicy 599 * the commentPolicy to set 600 */ 601 public void setCommentPolicy(XmlViolationPolicy commentPolicy) { 602 this.commentPolicy = commentPolicy; 603 } 604 605 /** 606 * Sets the contentNonXmlCharPolicy. 
607 * 608 * @param contentNonXmlCharPolicy 609 * the contentNonXmlCharPolicy to set 610 */ 611 public void setContentNonXmlCharPolicy( 612 XmlViolationPolicy contentNonXmlCharPolicy) { 613 if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) { 614 throw new IllegalArgumentException( 615 "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW."); 616 } 617 } 618 619 /** 620 * Sets the contentSpacePolicy. 621 * 622 * @param contentSpacePolicy 623 * the contentSpacePolicy to set 624 */ 625 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) { 626 this.contentSpacePolicy = contentSpacePolicy; 627 } 628 629 /** 630 * Sets the xmlnsPolicy. 631 * 632 * @param xmlnsPolicy 633 * the xmlnsPolicy to set 634 */ 635 public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) { 636 if (xmlnsPolicy == XmlViolationPolicy.FATAL) { 637 throw new IllegalArgumentException("Can't use FATAL here."); 638 } 639 this.xmlnsPolicy = xmlnsPolicy; 640 } 641 642 public void setNamePolicy(XmlViolationPolicy namePolicy) { 643 this.namePolicy = namePolicy; 644 } 645 646 /** 647 * Sets the html4ModeCompatibleWithXhtml1Schemata. 648 * 649 * @param html4ModeCompatibleWithXhtml1Schemata 650 * the html4ModeCompatibleWithXhtml1Schemata to set 651 */ 652 public void setHtml4ModeCompatibleWithXhtml1Schemata( 653 boolean html4ModeCompatibleWithXhtml1Schemata) { 654 this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata; 655 } 656 657 // ]NOCPP] 658 659 // For the token handler to call 660 /** 661 * Sets the tokenizer state and the associated element name. This should 662 * only ever used to put the tokenizer into one of the states that have 663 * a special end tag expectation. 
664 * 665 * @param specialTokenizerState 666 * the tokenizer state to set 667 * @param endTagExpectation 668 * the expected end tag for transitioning back to normal 669 */ 670 public void setStateAndEndTagExpectation(int specialTokenizerState, 671 @Local String endTagExpectation) { 672 this.stateSave = specialTokenizerState; 673 if (specialTokenizerState == Tokenizer.DATA) { 674 return; 675 } 676 @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation); 677 this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0, 678 asArray.length, interner); 679 endTagExpectationToArray(); 680 } 681 682 /** 683 * Sets the tokenizer state and the associated element name. This should 684 * only ever used to put the tokenizer into one of the states that have 685 * a special end tag expectation. 686 * 687 * @param specialTokenizerState 688 * the tokenizer state to set 689 * @param endTagExpectation 690 * the expected end tag for transitioning back to normal 691 */ 692 public void setStateAndEndTagExpectation(int specialTokenizerState, 693 ElementName endTagExpectation) { 694 this.stateSave = specialTokenizerState; 695 this.endTagExpectation = endTagExpectation; 696 endTagExpectationToArray(); 697 } 698 699 private void endTagExpectationToArray() { 700 switch (endTagExpectation.getGroup()) { 701 case TreeBuilder.TITLE: 702 endTagExpectationAsArray = TITLE_ARR; 703 return; 704 case TreeBuilder.SCRIPT: 705 endTagExpectationAsArray = SCRIPT_ARR; 706 return; 707 case TreeBuilder.STYLE: 708 endTagExpectationAsArray = STYLE_ARR; 709 return; 710 case TreeBuilder.PLAINTEXT: 711 endTagExpectationAsArray = PLAINTEXT_ARR; 712 return; 713 case TreeBuilder.XMP: 714 endTagExpectationAsArray = XMP_ARR; 715 return; 716 case TreeBuilder.TEXTAREA: 717 endTagExpectationAsArray = TEXTAREA_ARR; 718 return; 719 case TreeBuilder.IFRAME: 720 endTagExpectationAsArray = IFRAME_ARR; 721 return; 722 case TreeBuilder.NOEMBED: 723 endTagExpectationAsArray = NOEMBED_ARR; 724 return; 
725 case TreeBuilder.NOSCRIPT: 726 endTagExpectationAsArray = NOSCRIPT_ARR; 727 return; 728 case TreeBuilder.NOFRAMES: 729 endTagExpectationAsArray = NOFRAMES_ARR; 730 return; 731 default: 732 assert false: "Bad end tag expectation."; 733 return; 734 } 735 } 736 737 /** 738 * For C++ use only. 739 */ 740 public void setLineNumber(int line) { 741 this.line = line; 742 } 743 744 // start Locator impl 745 746 /** 747 * @see org.xml.sax.Locator#getLineNumber() 748 */ 749 @Inline public int getLineNumber() { 750 return line; 751 } 752 753 // [NOCPP[ 754 755 /** 756 * @see org.xml.sax.Locator#getColumnNumber() 757 */ 758 @Inline public int getColumnNumber() { 759 return -1; 760 } 761 762 /** 763 * @see org.xml.sax.Locator#getPublicId() 764 */ 765 public String getPublicId() { 766 return publicId; 767 } 768 769 /** 770 * @see org.xml.sax.Locator#getSystemId() 771 */ 772 public String getSystemId() { 773 return systemId; 774 } 775 776 // end Locator impl 777 778 // end public API 779 780 public void notifyAboutMetaBoundary() { 781 metaBoundaryPassed = true; 782 } 783 784 void turnOnAdditionalHtml4Errors() { 785 html4 = true; 786 } 787 788 // ]NOCPP] 789 790 HtmlAttributes emptyAttributes() { 791 // [NOCPP[ 792 if (newAttributesEachTime) { 793 return new HtmlAttributes(mappingLangToXmlLang); 794 } else { 795 // ]NOCPP] 796 return HtmlAttributes.EMPTY_ATTRIBUTES; 797 // [NOCPP[ 798 } 799 // ]NOCPP] 800 } 801 802 @Inline private void clearStrBufAndAppend(char c) { 803 strBuf[0] = c; 804 strBufLen = 1; 805 } 806 807 @Inline private void clearStrBuf() { 808 strBufLen = 0; 809 } 810 811 /** 812 * Appends to the smaller buffer. 
813 * 814 * @param c 815 * the UTF-16 code unit to append 816 */ 817 private void appendStrBuf(char c) { 818 if (strBufLen == strBuf.length) { 819 char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY]; 820 System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length); 821 strBuf = newBuf; 822 } 823 strBuf[strBufLen++] = c; 824 } 825 826 /** 827 * The smaller buffer as a String. Currently only used for error reporting. 828 * 829 * <p> 830 * C++ memory note: The return value must be released. 831 * 832 * @return the smaller buffer as a string 833 */ 834 protected String strBufToString() { 835 return Portability.newStringFromBuffer(strBuf, 0, strBufLen); 836 } 837 838 /** 839 * Returns the short buffer as a local name. The return value is released in 840 * emitDoctypeToken(). 841 * 842 * @return the smaller buffer as local name 843 */ 844 private void strBufToDoctypeName() { 845 doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen, 846 interner); 847 } 848 849 /** 850 * Emits the smaller buffer as character tokens. 851 * 852 * @throws SAXException 853 * if the token handler threw 854 */ 855 private void emitStrBuf() throws SAXException { 856 if (strBufLen > 0) { 857 tokenHandler.characters(strBuf, 0, strBufLen); 858 } 859 } 860 861 @Inline private void clearLongStrBuf() { 862 longStrBufLen = 0; 863 } 864 865 @Inline private void clearLongStrBufAndAppend(char c) { 866 longStrBuf[0] = c; 867 longStrBufLen = 1; 868 } 869 870 /** 871 * Appends to the larger buffer. 
872 * 873 * @param c 874 * the UTF-16 code unit to append 875 */ 876 private void appendLongStrBuf(char c) { 877 if (longStrBufLen == longStrBuf.length) { 878 char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)]; 879 System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length); 880 longStrBuf = newBuf; 881 } 882 longStrBuf[longStrBufLen++] = c; 883 } 884 885 @Inline private void appendSecondHyphenToBogusComment() throws SAXException { 886 // [NOCPP[ 887 switch (commentPolicy) { 888 case ALTER_INFOSET: 889 // detachLongStrBuf(); 890 appendLongStrBuf(' '); 891 // FALLTHROUGH 892 case ALLOW: 893 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 894 // ]NOCPP] 895 appendLongStrBuf('-'); 896 // [NOCPP[ 897 break; 898 case FATAL: 899 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 900 break; 901 } 902 // ]NOCPP] 903 } 904 905 // [NOCPP[ 906 private void maybeAppendSpaceToBogusComment() throws SAXException { 907 switch (commentPolicy) { 908 case ALTER_INFOSET: 909 // detachLongStrBuf(); 910 appendLongStrBuf(' '); 911 // FALLTHROUGH 912 case ALLOW: 913 warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment."); 914 break; 915 case FATAL: 916 fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment."); 917 break; 918 } 919 } 920 921 // ]NOCPP] 922 923 @Inline private void adjustDoubleHyphenAndAppendToLongStrBufAndErr(char c) 924 throws SAXException { 925 errConsecutiveHyphens(); 926 // [NOCPP[ 927 switch (commentPolicy) { 928 case ALTER_INFOSET: 929 // detachLongStrBuf(); 930 longStrBufLen--; 931 appendLongStrBuf(' '); 932 appendLongStrBuf('-'); 933 // FALLTHROUGH 934 case ALLOW: 935 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 936 // ]NOCPP] 937 appendLongStrBuf(c); 938 // [NOCPP[ 939 break; 940 case FATAL: 941 fatal("The document is not mappable to XML 1.0 due to two 
consecutive hyphens in a comment."); 942 break; 943 } 944 // ]NOCPP] 945 } 946 947 private void appendLongStrBuf(@NoLength char[] buffer, int offset, int length) { 948 int reqLen = longStrBufLen + length; 949 if (longStrBuf.length < reqLen) { 950 char[] newBuf = new char[reqLen + (reqLen >> 1)]; 951 System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length); 952 longStrBuf = newBuf; 953 } 954 System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length); 955 longStrBufLen = reqLen; 956 } 957 958 /** 959 * Append the contents of the smaller buffer to the larger one. 960 */ 961 @Inline private void appendStrBufToLongStrBuf() { 962 appendLongStrBuf(strBuf, 0, strBufLen); 963 } 964 965 /** 966 * The larger buffer as a string. 967 * 968 * <p> 969 * C++ memory note: The return value must be released. 970 * 971 * @return the larger buffer as a string 972 */ 973 private String longStrBufToString() { 974 return Portability.newStringFromBuffer(longStrBuf, 0, longStrBufLen); 975 } 976 977 /** 978 * Emits the current comment token. 979 * 980 * @param pos 981 * TODO 982 * 983 * @throws SAXException 984 */ 985 private void emitComment(int provisionalHyphens, int pos) 986 throws SAXException { 987 // [NOCPP[ 988 if (wantsComments) { 989 // ]NOCPP] 990 // if (longStrBufOffset != -1) { 991 // tokenHandler.comment(buf, longStrBufOffset, longStrBufLen 992 // - provisionalHyphens); 993 // } else { 994 tokenHandler.comment(longStrBuf, 0, longStrBufLen 995 - provisionalHyphens); 996 // } 997 // [NOCPP[ 998 } 999 // ]NOCPP] 1000 cstart = pos + 1; 1001 } 1002 1003 /** 1004 * Flushes coalesced character tokens. 
1005 * 1006 * @param buf 1007 * TODO 1008 * @param pos 1009 * TODO 1010 * 1011 * @throws SAXException 1012 */ 1013 protected void flushChars(@NoLength char[] buf, int pos) 1014 throws SAXException { 1015 if (pos > cstart) { 1016 tokenHandler.characters(buf, cstart, pos - cstart); 1017 } 1018 cstart = Integer.MAX_VALUE; 1019 } 1020 1021 /** 1022 * Reports an condition that would make the infoset incompatible with XML 1023 * 1.0 as fatal. 1024 * 1025 * @param message 1026 * the message 1027 * @throws SAXException 1028 * @throws SAXParseException 1029 */ 1030 public void fatal(String message) throws SAXException { 1031 SAXParseException spe = new SAXParseException(message, this); 1032 if (errorHandler != null) { 1033 errorHandler.fatalError(spe); 1034 } 1035 throw spe; 1036 } 1037 1038 /** 1039 * Reports a Parse Error. 1040 * 1041 * @param message 1042 * the message 1043 * @throws SAXException 1044 */ 1045 public void err(String message) throws SAXException { 1046 if (errorHandler == null) { 1047 return; 1048 } 1049 SAXParseException spe = new SAXParseException(message, this); 1050 errorHandler.error(spe); 1051 } 1052 1053 public void errTreeBuilder(String message) throws SAXException { 1054 ErrorHandler eh = null; 1055 if (tokenHandler instanceof TreeBuilder<?>) { 1056 TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler; 1057 eh = treeBuilder.getErrorHandler(); 1058 } 1059 if (eh == null) { 1060 eh = errorHandler; 1061 } 1062 if (eh == null) { 1063 return; 1064 } 1065 SAXParseException spe = new SAXParseException(message, this); 1066 eh.error(spe); 1067 } 1068 1069 /** 1070 * Reports a warning 1071 * 1072 * @param message 1073 * the message 1074 * @throws SAXException 1075 */ 1076 public void warn(String message) throws SAXException { 1077 if (errorHandler == null) { 1078 return; 1079 } 1080 SAXParseException spe = new SAXParseException(message, this); 1081 errorHandler.warning(spe); 1082 } 1083 1084 /** 1085 * 1086 */ 1087 private void resetAttributes() { 
1088 // [NOCPP[ 1089 if (newAttributesEachTime) { 1090 // ]NOCPP] 1091 attributes = null; 1092 // [NOCPP[ 1093 } else { 1094 attributes.clear(mappingLangToXmlLang); 1095 } 1096 // ]NOCPP] 1097 } 1098 1099 private void strBufToElementNameString() { 1100 // if (strBufOffset != -1) { 1101 // return ElementName.elementNameByBuffer(buf, strBufOffset, strBufLen); 1102 // } else { 1103 tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen, 1104 interner); 1105 // } 1106 } 1107 1108 private int emitCurrentTagToken(boolean selfClosing, int pos) 1109 throws SAXException { 1110 cstart = pos + 1; 1111 maybeErrSlashInEndTag(selfClosing); 1112 stateSave = Tokenizer.DATA; 1113 HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES 1114 : attributes); 1115 if (endTag) { 1116 /* 1117 * When an end tag token is emitted, the content model flag must be 1118 * switched to the PCDATA state. 1119 */ 1120 maybeErrAttributesOnEndTag(attrs); 1121 tokenHandler.endTag(tagName); 1122 Portability.delete(attributes); 1123 } else { 1124 tokenHandler.startTag(tagName, attrs, selfClosing); 1125 } 1126 tagName.release(); 1127 tagName = null; 1128 resetAttributes(); 1129 /* 1130 * The token handler may have called setStateAndEndTagExpectation 1131 * and changed stateSave since the start of this method. 
1132 */ 1133 return stateSave; 1134 } 1135 1136 private void attributeNameComplete() throws SAXException { 1137 // if (strBufOffset != -1) { 1138 // attributeName = AttributeName.nameByBuffer(buf, strBufOffset, 1139 // strBufLen, namePolicy != XmlViolationPolicy.ALLOW); 1140 // } else { 1141 attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen 1142 // [NOCPP[ 1143 , namePolicy != XmlViolationPolicy.ALLOW 1144 // ]NOCPP] 1145 , interner); 1146 // } 1147 1148 if (attributes == null) { 1149 attributes = new HtmlAttributes(mappingLangToXmlLang); 1150 } 1151 1152 /* 1153 * When the user agent leaves the attribute name state (and before 1154 * emitting the tag token, if appropriate), the complete attribute's 1155 * name must be compared to the other attributes on the same token; if 1156 * there is already an attribute on the token with the exact same name, 1157 * then this is a parse error and the new attribute must be dropped, 1158 * along with the value that gets associated with it (if any). 1159 */ 1160 if (attributes.contains(attributeName)) { 1161 errDuplicateAttribute(); 1162 attributeName.release(); 1163 attributeName = null; 1164 } 1165 } 1166 1167 private void addAttributeWithoutValue() throws SAXException { 1168 noteAttributeWithoutValue(); 1169 1170 // [NOCPP[ 1171 if (metaBoundaryPassed && AttributeName.CHARSET == attributeName 1172 && ElementName.META == tagName) { 1173 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes."); 1174 } 1175 // ]NOCPP] 1176 if (attributeName != null) { 1177 // [NOCPP[ 1178 if (html4) { 1179 if (attributeName.isBoolean()) { 1180 if (html4ModeCompatibleWithXhtml1Schemata) { 1181 attributes.addAttribute(attributeName, 1182 attributeName.getLocal(AttributeName.HTML), 1183 xmlnsPolicy); 1184 } else { 1185 attributes.addAttribute(attributeName, "", xmlnsPolicy); 1186 } 1187 } else { 1188 err("Attribute value omitted for a non-boolean attribute. 
(HTML4-only error.)"); 1189 attributes.addAttribute(attributeName, "", xmlnsPolicy); 1190 } 1191 } else { 1192 if (AttributeName.SRC == attributeName 1193 || AttributeName.HREF == attributeName) { 1194 warn("Attribute \u201C" 1195 + attributeName.getLocal(AttributeName.HTML) 1196 + "\u201D without an explicit value seen. The attribute may be dropped by IE7."); 1197 } 1198 // ]NOCPP] 1199 attributes.addAttribute(attributeName, 1200 Portability.newEmptyString() 1201 // [NOCPP[ 1202 , xmlnsPolicy 1203 // ]NOCPP] 1204 ); 1205 // [NOCPP[ 1206 } 1207 // ]NOCPP] 1208 attributeName = null; // attributeName has been adopted by the 1209 // |attributes| object 1210 } 1211 } 1212 1213 private void addAttributeWithValue() throws SAXException { 1214 // [NOCPP[ 1215 if (metaBoundaryPassed && ElementName.META == tagName 1216 && AttributeName.CHARSET == attributeName) { 1217 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes."); 1218 } 1219 // ]NOCPP] 1220 if (attributeName != null) { 1221 String val = longStrBufToString(); // Ownership transferred to 1222 // HtmlAttributes 1223 // [NOCPP[ 1224 if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata 1225 && attributeName.isCaseFolded()) { 1226 val = newAsciiLowerCaseStringFromString(val); 1227 } 1228 // ]NOCPP] 1229 attributes.addAttribute(attributeName, val 1230 // [NOCPP[ 1231 , xmlnsPolicy 1232 // ]NOCPP] 1233 ); 1234 attributeName = null; // attributeName has been adopted by the 1235 // |attributes| object 1236 } 1237 } 1238 1239 // [NOCPP[ 1240 1241 private static String newAsciiLowerCaseStringFromString(String str) { 1242 if (str == null) { 1243 return null; 1244 } 1245 char[] buf = new char[str.length()]; 1246 for (int i = 0; i < str.length(); i++) { 1247 char c = str.charAt(i); 1248 if (c >= 'A' && c <= 'Z') { 1249 c += 0x20; 1250 } 1251 buf[i] = c; 1252 } 1253 return new String(buf); 1254 } 1255 1256 protected void startErrorReporting() throws SAXException { 1257 1258 } 
1259 1260 // ]NOCPP] 1261 1262 public void start() throws SAXException { 1263 initializeWithoutStarting(); 1264 tokenHandler.startTokenization(this); 1265 // [NOCPP[ 1266 startErrorReporting(); 1267 // ]NOCPP] 1268 } 1269 1270 public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException { 1271 int state = stateSave; 1272 int returnState = returnStateSave; 1273 char c = '\u0000'; 1274 shouldSuspend = false; 1275 lastCR = false; 1276 1277 int start = buffer.getStart(); 1278 /** 1279 * The index of the last <code>char</code> read from <code>buf</code>. 1280 */ 1281 int pos = start - 1; 1282 1283 /** 1284 * The index of the first <code>char</code> in <code>buf</code> that is 1285 * part of a coalesced run of character tokens or 1286 * <code>Integer.MAX_VALUE</code> if there is not a current run being 1287 * coalesced. 1288 */ 1289 switch (state) { 1290 case DATA: 1291 case RCDATA: 1292 case SCRIPT_DATA: 1293 case PLAINTEXT: 1294 case RAWTEXT: 1295 case CDATA_SECTION: 1296 case SCRIPT_DATA_ESCAPED: 1297 case SCRIPT_DATA_ESCAPE_START: 1298 case SCRIPT_DATA_ESCAPE_START_DASH: 1299 case SCRIPT_DATA_ESCAPED_DASH: 1300 case SCRIPT_DATA_ESCAPED_DASH_DASH: 1301 case SCRIPT_DATA_DOUBLE_ESCAPE_START: 1302 case SCRIPT_DATA_DOUBLE_ESCAPED: 1303 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: 1304 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: 1305 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: 1306 case SCRIPT_DATA_DOUBLE_ESCAPE_END: 1307 cstart = start; 1308 break; 1309 default: 1310 cstart = Integer.MAX_VALUE; 1311 break; 1312 } 1313 1314 /** 1315 * The number of <code>char</code>s in <code>buf</code> that have 1316 * meaning. (The rest of the array is garbage and should not be 1317 * examined.) 
1318 */ 1319 pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, 1320 buffer.getEnd()); 1321 if (pos == buffer.getEnd()) { 1322 // exiting due to end of buffer 1323 buffer.setStart(pos); 1324 } else { 1325 buffer.setStart(pos + 1); 1326 } 1327 return lastCR; 1328 } 1329 1330 @SuppressWarnings("unused") private int stateLoop(int state, char c, 1331 int pos, @NoLength char[] buf, boolean reconsume, int returnState, 1332 int endPos) throws SAXException { 1333 /* 1334 * Idioms used in this code: 1335 * 1336 * 1337 * Consuming the next input character 1338 * 1339 * To consume the next input character, the code does this: if (++pos == 1340 * endPos) { break stateloop; } c = checkChar(buf, pos); 1341 * 1342 * 1343 * Staying in a state 1344 * 1345 * When there's a state that the tokenizer may stay in over multiple 1346 * input characters, the state has a wrapper |for(;;)| loop and staying 1347 * in the state continues the loop. 1348 * 1349 * 1350 * Switching to another state 1351 * 1352 * To switch to another state, the code sets the state variable to the 1353 * magic number of the new state. Then it either continues stateloop or 1354 * breaks out of the state's own wrapper loop if the target state is 1355 * right after the current state in source order. (This is a partial 1356 * workaround for Java's lack of goto.) 1357 * 1358 * 1359 * Reconsume support 1360 * 1361 * The spec sometimes says that an input character is reconsumed in 1362 * another state. If a state can ever be entered so that an input 1363 * character can be reconsumed in it, the state's code starts with an 1364 * |if (reconsume)| that sets reconsume to false and skips over the 1365 * normal code for consuming a new character. 1366 * 1367 * To reconsume the current character in another state, the code sets 1368 * |reconsume| to true and then switches to the other state. 1369 * 1370 * 1371 * Emitting character tokens 1372 * 1373 * This method emits character tokens lazily. 
Whenever a new range of 1374 * character tokens starts, the field cstart must be set to the start 1375 * index of the range. The flushChars() method must be called at the end 1376 * of a range to flush it. 1377 * 1378 * 1379 * U+0000 handling 1380 * 1381 * The various states have to handle the replacement of U+0000 with 1382 * U+FFFD. However, if U+0000 would be reconsumed in another state, the 1383 * replacement doesn't need to happen, because it's handled by the 1384 * reconsuming state. 1385 * 1386 * 1387 * LF handling 1388 * 1389 * Every state needs to increment the line number upon LF unless the LF 1390 * gets reconsumed by another state which increments the line number. 1391 * 1392 * 1393 * CR handling 1394 * 1395 * Every state needs to handle CR unless the CR gets reconsumed and is 1396 * handled by the reconsuming state. The CR needs to be handled as if it 1397 * were and LF, the lastCR field must be set to true and then this 1398 * method must return. The IO driver will then swallow the next 1399 * character if it is an LF to coalesce CRLF. 1400 */ 1401 stateloop: for (;;) { 1402 switch (state) { 1403 case DATA: 1404 dataloop: for (;;) { 1405 if (reconsume) { 1406 reconsume = false; 1407 } else { 1408 if (++pos == endPos) { 1409 break stateloop; 1410 } 1411 c = checkChar(buf, pos); 1412 } 1413 switch (c) { 1414 case '&': 1415 /* 1416 * U+0026 AMPERSAND (&) Switch to the character 1417 * reference in data state. 1418 */ 1419 flushChars(buf, pos); 1420 clearStrBufAndAppend(c); 1421 setAdditionalAndRememberAmpersandLocation('\u0000'); 1422 returnState = state; 1423 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 1424 continue stateloop; 1425 case '<': 1426 /* 1427 * U+003C LESS-THAN SIGN (<) Switch to the tag 1428 * open state. 
1429 */ 1430 flushChars(buf, pos); 1431 1432 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos); 1433 break dataloop; // FALL THROUGH continue 1434 // stateloop; 1435 case '\u0000': 1436 emitReplacementCharacter(buf, pos); 1437 continue; 1438 case '\r': 1439 emitCarriageReturn(buf, pos); 1440 break stateloop; 1441 case '\n': 1442 silentLineFeed(); 1443 default: 1444 /* 1445 * Anything else Emit the input character as a 1446 * character token. 1447 * 1448 * Stay in the data state. 1449 */ 1450 continue; 1451 } 1452 } 1453 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1454 case TAG_OPEN: 1455 tagopenloop: for (;;) { 1456 /* 1457 * The behavior of this state depends on the content 1458 * model flag. 1459 */ 1460 if (++pos == endPos) { 1461 break stateloop; 1462 } 1463 c = checkChar(buf, pos); 1464 /* 1465 * If the content model flag is set to the PCDATA state 1466 * Consume the next input character: 1467 */ 1468 if (c >= 'A' && c <= 'Z') { 1469 /* 1470 * U+0041 LATIN CAPITAL LETTER A through to U+005A 1471 * LATIN CAPITAL LETTER Z Create a new start tag 1472 * token, 1473 */ 1474 endTag = false; 1475 /* 1476 * set its tag name to the lowercase version of the 1477 * input character (add 0x0020 to the character's 1478 * code point), 1479 */ 1480 clearStrBufAndAppend((char) (c + 0x20)); 1481 /* then switch to the tag name state. */ 1482 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 1483 /* 1484 * (Don't emit the token yet; further details will 1485 * be filled in before it is emitted.) 1486 */ 1487 break tagopenloop; 1488 // continue stateloop; 1489 } else if (c >= 'a' && c <= 'z') { 1490 /* 1491 * U+0061 LATIN SMALL LETTER A through to U+007A 1492 * LATIN SMALL LETTER Z Create a new start tag 1493 * token, 1494 */ 1495 endTag = false; 1496 /* 1497 * set its tag name to the input character, 1498 */ 1499 clearStrBufAndAppend(c); 1500 /* then switch to the tag name state. 
*/ 1501 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 1502 /* 1503 * (Don't emit the token yet; further details will 1504 * be filled in before it is emitted.) 1505 */ 1506 break tagopenloop; 1507 // continue stateloop; 1508 } 1509 switch (c) { 1510 case '!': 1511 /* 1512 * U+0021 EXCLAMATION MARK (!) Switch to the 1513 * markup declaration open state. 1514 */ 1515 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos); 1516 continue stateloop; 1517 case '/': 1518 /* 1519 * U+002F SOLIDUS (/) Switch to the close tag 1520 * open state. 1521 */ 1522 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos); 1523 continue stateloop; 1524 case '?': 1525 /* 1526 * U+003F QUESTION MARK (?) Parse error. 1527 */ 1528 errProcessingInstruction(); 1529 /* 1530 * Switch to the bogus comment state. 1531 */ 1532 clearLongStrBufAndAppend(c); 1533 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1534 continue stateloop; 1535 case '>': 1536 /* 1537 * U+003E GREATER-THAN SIGN (>) Parse error. 1538 */ 1539 errLtGt(); 1540 /* 1541 * Emit a U+003C LESS-THAN SIGN character token 1542 * and a U+003E GREATER-THAN SIGN character 1543 * token. 1544 */ 1545 tokenHandler.characters(Tokenizer.LT_GT, 0, 2); 1546 /* Switch to the data state. */ 1547 cstart = pos + 1; 1548 state = transition(state, Tokenizer.DATA, reconsume, pos); 1549 continue stateloop; 1550 default: 1551 /* 1552 * Anything else Parse error. 1553 */ 1554 errBadCharAfterLt(c); 1555 /* 1556 * Emit a U+003C LESS-THAN SIGN character token 1557 */ 1558 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 1559 /* 1560 * and reconsume the current input character in 1561 * the data state. 
1562 */ 1563 cstart = pos; 1564 state = transition(state, Tokenizer.DATA, reconsume, pos); 1565 reconsume = true; 1566 continue stateloop; 1567 } 1568 } 1569 // FALL THROUGH DON'T REORDER 1570 case TAG_NAME: 1571 tagnameloop: for (;;) { 1572 if (++pos == endPos) { 1573 break stateloop; 1574 } 1575 c = checkChar(buf, pos); 1576 /* 1577 * Consume the next input character: 1578 */ 1579 switch (c) { 1580 case '\r': 1581 silentCarriageReturn(); 1582 strBufToElementNameString(); 1583 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1584 break stateloop; 1585 case '\n': 1586 silentLineFeed(); 1587 case ' ': 1588 case '\t': 1589 case '\u000C': 1590 /* 1591 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1592 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1593 * Switch to the before attribute name state. 1594 */ 1595 strBufToElementNameString(); 1596 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1597 break tagnameloop; 1598 // continue stateloop; 1599 case '/': 1600 /* 1601 * U+002F SOLIDUS (/) Switch to the self-closing 1602 * start tag state. 1603 */ 1604 strBufToElementNameString(); 1605 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1606 continue stateloop; 1607 case '>': 1608 /* 1609 * U+003E GREATER-THAN SIGN (>) Emit the current 1610 * tag token. 1611 */ 1612 strBufToElementNameString(); 1613 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1614 if (shouldSuspend) { 1615 break stateloop; 1616 } 1617 /* 1618 * Switch to the data state. 1619 */ 1620 continue stateloop; 1621 case '\u0000': 1622 c = '\uFFFD'; 1623 // fall thru 1624 default: 1625 if (c >= 'A' && c <= 'Z') { 1626 /* 1627 * U+0041 LATIN CAPITAL LETTER A through to 1628 * U+005A LATIN CAPITAL LETTER Z Append the 1629 * lowercase version of the current input 1630 * character (add 0x0020 to the character's 1631 * code point) to the current tag token's 1632 * tag name. 
1633 */ 1634 c += 0x20; 1635 } 1636 /* 1637 * Anything else Append the current input 1638 * character to the current tag token's tag 1639 * name. 1640 */ 1641 appendStrBuf(c); 1642 /* 1643 * Stay in the tag name state. 1644 */ 1645 continue; 1646 } 1647 } 1648 // FALLTHRU DON'T REORDER 1649 case BEFORE_ATTRIBUTE_NAME: 1650 beforeattributenameloop: for (;;) { 1651 if (reconsume) { 1652 reconsume = false; 1653 } else { 1654 if (++pos == endPos) { 1655 break stateloop; 1656 } 1657 c = checkChar(buf, pos); 1658 } 1659 /* 1660 * Consume the next input character: 1661 */ 1662 switch (c) { 1663 case '\r': 1664 silentCarriageReturn(); 1665 break stateloop; 1666 case '\n': 1667 silentLineFeed(); 1668 // fall thru 1669 case ' ': 1670 case '\t': 1671 case '\u000C': 1672 /* 1673 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1674 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1675 * in the before attribute name state. 1676 */ 1677 continue; 1678 case '/': 1679 /* 1680 * U+002F SOLIDUS (/) Switch to the self-closing 1681 * start tag state. 1682 */ 1683 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1684 continue stateloop; 1685 case '>': 1686 /* 1687 * U+003E GREATER-THAN SIGN (>) Emit the current 1688 * tag token. 1689 */ 1690 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1691 if (shouldSuspend) { 1692 break stateloop; 1693 } 1694 /* 1695 * Switch to the data state. 1696 */ 1697 continue stateloop; 1698 case '\u0000': 1699 c = '\uFFFD'; 1700 // fall thru 1701 case '\"': 1702 case '\'': 1703 case '<': 1704 case '=': 1705 /* 1706 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 1707 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS 1708 * SIGN (=) Parse error. 1709 */ 1710 errBadCharBeforeAttributeNameOrNull(c); 1711 /* 1712 * Treat it as per the "anything else" entry 1713 * below. 1714 */ 1715 default: 1716 /* 1717 * Anything else Start a new attribute in the 1718 * current tag token. 
1719 */ 1720 if (c >= 'A' && c <= 'Z') { 1721 /* 1722 * U+0041 LATIN CAPITAL LETTER A through to 1723 * U+005A LATIN CAPITAL LETTER Z Set that 1724 * attribute's name to the lowercase version 1725 * of the current input character (add 1726 * 0x0020 to the character's code point) 1727 */ 1728 c += 0x20; 1729 } 1730 /* 1731 * Set that attribute's name to the current 1732 * input character, 1733 */ 1734 clearStrBufAndAppend(c); 1735 /* 1736 * and its value to the empty string. 1737 */ 1738 // Will do later. 1739 /* 1740 * Switch to the attribute name state. 1741 */ 1742 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); 1743 break beforeattributenameloop; 1744 // continue stateloop; 1745 } 1746 } 1747 // FALLTHRU DON'T REORDER 1748 case ATTRIBUTE_NAME: 1749 attributenameloop: for (;;) { 1750 if (++pos == endPos) { 1751 break stateloop; 1752 } 1753 c = checkChar(buf, pos); 1754 /* 1755 * Consume the next input character: 1756 */ 1757 switch (c) { 1758 case '\r': 1759 silentCarriageReturn(); 1760 attributeNameComplete(); 1761 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); 1762 break stateloop; 1763 case '\n': 1764 silentLineFeed(); 1765 // fall thru 1766 case ' ': 1767 case '\t': 1768 case '\u000C': 1769 /* 1770 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1771 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1772 * Switch to the after attribute name state. 1773 */ 1774 attributeNameComplete(); 1775 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); 1776 continue stateloop; 1777 case '/': 1778 /* 1779 * U+002F SOLIDUS (/) Switch to the self-closing 1780 * start tag state. 1781 */ 1782 attributeNameComplete(); 1783 addAttributeWithoutValue(); 1784 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1785 continue stateloop; 1786 case '=': 1787 /* 1788 * U+003D EQUALS SIGN (=) Switch to the before 1789 * attribute value state. 
1790 */ 1791 attributeNameComplete(); 1792 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); 1793 break attributenameloop; 1794 // continue stateloop; 1795 case '>': 1796 /* 1797 * U+003E GREATER-THAN SIGN (>) Emit the current 1798 * tag token. 1799 */ 1800 attributeNameComplete(); 1801 addAttributeWithoutValue(); 1802 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1803 if (shouldSuspend) { 1804 break stateloop; 1805 } 1806 /* 1807 * Switch to the data state. 1808 */ 1809 continue stateloop; 1810 case '\u0000': 1811 c = '\uFFFD'; 1812 // fall thru 1813 case '\"': 1814 case '\'': 1815 case '<': 1816 /* 1817 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 1818 * (') U+003C LESS-THAN SIGN (<) Parse error. 1819 */ 1820 errQuoteOrLtInAttributeNameOrNull(c); 1821 /* 1822 * Treat it as per the "anything else" entry 1823 * below. 1824 */ 1825 default: 1826 if (c >= 'A' && c <= 'Z') { 1827 /* 1828 * U+0041 LATIN CAPITAL LETTER A through to 1829 * U+005A LATIN CAPITAL LETTER Z Append the 1830 * lowercase version of the current input 1831 * character (add 0x0020 to the character's 1832 * code point) to the current attribute's 1833 * name. 1834 */ 1835 c += 0x20; 1836 } 1837 /* 1838 * Anything else Append the current input 1839 * character to the current attribute's name. 1840 */ 1841 appendStrBuf(c); 1842 /* 1843 * Stay in the attribute name state. 
1844 */ 1845 continue; 1846 } 1847 } 1848 // FALLTHRU DON'T REORDER 1849 case BEFORE_ATTRIBUTE_VALUE: 1850 beforeattributevalueloop: for (;;) { 1851 if (++pos == endPos) { 1852 break stateloop; 1853 } 1854 c = checkChar(buf, pos); 1855 /* 1856 * Consume the next input character: 1857 */ 1858 switch (c) { 1859 case '\r': 1860 silentCarriageReturn(); 1861 break stateloop; 1862 case '\n': 1863 silentLineFeed(); 1864 // fall thru 1865 case ' ': 1866 case '\t': 1867 case '\u000C': 1868 /* 1869 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1870 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1871 * in the before attribute value state. 1872 */ 1873 continue; 1874 case '"': 1875 /* 1876 * U+0022 QUOTATION MARK (") Switch to the 1877 * attribute value (double-quoted) state. 1878 */ 1879 clearLongStrBuf(); 1880 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); 1881 break beforeattributevalueloop; 1882 // continue stateloop; 1883 case '&': 1884 /* 1885 * U+0026 AMPERSAND (&) Switch to the attribute 1886 * value (unquoted) state and reconsume this 1887 * input character. 1888 */ 1889 clearLongStrBuf(); 1890 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); 1891 noteUnquotedAttributeValue(); 1892 reconsume = true; 1893 continue stateloop; 1894 case '\'': 1895 /* 1896 * U+0027 APOSTROPHE (') Switch to the attribute 1897 * value (single-quoted) state. 1898 */ 1899 clearLongStrBuf(); 1900 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos); 1901 continue stateloop; 1902 case '>': 1903 /* 1904 * U+003E GREATER-THAN SIGN (>) Parse error. 1905 */ 1906 errAttributeValueMissing(); 1907 /* 1908 * Emit the current tag token. 1909 */ 1910 addAttributeWithoutValue(); 1911 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1912 if (shouldSuspend) { 1913 break stateloop; 1914 } 1915 /* 1916 * Switch to the data state. 
1917 */ 1918 continue stateloop; 1919 case '\u0000': 1920 c = '\uFFFD'; 1921 // fall thru 1922 case '<': 1923 case '=': 1924 case '`': 1925 /* 1926 * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN 1927 * (=) U+0060 GRAVE ACCENT (`) 1928 */ 1929 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c); 1930 /* 1931 * Treat it as per the "anything else" entry 1932 * below. 1933 */ 1934 default: 1935 // [NOCPP[ 1936 errHtml4NonNameInUnquotedAttribute(c); 1937 // ]NOCPP] 1938 /* 1939 * Anything else Append the current input 1940 * character to the current attribute's value. 1941 */ 1942 clearLongStrBufAndAppend(c); 1943 /* 1944 * Switch to the attribute value (unquoted) 1945 * state. 1946 */ 1947 1948 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); 1949 noteUnquotedAttributeValue(); 1950 continue stateloop; 1951 } 1952 } 1953 // FALLTHRU DON'T REORDER 1954 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 1955 attributevaluedoublequotedloop: for (;;) { 1956 if (reconsume) { 1957 reconsume = false; 1958 } else { 1959 if (++pos == endPos) { 1960 break stateloop; 1961 } 1962 c = checkChar(buf, pos); 1963 } 1964 /* 1965 * Consume the next input character: 1966 */ 1967 switch (c) { 1968 case '"': 1969 /* 1970 * U+0022 QUOTATION MARK (") Switch to the after 1971 * attribute value (quoted) state. 1972 */ 1973 addAttributeWithValue(); 1974 1975 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); 1976 break attributevaluedoublequotedloop; 1977 // continue stateloop; 1978 case '&': 1979 /* 1980 * U+0026 AMPERSAND (&) Switch to the character 1981 * reference in attribute value state, with the 1982 * additional allowed character being U+0022 1983 * QUOTATION MARK ("). 
1984 */ 1985 clearStrBufAndAppend(c); 1986 setAdditionalAndRememberAmpersandLocation('\"'); 1987 returnState = state; 1988 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 1989 continue stateloop; 1990 case '\r': 1991 appendLongStrBufCarriageReturn(); 1992 break stateloop; 1993 case '\n': 1994 appendLongStrBufLineFeed(); 1995 continue; 1996 case '\u0000': 1997 c = '\uFFFD'; 1998 // fall thru 1999 default: 2000 /* 2001 * Anything else Append the current input 2002 * character to the current attribute's value. 2003 */ 2004 appendLongStrBuf(c); 2005 /* 2006 * Stay in the attribute value (double-quoted) 2007 * state. 2008 */ 2009 continue; 2010 } 2011 } 2012 // FALLTHRU DON'T REORDER 2013 case AFTER_ATTRIBUTE_VALUE_QUOTED: 2014 afterattributevaluequotedloop: for (;;) { 2015 if (++pos == endPos) { 2016 break stateloop; 2017 } 2018 c = checkChar(buf, pos); 2019 /* 2020 * Consume the next input character: 2021 */ 2022 switch (c) { 2023 case '\r': 2024 silentCarriageReturn(); 2025 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2026 break stateloop; 2027 case '\n': 2028 silentLineFeed(); 2029 // fall thru 2030 case ' ': 2031 case '\t': 2032 case '\u000C': 2033 /* 2034 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2035 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 2036 * Switch to the before attribute name state. 2037 */ 2038 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2039 continue stateloop; 2040 case '/': 2041 /* 2042 * U+002F SOLIDUS (/) Switch to the self-closing 2043 * start tag state. 2044 */ 2045 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 2046 break afterattributevaluequotedloop; 2047 // continue stateloop; 2048 case '>': 2049 /* 2050 * U+003E GREATER-THAN SIGN (>) Emit the current 2051 * tag token. 
2052 */ 2053 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2054 if (shouldSuspend) { 2055 break stateloop; 2056 } 2057 /* 2058 * Switch to the data state. 2059 */ 2060 continue stateloop; 2061 default: 2062 /* 2063 * Anything else Parse error. 2064 */ 2065 errNoSpaceBetweenAttributes(); 2066 /* 2067 * Reconsume the character in the before 2068 * attribute name state. 2069 */ 2070 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2071 reconsume = true; 2072 continue stateloop; 2073 } 2074 } 2075 // FALLTHRU DON'T REORDER 2076 case SELF_CLOSING_START_TAG: 2077 if (++pos == endPos) { 2078 break stateloop; 2079 } 2080 c = checkChar(buf, pos); 2081 /* 2082 * Consume the next input character: 2083 */ 2084 switch (c) { 2085 case '>': 2086 /* 2087 * U+003E GREATER-THAN SIGN (>) Set the self-closing 2088 * flag of the current tag token. Emit the current 2089 * tag token. 2090 */ 2091 // [NOCPP[ 2092 errHtml4XmlVoidSyntax(); 2093 // ]NOCPP] 2094 state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos); 2095 if (shouldSuspend) { 2096 break stateloop; 2097 } 2098 /* 2099 * Switch to the data state. 2100 */ 2101 continue stateloop; 2102 default: 2103 /* Anything else Parse error. */ 2104 errSlashNotFollowedByGt(); 2105 /* 2106 * Reconsume the character in the before attribute 2107 * name state. 
2108 */ 2109 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2110 reconsume = true; 2111 continue stateloop; 2112 } 2113 // XXX reorder point 2114 case ATTRIBUTE_VALUE_UNQUOTED: 2115 for (;;) { 2116 if (reconsume) { 2117 reconsume = false; 2118 } else { 2119 if (++pos == endPos) { 2120 break stateloop; 2121 } 2122 c = checkChar(buf, pos); 2123 } 2124 /* 2125 * Consume the next input character: 2126 */ 2127 switch (c) { 2128 case '\r': 2129 silentCarriageReturn(); 2130 addAttributeWithValue(); 2131 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2132 break stateloop; 2133 case '\n': 2134 silentLineFeed(); 2135 // fall thru 2136 case ' ': 2137 case '\t': 2138 case '\u000C': 2139 /* 2140 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2141 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 2142 * Switch to the before attribute name state. 2143 */ 2144 addAttributeWithValue(); 2145 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2146 continue stateloop; 2147 case '&': 2148 /* 2149 * U+0026 AMPERSAND (&) Switch to the character 2150 * reference in attribute value state, with the 2151 * additional allowed character being U+003E 2152 * GREATER-THAN SIGN (>) 2153 */ 2154 clearStrBufAndAppend(c); 2155 setAdditionalAndRememberAmpersandLocation('>'); 2156 returnState = state; 2157 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 2158 continue stateloop; 2159 case '>': 2160 /* 2161 * U+003E GREATER-THAN SIGN (>) Emit the current 2162 * tag token. 2163 */ 2164 addAttributeWithValue(); 2165 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2166 if (shouldSuspend) { 2167 break stateloop; 2168 } 2169 /* 2170 * Switch to the data state. 
2171 */ 2172 continue stateloop; 2173 case '\u0000': 2174 c = '\uFFFD'; 2175 // fall thru 2176 case '<': 2177 case '\"': 2178 case '\'': 2179 case '=': 2180 case '`': 2181 /* 2182 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 2183 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS 2184 * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error. 2185 */ 2186 errUnquotedAttributeValOrNull(c); 2187 /* 2188 * Treat it as per the "anything else" entry 2189 * below. 2190 */ 2191 // fall through 2192 default: 2193 // [NOCPP] 2194 errHtml4NonNameInUnquotedAttribute(c); 2195 // ]NOCPP] 2196 /* 2197 * Anything else Append the current input 2198 * character to the current attribute's value. 2199 */ 2200 appendLongStrBuf(c); 2201 /* 2202 * Stay in the attribute value (unquoted) state. 2203 */ 2204 continue; 2205 } 2206 } 2207 // XXX reorder point 2208 case AFTER_ATTRIBUTE_NAME: 2209 for (;;) { 2210 if (++pos == endPos) { 2211 break stateloop; 2212 } 2213 c = checkChar(buf, pos); 2214 /* 2215 * Consume the next input character: 2216 */ 2217 switch (c) { 2218 case '\r': 2219 silentCarriageReturn(); 2220 break stateloop; 2221 case '\n': 2222 silentLineFeed(); 2223 // fall thru 2224 case ' ': 2225 case '\t': 2226 case '\u000C': 2227 /* 2228 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2229 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 2230 * in the after attribute name state. 2231 */ 2232 continue; 2233 case '/': 2234 /* 2235 * U+002F SOLIDUS (/) Switch to the self-closing 2236 * start tag state. 2237 */ 2238 addAttributeWithoutValue(); 2239 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 2240 continue stateloop; 2241 case '=': 2242 /* 2243 * U+003D EQUALS SIGN (=) Switch to the before 2244 * attribute value state. 2245 */ 2246 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); 2247 continue stateloop; 2248 case '>': 2249 /* 2250 * U+003E GREATER-THAN SIGN (>) Emit the current 2251 * tag token. 
2252 */ 2253 addAttributeWithoutValue(); 2254 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2255 if (shouldSuspend) { 2256 break stateloop; 2257 } 2258 /* 2259 * Switch to the data state. 2260 */ 2261 continue stateloop; 2262 case '\u0000': 2263 c = '\uFFFD'; 2264 // fall thru 2265 case '\"': 2266 case '\'': 2267 case '<': 2268 errQuoteOrLtInAttributeNameOrNull(c); 2269 /* 2270 * Treat it as per the "anything else" entry 2271 * below. 2272 */ 2273 default: 2274 addAttributeWithoutValue(); 2275 /* 2276 * Anything else Start a new attribute in the 2277 * current tag token. 2278 */ 2279 if (c >= 'A' && c <= 'Z') { 2280 /* 2281 * U+0041 LATIN CAPITAL LETTER A through to 2282 * U+005A LATIN CAPITAL LETTER Z Set that 2283 * attribute's name to the lowercase version 2284 * of the current input character (add 2285 * 0x0020 to the character's code point) 2286 */ 2287 c += 0x20; 2288 } 2289 /* 2290 * Set that attribute's name to the current 2291 * input character, 2292 */ 2293 clearStrBufAndAppend(c); 2294 /* 2295 * and its value to the empty string. 2296 */ 2297 // Will do later. 2298 /* 2299 * Switch to the attribute name state. 2300 */ 2301 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); 2302 continue stateloop; 2303 } 2304 } 2305 // XXX reorder point 2306 case MARKUP_DECLARATION_OPEN: 2307 markupdeclarationopenloop: for (;;) { 2308 if (++pos == endPos) { 2309 break stateloop; 2310 } 2311 c = checkChar(buf, pos); 2312 /* 2313 * If the next two characters are both U+002D 2314 * HYPHEN-MINUS characters (-), consume those two 2315 * characters, create a comment token whose data is the 2316 * empty string, and switch to the comment start state. 2317 * 2318 * Otherwise, if the next seven characters are an ASCII 2319 * case-insensitive match for the word "DOCTYPE", then 2320 * consume those characters and switch to the DOCTYPE 2321 * state. 
2322 * 2323 * Otherwise, if the insertion mode is 2324 * "in foreign content" and the current node is not an 2325 * element in the HTML namespace and the next seven 2326 * characters are an case-sensitive match for the string 2327 * "[CDATA[" (the five uppercase letters "CDATA" with a 2328 * U+005B LEFT SQUARE BRACKET character before and 2329 * after), then consume those characters and switch to 2330 * the CDATA section state. 2331 * 2332 * Otherwise, is is a parse error. Switch to the bogus 2333 * comment state. The next character that is consumed, 2334 * if any, is the first character that will be in the 2335 * comment. 2336 */ 2337 switch (c) { 2338 case '-': 2339 clearLongStrBufAndAppend(c); 2340 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos); 2341 break markupdeclarationopenloop; 2342 // continue stateloop; 2343 case 'd': 2344 case 'D': 2345 clearLongStrBufAndAppend(c); 2346 index = 0; 2347 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos); 2348 continue stateloop; 2349 case '[': 2350 if (tokenHandler.cdataSectionAllowed()) { 2351 clearLongStrBufAndAppend(c); 2352 index = 0; 2353 state = transition(state, Tokenizer.CDATA_START, reconsume, pos); 2354 continue stateloop; 2355 } 2356 // else fall through 2357 default: 2358 errBogusComment(); 2359 clearLongStrBuf(); 2360 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 2361 reconsume = true; 2362 continue stateloop; 2363 } 2364 } 2365 // FALLTHRU DON'T REORDER 2366 case MARKUP_DECLARATION_HYPHEN: 2367 markupdeclarationhyphenloop: for (;;) { 2368 if (++pos == endPos) { 2369 break stateloop; 2370 } 2371 c = checkChar(buf, pos); 2372 switch (c) { 2373 case '\u0000': 2374 break stateloop; 2375 case '-': 2376 clearLongStrBuf(); 2377 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos); 2378 break markupdeclarationhyphenloop; 2379 // continue stateloop; 2380 default: 2381 errBogusComment(); 2382 state = transition(state, 
Tokenizer.BOGUS_COMMENT, reconsume, pos); 2383 reconsume = true; 2384 continue stateloop; 2385 } 2386 } 2387 // FALLTHRU DON'T REORDER 2388 case COMMENT_START: 2389 commentstartloop: for (;;) { 2390 if (++pos == endPos) { 2391 break stateloop; 2392 } 2393 c = checkChar(buf, pos); 2394 /* 2395 * Comment start state 2396 * 2397 * 2398 * Consume the next input character: 2399 */ 2400 switch (c) { 2401 case '-': 2402 /* 2403 * U+002D HYPHEN-MINUS (-) Switch to the comment 2404 * start dash state. 2405 */ 2406 appendLongStrBuf(c); 2407 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos); 2408 continue stateloop; 2409 case '>': 2410 /* 2411 * U+003E GREATER-THAN SIGN (>) Parse error. 2412 */ 2413 errPrematureEndOfComment(); 2414 /* Emit the comment token. */ 2415 emitComment(0, pos); 2416 /* 2417 * Switch to the data state. 2418 */ 2419 state = transition(state, Tokenizer.DATA, reconsume, pos); 2420 continue stateloop; 2421 case '\r': 2422 appendLongStrBufCarriageReturn(); 2423 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2424 break stateloop; 2425 case '\n': 2426 appendLongStrBufLineFeed(); 2427 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2428 break commentstartloop; 2429 case '\u0000': 2430 c = '\uFFFD'; 2431 // fall thru 2432 default: 2433 /* 2434 * Anything else Append the input character to 2435 * the comment token's data. 2436 */ 2437 appendLongStrBuf(c); 2438 /* 2439 * Switch to the comment state. 
2440 */ 2441 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2442 break commentstartloop; 2443 // continue stateloop; 2444 } 2445 } 2446 // FALLTHRU DON'T REORDER 2447 case COMMENT: 2448 commentloop: for (;;) { 2449 if (++pos == endPos) { 2450 break stateloop; 2451 } 2452 c = checkChar(buf, pos); 2453 /* 2454 * Comment state Consume the next input character: 2455 */ 2456 switch (c) { 2457 case '-': 2458 /* 2459 * U+002D HYPHEN-MINUS (-) Switch to the comment 2460 * end dash state 2461 */ 2462 appendLongStrBuf(c); 2463 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); 2464 break commentloop; 2465 // continue stateloop; 2466 case '\r': 2467 appendLongStrBufCarriageReturn(); 2468 break stateloop; 2469 case '\n': 2470 appendLongStrBufLineFeed(); 2471 continue; 2472 case '\u0000': 2473 c = '\uFFFD'; 2474 // fall thru 2475 default: 2476 /* 2477 * Anything else Append the input character to 2478 * the comment token's data. 2479 */ 2480 appendLongStrBuf(c); 2481 /* 2482 * Stay in the comment state. 
2483 */ 2484 continue; 2485 } 2486 } 2487 // FALLTHRU DON'T REORDER 2488 case COMMENT_END_DASH: 2489 commentenddashloop: for (;;) { 2490 if (++pos == endPos) { 2491 break stateloop; 2492 } 2493 c = checkChar(buf, pos); 2494 /* 2495 * Comment end dash state Consume the next input 2496 * character: 2497 */ 2498 switch (c) { 2499 case '-': 2500 /* 2501 * U+002D HYPHEN-MINUS (-) Switch to the comment 2502 * end state 2503 */ 2504 appendLongStrBuf(c); 2505 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); 2506 break commentenddashloop; 2507 // continue stateloop; 2508 case '\r': 2509 appendLongStrBufCarriageReturn(); 2510 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2511 break stateloop; 2512 case '\n': 2513 appendLongStrBufLineFeed(); 2514 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2515 continue stateloop; 2516 case '\u0000': 2517 c = '\uFFFD'; 2518 // fall thru 2519 default: 2520 /* 2521 * Anything else Append a U+002D HYPHEN-MINUS 2522 * (-) character and the input character to the 2523 * comment token's data. 2524 */ 2525 appendLongStrBuf(c); 2526 /* 2527 * Switch to the comment state. 2528 */ 2529 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2530 continue stateloop; 2531 } 2532 } 2533 // FALLTHRU DON'T REORDER 2534 case COMMENT_END: 2535 commentendloop: for (;;) { 2536 if (++pos == endPos) { 2537 break stateloop; 2538 } 2539 c = checkChar(buf, pos); 2540 /* 2541 * Comment end dash state Consume the next input 2542 * character: 2543 */ 2544 switch (c) { 2545 case '>': 2546 /* 2547 * U+003E GREATER-THAN SIGN (>) Emit the comment 2548 * token. 2549 */ 2550 emitComment(2, pos); 2551 /* 2552 * Switch to the data state. 2553 */ 2554 state = transition(state, Tokenizer.DATA, reconsume, pos); 2555 continue stateloop; 2556 case '-': 2557 /* U+002D HYPHEN-MINUS (-) Parse error. */ 2558 /* 2559 * Append a U+002D HYPHEN-MINUS (-) character to 2560 * the comment token's data. 
2561 */ 2562 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c); 2563 /* 2564 * Stay in the comment end state. 2565 */ 2566 continue; 2567 case '\r': 2568 adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn(); 2569 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2570 break stateloop; 2571 case '\n': 2572 adjustDoubleHyphenAndAppendToLongStrBufLineFeed(); 2573 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2574 continue stateloop; 2575 case '!': 2576 errHyphenHyphenBang(); 2577 appendLongStrBuf(c); 2578 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos); 2579 continue stateloop; 2580 case '\u0000': 2581 c = '\uFFFD'; 2582 // fall thru 2583 default: 2584 /* 2585 * Append two U+002D HYPHEN-MINUS (-) characters 2586 * and the input character to the comment 2587 * token's data. 2588 */ 2589 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c); 2590 /* 2591 * Switch to the comment state. 2592 */ 2593 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2594 continue stateloop; 2595 } 2596 } 2597 // XXX reorder point 2598 case COMMENT_END_BANG: 2599 for (;;) { 2600 if (++pos == endPos) { 2601 break stateloop; 2602 } 2603 c = checkChar(buf, pos); 2604 /* 2605 * Comment end bang state 2606 * 2607 * Consume the next input character: 2608 */ 2609 switch (c) { 2610 case '>': 2611 /* 2612 * U+003E GREATER-THAN SIGN (>) Emit the comment 2613 * token. 2614 */ 2615 emitComment(3, pos); 2616 /* 2617 * Switch to the data state. 2618 */ 2619 state = transition(state, Tokenizer.DATA, reconsume, pos); 2620 continue stateloop; 2621 case '-': 2622 /* 2623 * Append two U+002D HYPHEN-MINUS (-) characters 2624 * and a U+0021 EXCLAMATION MARK (!) character 2625 * to the comment token's data. 2626 */ 2627 appendLongStrBuf(c); 2628 /* 2629 * Switch to the comment end dash state. 
2630 */ 2631 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); 2632 continue stateloop; 2633 case '\r': 2634 appendLongStrBufCarriageReturn(); 2635 break stateloop; 2636 case '\n': 2637 appendLongStrBufLineFeed(); 2638 continue; 2639 case '\u0000': 2640 c = '\uFFFD'; 2641 // fall thru 2642 default: 2643 /* 2644 * Anything else Append two U+002D HYPHEN-MINUS 2645 * (-) characters, a U+0021 EXCLAMATION MARK (!) 2646 * character, and the input character to the 2647 * comment token's data. Switch to the comment 2648 * state. 2649 */ 2650 appendLongStrBuf(c); 2651 /* 2652 * Switch to the comment state. 2653 */ 2654 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2655 continue stateloop; 2656 } 2657 } 2658 // XXX reorder point 2659 case COMMENT_START_DASH: 2660 if (++pos == endPos) { 2661 break stateloop; 2662 } 2663 c = checkChar(buf, pos); 2664 /* 2665 * Comment start dash state 2666 * 2667 * Consume the next input character: 2668 */ 2669 switch (c) { 2670 case '-': 2671 /* 2672 * U+002D HYPHEN-MINUS (-) Switch to the comment end 2673 * state 2674 */ 2675 appendLongStrBuf(c); 2676 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); 2677 continue stateloop; 2678 case '>': 2679 errPrematureEndOfComment(); 2680 /* Emit the comment token. */ 2681 emitComment(1, pos); 2682 /* 2683 * Switch to the data state. 2684 */ 2685 state = transition(state, Tokenizer.DATA, reconsume, pos); 2686 continue stateloop; 2687 case '\r': 2688 appendLongStrBufCarriageReturn(); 2689 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2690 break stateloop; 2691 case '\n': 2692 appendLongStrBufLineFeed(); 2693 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2694 continue stateloop; 2695 case '\u0000': 2696 c = '\uFFFD'; 2697 // fall thru 2698 default: 2699 /* 2700 * Append a U+002D HYPHEN-MINUS character (-) and 2701 * the current input character to the comment 2702 * token's data. 
2703 */ 2704 appendLongStrBuf(c); 2705 /* 2706 * Switch to the comment state. 2707 */ 2708 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2709 continue stateloop; 2710 } 2711 // XXX reorder point 2712 case CDATA_START: 2713 for (;;) { 2714 if (++pos == endPos) { 2715 break stateloop; 2716 } 2717 c = checkChar(buf, pos); 2718 if (index < 6) { // CDATA_LSQB.length 2719 if (c == Tokenizer.CDATA_LSQB[index]) { 2720 appendLongStrBuf(c); 2721 } else { 2722 errBogusComment(); 2723 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 2724 reconsume = true; 2725 continue stateloop; 2726 } 2727 index++; 2728 continue; 2729 } else { 2730 cstart = pos; // start coalescing 2731 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 2732 reconsume = true; 2733 break; // FALL THROUGH continue stateloop; 2734 } 2735 } 2736 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 2737 case CDATA_SECTION: 2738 cdatasectionloop: for (;;) { 2739 if (reconsume) { 2740 reconsume = false; 2741 } else { 2742 if (++pos == endPos) { 2743 break stateloop; 2744 } 2745 c = checkChar(buf, pos); 2746 } 2747 switch (c) { 2748 case ']': 2749 flushChars(buf, pos); 2750 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos); 2751 break cdatasectionloop; // FALL THROUGH 2752 case '\u0000': 2753 emitReplacementCharacter(buf, pos); 2754 continue; 2755 case '\r': 2756 emitCarriageReturn(buf, pos); 2757 break stateloop; 2758 case '\n': 2759 silentLineFeed(); 2760 // fall thru 2761 default: 2762 continue; 2763 } 2764 } 2765 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 2766 case CDATA_RSQB: 2767 cdatarsqb: for (;;) { 2768 if (++pos == endPos) { 2769 break stateloop; 2770 } 2771 c = checkChar(buf, pos); 2772 switch (c) { 2773 case ']': 2774 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos); 2775 break cdatarsqb; 2776 default: 2777 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2778 1); 2779 cstart = pos; 2780 state = transition(state, 
Tokenizer.CDATA_SECTION, reconsume, pos); 2781 reconsume = true; 2782 continue stateloop; 2783 } 2784 } 2785 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 2786 case CDATA_RSQB_RSQB: 2787 if (++pos == endPos) { 2788 break stateloop; 2789 } 2790 c = checkChar(buf, pos); 2791 switch (c) { 2792 case '>': 2793 cstart = pos + 1; 2794 state = transition(state, Tokenizer.DATA, reconsume, pos); 2795 continue stateloop; 2796 default: 2797 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); 2798 cstart = pos; 2799 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 2800 reconsume = true; 2801 continue stateloop; 2802 2803 } 2804 // XXX reorder point 2805 case ATTRIBUTE_VALUE_SINGLE_QUOTED: 2806 attributevaluesinglequotedloop: for (;;) { 2807 if (reconsume) { 2808 reconsume = false; 2809 } else { 2810 if (++pos == endPos) { 2811 break stateloop; 2812 } 2813 c = checkChar(buf, pos); 2814 } 2815 /* 2816 * Consume the next input character: 2817 */ 2818 switch (c) { 2819 case '\'': 2820 /* 2821 * U+0027 APOSTROPHE (') Switch to the after 2822 * attribute value (quoted) state. 2823 */ 2824 addAttributeWithValue(); 2825 2826 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); 2827 continue stateloop; 2828 case '&': 2829 /* 2830 * U+0026 AMPERSAND (&) Switch to the character 2831 * reference in attribute value state, with the 2832 * + additional allowed character being U+0027 2833 * APOSTROPHE ('). 
2834 */ 2835 clearStrBufAndAppend(c); 2836 setAdditionalAndRememberAmpersandLocation('\''); 2837 returnState = state; 2838 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 2839 break attributevaluesinglequotedloop; 2840 // continue stateloop; 2841 case '\r': 2842 appendLongStrBufCarriageReturn(); 2843 break stateloop; 2844 case '\n': 2845 appendLongStrBufLineFeed(); 2846 continue; 2847 case '\u0000': 2848 c = '\uFFFD'; 2849 // fall thru 2850 default: 2851 /* 2852 * Anything else Append the current input 2853 * character to the current attribute's value. 2854 */ 2855 appendLongStrBuf(c); 2856 /* 2857 * Stay in the attribute value (double-quoted) 2858 * state. 2859 */ 2860 continue; 2861 } 2862 } 2863 // FALLTHRU DON'T REORDER 2864 case CONSUME_CHARACTER_REFERENCE: 2865 if (++pos == endPos) { 2866 break stateloop; 2867 } 2868 c = checkChar(buf, pos); 2869 if (c == '\u0000') { 2870 break stateloop; 2871 } 2872 /* 2873 * Unlike the definition is the spec, this state does not 2874 * return a value and never requires the caller to 2875 * backtrack. This state takes care of emitting characters 2876 * or appending to the current attribute value. It also 2877 * takes care of that in the case when consuming the 2878 * character reference fails. 2879 */ 2880 /* 2881 * This section defines how to consume a character 2882 * reference. This definition is used when parsing character 2883 * references in text and in attributes. 2884 * 2885 * The behavior depends on the identity of the next 2886 * character (the one immediately after the U+0026 AMPERSAND 2887 * character): 2888 */ 2889 switch (c) { 2890 case ' ': 2891 case '\t': 2892 case '\n': 2893 case '\r': // we'll reconsume! 
2894 case '\u000C': 2895 case '<': 2896 case '&': 2897 emitOrAppendStrBuf(returnState); 2898 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 2899 cstart = pos; 2900 } 2901 state = transition(state, returnState, reconsume, pos); 2902 reconsume = true; 2903 continue stateloop; 2904 case '#': 2905 /* 2906 * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER 2907 * SIGN. 2908 */ 2909 appendStrBuf('#'); 2910 state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos); 2911 continue stateloop; 2912 default: 2913 if (c == additional) { 2914 emitOrAppendStrBuf(returnState); 2915 state = transition(state, returnState, reconsume, pos); 2916 reconsume = true; 2917 continue stateloop; 2918 } 2919 if (c >= 'a' && c <= 'z') { 2920 firstCharKey = c - 'a' + 26; 2921 } else if (c >= 'A' && c <= 'Z') { 2922 firstCharKey = c - 'A'; 2923 } else { 2924 // No match 2925 /* 2926 * If no match can be made, then this is a parse 2927 * error. 2928 */ 2929 errNoNamedCharacterMatch(); 2930 emitOrAppendStrBuf(returnState); 2931 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 2932 cstart = pos; 2933 } 2934 state = transition(state, returnState, reconsume, pos); 2935 reconsume = true; 2936 continue stateloop; 2937 } 2938 // Didn't fail yet 2939 appendStrBuf(c); 2940 state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos); 2941 // FALL THROUGH continue stateloop; 2942 } 2943 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 2944 case CHARACTER_REFERENCE_HILO_LOOKUP: 2945 { 2946 if (++pos == endPos) { 2947 break stateloop; 2948 } 2949 c = checkChar(buf, pos); 2950 if (c == '\u0000') { 2951 break stateloop; 2952 } 2953 /* 2954 * The data structure is as follows: 2955 * 2956 * HILO_ACCEL is a two-dimensional int array whose major 2957 * index corresponds to the second character of the 2958 * character reference (code point as index) and the 2959 * minor index corresponds to the first character of the 2960 * character reference (packed so that A-Z runs from 0 2961 * 
to 25 and a-z runs from 26 to 51). This layout makes 2962 * it easier to use the sparseness of the data structure 2963 * to omit parts of it: The second dimension of the 2964 * table is null when no character reference starts with 2965 * the character corresponding to that row. 2966 * 2967 * The int value HILO_ACCEL (by these indeces) is zero 2968 * if there exists no character reference starting with 2969 * that two-letter prefix. Otherwise, the value is an 2970 * int that packs two shorts so that the higher short is 2971 * the index of the highest character reference name 2972 * with that prefix in NAMES and the lower short 2973 * corresponds to the index of the lowest character 2974 * reference name with that prefix. (It happens that the 2975 * first two character reference names share their 2976 * prefix so the packed int cannot be 0 by packing the 2977 * two shorts.) 2978 * 2979 * NAMES is an array of byte arrays where each byte 2980 * array encodes the name of a character references as 2981 * ASCII. The names omit the first two letters of the 2982 * name. (Since storing the first two letters would be 2983 * redundant with the data contained in HILO_ACCEL.) The 2984 * entries are lexically sorted. 2985 * 2986 * For a given index in NAMES, the same index in VALUES 2987 * contains the corresponding expansion as an array of 2988 * two UTF-16 code units (either the character and 2989 * U+0000 or a suggogate pair). 2990 */ 2991 int hilo = 0; 2992 if (c <= 'z') { 2993 @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c]; 2994 if (row != null) { 2995 hilo = row[firstCharKey]; 2996 } 2997 } 2998 if (hilo == 0) { 2999 /* 3000 * If no match can be made, then this is a parse 3001 * error. 
3002 */ 3003 errNoNamedCharacterMatch(); 3004 emitOrAppendStrBuf(returnState); 3005 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3006 cstart = pos; 3007 } 3008 state = transition(state, returnState, reconsume, pos); 3009 reconsume = true; 3010 continue stateloop; 3011 } 3012 // Didn't fail yet 3013 appendStrBuf(c); 3014 lo = hilo & 0xFFFF; 3015 hi = hilo >> 16; 3016 entCol = -1; 3017 candidate = -1; 3018 strBufMark = 0; 3019 state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos); 3020 // FALL THROUGH continue stateloop; 3021 } 3022 case CHARACTER_REFERENCE_TAIL: 3023 outer: for (;;) { 3024 if (++pos == endPos) { 3025 break stateloop; 3026 } 3027 c = checkChar(buf, pos); 3028 if (c == '\u0000') { 3029 break stateloop; 3030 } 3031 entCol++; 3032 /* 3033 * Consume the maximum number of characters possible, 3034 * with the consumed characters matching one of the 3035 * identifiers in the first column of the named 3036 * character references table (in a case-sensitive 3037 * manner). 
3038 */ 3039 loloop: for (;;) { 3040 if (hi < lo) { 3041 break outer; 3042 } 3043 if (entCol == NamedCharacters.NAMES[lo].length()) { 3044 candidate = lo; 3045 strBufMark = strBufLen; 3046 lo++; 3047 } else if (entCol > NamedCharacters.NAMES[lo].length()) { 3048 break outer; 3049 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { 3050 lo++; 3051 } else { 3052 break loloop; 3053 } 3054 } 3055 3056 hiloop: for (;;) { 3057 if (hi < lo) { 3058 break outer; 3059 } 3060 if (entCol == NamedCharacters.NAMES[hi].length()) { 3061 break hiloop; 3062 } 3063 if (entCol > NamedCharacters.NAMES[hi].length()) { 3064 break outer; 3065 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { 3066 hi--; 3067 } else { 3068 break hiloop; 3069 } 3070 } 3071 3072 if (hi < lo) { 3073 break outer; 3074 } 3075 appendStrBuf(c); 3076 continue; 3077 } 3078 3079 if (candidate == -1) { 3080 // reconsume deals with CR, LF or nul 3081 /* 3082 * If no match can be made, then this is a parse error. 3083 */ 3084 errNoNamedCharacterMatch(); 3085 emitOrAppendStrBuf(returnState); 3086 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3087 cstart = pos; 3088 } 3089 state = transition(state, returnState, reconsume, pos); 3090 reconsume = true; 3091 continue stateloop; 3092 } else { 3093 // c can't be CR, LF or nul if we got here 3094 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; 3095 if (candidateName.length() == 0 3096 || candidateName.charAt(candidateName.length() - 1) != ';') { 3097 /* 3098 * If the last character matched is not a U+003B 3099 * SEMICOLON (;), there is a parse error. 
3100 */ 3101 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3102 /* 3103 * If the entity is being consumed as part of an 3104 * attribute, and the last character matched is 3105 * not a U+003B SEMICOLON (;), 3106 */ 3107 char ch; 3108 if (strBufMark == strBufLen) { 3109 ch = c; 3110 } else { 3111 // if (strBufOffset != -1) { 3112 // ch = buf[strBufOffset + strBufMark]; 3113 // } else { 3114 ch = strBuf[strBufMark]; 3115 // } 3116 } 3117 if (ch == '=' || (ch >= '0' && ch <= '9') 3118 || (ch >= 'A' && ch <= 'Z') 3119 || (ch >= 'a' && ch <= 'z')) { 3120 /* 3121 * and the next character is either a U+003D 3122 * EQUALS SIGN character (=) or in the range 3123 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, 3124 * U+0041 LATIN CAPITAL LETTER A to U+005A 3125 * LATIN CAPITAL LETTER Z, or U+0061 LATIN 3126 * SMALL LETTER A to U+007A LATIN SMALL 3127 * LETTER Z, then, for historical reasons, 3128 * all the characters that were matched 3129 * after the U+0026 AMPERSAND (&) must be 3130 * unconsumed, and nothing is returned. 3131 */ 3132 errNoNamedCharacterMatch(); 3133 appendStrBufToLongStrBuf(); 3134 state = transition(state, returnState, reconsume, pos); 3135 reconsume = true; 3136 continue stateloop; 3137 } 3138 } 3139 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3140 errUnescapedAmpersandInterpretedAsCharacterReference(); 3141 } else { 3142 errNotSemicolonTerminated(); 3143 } 3144 } 3145 3146 /* 3147 * Otherwise, return a character token for the character 3148 * corresponding to the entity name (as given by the 3149 * second column of the named character references 3150 * table). 3151 */ 3152 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; 3153 if ( 3154 // [NOCPP[ 3155 val.length == 1 3156 // ]NOCPP] 3157 // CPPONLY: val[1] == 0 3158 ) { 3159 emitOrAppendOne(val, returnState); 3160 } else { 3161 emitOrAppendTwo(val, returnState); 3162 } 3163 // this is so complicated! 
3164 if (strBufMark < strBufLen) { 3165 // if (strBufOffset != -1) { 3166 // if ((returnState & (~1)) != 0) { 3167 // for (int i = strBufMark; i < strBufLen; i++) { 3168 // appendLongStrBuf(buf[strBufOffset + i]); 3169 // } 3170 // } else { 3171 // tokenHandler.characters(buf, strBufOffset 3172 // + strBufMark, strBufLen 3173 // - strBufMark); 3174 // } 3175 // } else { 3176 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3177 for (int i = strBufMark; i < strBufLen; i++) { 3178 appendLongStrBuf(strBuf[i]); 3179 } 3180 } else { 3181 tokenHandler.characters(strBuf, strBufMark, 3182 strBufLen - strBufMark); 3183 } 3184 // } 3185 } 3186 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3187 cstart = pos; 3188 } 3189 state = transition(state, returnState, reconsume, pos); 3190 reconsume = true; 3191 continue stateloop; 3192 /* 3193 * If the markup contains I'm ¬it; I tell you, the 3194 * entity is parsed as "not", as in, I'm ¬it; I tell 3195 * you. But if the markup was I'm ∉ I tell you, 3196 * the entity would be parsed as "notin;", resulting in 3197 * I'm ∉ I tell you. 3198 */ 3199 } 3200 // XXX reorder point 3201 case CONSUME_NCR: 3202 if (++pos == endPos) { 3203 break stateloop; 3204 } 3205 c = checkChar(buf, pos); 3206 prevValue = -1; 3207 value = 0; 3208 seenDigits = false; 3209 /* 3210 * The behavior further depends on the character after the 3211 * U+0023 NUMBER SIGN: 3212 */ 3213 switch (c) { 3214 case 'x': 3215 case 'X': 3216 3217 /* 3218 * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL 3219 * LETTER X Consume the X. 3220 * 3221 * Follow the steps below, but using the range of 3222 * characters U+0030 DIGIT ZERO through to U+0039 3223 * DIGIT NINE, U+0061 LATIN SMALL LETTER A through 3224 * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN 3225 * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL 3226 * LETTER F (in other words, 0-9, A-F, a-f). 3227 * 3228 * When it comes to interpreting the number, 3229 * interpret it as a hexadecimal number. 
3230 */ 3231 appendStrBuf(c); 3232 state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos); 3233 continue stateloop; 3234 default: 3235 /* 3236 * Anything else Follow the steps below, but using 3237 * the range of characters U+0030 DIGIT ZERO through 3238 * to U+0039 DIGIT NINE (i.e. just 0-9). 3239 * 3240 * When it comes to interpreting the number, 3241 * interpret it as a decimal number. 3242 */ 3243 state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos); 3244 reconsume = true; 3245 // FALL THROUGH continue stateloop; 3246 } 3247 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 3248 case DECIMAL_NRC_LOOP: 3249 decimalloop: for (;;) { 3250 if (reconsume) { 3251 reconsume = false; 3252 } else { 3253 if (++pos == endPos) { 3254 break stateloop; 3255 } 3256 c = checkChar(buf, pos); 3257 } 3258 // Deal with overflow gracefully 3259 if (value < prevValue) { 3260 value = 0x110000; // Value above Unicode range but 3261 // within int 3262 // range 3263 } 3264 prevValue = value; 3265 /* 3266 * Consume as many characters as match the range of 3267 * characters given above. 3268 */ 3269 if (c >= '0' && c <= '9') { 3270 seenDigits = true; 3271 value *= 10; 3272 value += c - '0'; 3273 continue; 3274 } else if (c == ';') { 3275 if (seenDigits) { 3276 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3277 cstart = pos + 1; 3278 } 3279 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3280 // FALL THROUGH continue stateloop; 3281 break decimalloop; 3282 } else { 3283 errNoDigitsInNCR(); 3284 appendStrBuf(';'); 3285 emitOrAppendStrBuf(returnState); 3286 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3287 cstart = pos + 1; 3288 } 3289 state = transition(state, returnState, reconsume, pos); 3290 continue stateloop; 3291 } 3292 } else { 3293 /* 3294 * If no characters match the range, then don't 3295 * consume any characters (and unconsume the U+0023 3296 * NUMBER SIGN character and, if appropriate, the X 3297 * character). 
This is a parse error; nothing is 3298 * returned. 3299 * 3300 * Otherwise, if the next character is a U+003B 3301 * SEMICOLON, consume that too. If it isn't, there 3302 * is a parse error. 3303 */ 3304 if (!seenDigits) { 3305 errNoDigitsInNCR(); 3306 emitOrAppendStrBuf(returnState); 3307 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3308 cstart = pos; 3309 } 3310 state = transition(state, returnState, reconsume, pos); 3311 reconsume = true; 3312 continue stateloop; 3313 } else { 3314 errCharRefLacksSemicolon(); 3315 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3316 cstart = pos; 3317 } 3318 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3319 reconsume = true; 3320 // FALL THROUGH continue stateloop; 3321 break decimalloop; 3322 } 3323 } 3324 } 3325 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 3326 case HANDLE_NCR_VALUE: 3327 // WARNING previous state sets reconsume 3328 // XXX inline this case if the method size can take it 3329 handleNcrValue(returnState); 3330 state = transition(state, returnState, reconsume, pos); 3331 continue stateloop; 3332 // XXX reorder point 3333 case HEX_NCR_LOOP: 3334 for (;;) { 3335 if (++pos == endPos) { 3336 break stateloop; 3337 } 3338 c = checkChar(buf, pos); 3339 // Deal with overflow gracefully 3340 if (value < prevValue) { 3341 value = 0x110000; // Value above Unicode range but 3342 // within int 3343 // range 3344 } 3345 prevValue = value; 3346 /* 3347 * Consume as many characters as match the range of 3348 * characters given above. 
3349 */ 3350 if (c >= '0' && c <= '9') { 3351 seenDigits = true; 3352 value *= 16; 3353 value += c - '0'; 3354 continue; 3355 } else if (c >= 'A' && c <= 'F') { 3356 seenDigits = true; 3357 value *= 16; 3358 value += c - 'A' + 10; 3359 continue; 3360 } else if (c >= 'a' && c <= 'f') { 3361 seenDigits = true; 3362 value *= 16; 3363 value += c - 'a' + 10; 3364 continue; 3365 } else if (c == ';') { 3366 if (seenDigits) { 3367 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3368 cstart = pos + 1; 3369 } 3370 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3371 continue stateloop; 3372 } else { 3373 errNoDigitsInNCR(); 3374 appendStrBuf(';'); 3375 emitOrAppendStrBuf(returnState); 3376 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3377 cstart = pos + 1; 3378 } 3379 state = transition(state, returnState, reconsume, pos); 3380 continue stateloop; 3381 } 3382 } else { 3383 /* 3384 * If no characters match the range, then don't 3385 * consume any characters (and unconsume the U+0023 3386 * NUMBER SIGN character and, if appropriate, the X 3387 * character). This is a parse error; nothing is 3388 * returned. 3389 * 3390 * Otherwise, if the next character is a U+003B 3391 * SEMICOLON, consume that too. If it isn't, there 3392 * is a parse error. 
3393 */ 3394 if (!seenDigits) { 3395 errNoDigitsInNCR(); 3396 emitOrAppendStrBuf(returnState); 3397 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3398 cstart = pos; 3399 } 3400 state = transition(state, returnState, reconsume, pos); 3401 reconsume = true; 3402 continue stateloop; 3403 } else { 3404 errCharRefLacksSemicolon(); 3405 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3406 cstart = pos; 3407 } 3408 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3409 reconsume = true; 3410 continue stateloop; 3411 } 3412 } 3413 } 3414 // XXX reorder point 3415 case PLAINTEXT: 3416 plaintextloop: for (;;) { 3417 if (reconsume) { 3418 reconsume = false; 3419 } else { 3420 if (++pos == endPos) { 3421 break stateloop; 3422 } 3423 c = checkChar(buf, pos); 3424 } 3425 switch (c) { 3426 case '\u0000': 3427 emitPlaintextReplacementCharacter(buf, pos); 3428 continue; 3429 case '\r': 3430 emitCarriageReturn(buf, pos); 3431 break stateloop; 3432 case '\n': 3433 silentLineFeed(); 3434 default: 3435 /* 3436 * Anything else Emit the current input 3437 * character as a character token. Stay in the 3438 * RAWTEXT state. 3439 */ 3440 continue; 3441 } 3442 } 3443 // XXX reorder point 3444 case CLOSE_TAG_OPEN: 3445 if (++pos == endPos) { 3446 break stateloop; 3447 } 3448 c = checkChar(buf, pos); 3449 /* 3450 * Otherwise, if the content model flag is set to the PCDATA 3451 * state, or if the next few characters do match that tag 3452 * name, consume the next input character: 3453 */ 3454 switch (c) { 3455 case '>': 3456 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 3457 errLtSlashGt(); 3458 /* 3459 * Switch to the data state. 3460 */ 3461 cstart = pos + 1; 3462 state = transition(state, Tokenizer.DATA, reconsume, pos); 3463 continue stateloop; 3464 case '\r': 3465 silentCarriageReturn(); 3466 /* Anything else Parse error. */ 3467 errGarbageAfterLtSlash(); 3468 /* 3469 * Switch to the bogus comment state. 
3470 */ 3471 clearLongStrBufAndAppend('\n'); 3472 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3473 break stateloop; 3474 case '\n': 3475 silentLineFeed(); 3476 /* Anything else Parse error. */ 3477 errGarbageAfterLtSlash(); 3478 /* 3479 * Switch to the bogus comment state. 3480 */ 3481 clearLongStrBufAndAppend('\n'); 3482 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3483 continue stateloop; 3484 case '\u0000': 3485 c = '\uFFFD'; 3486 // fall thru 3487 default: 3488 if (c >= 'A' && c <= 'Z') { 3489 c += 0x20; 3490 } 3491 if (c >= 'a' && c <= 'z') { 3492 /* 3493 * U+0061 LATIN SMALL LETTER A through to U+007A 3494 * LATIN SMALL LETTER Z Create a new end tag 3495 * token, 3496 */ 3497 endTag = true; 3498 /* 3499 * set its tag name to the input character, 3500 */ 3501 clearStrBufAndAppend(c); 3502 /* 3503 * then switch to the tag name state. (Don't 3504 * emit the token yet; further details will be 3505 * filled in before it is emitted.) 3506 */ 3507 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 3508 continue stateloop; 3509 } else { 3510 /* Anything else Parse error. */ 3511 errGarbageAfterLtSlash(); 3512 /* 3513 * Switch to the bogus comment state. 3514 */ 3515 clearLongStrBufAndAppend(c); 3516 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3517 continue stateloop; 3518 } 3519 } 3520 // XXX reorder point 3521 case RCDATA: 3522 rcdataloop: for (;;) { 3523 if (reconsume) { 3524 reconsume = false; 3525 } else { 3526 if (++pos == endPos) { 3527 break stateloop; 3528 } 3529 c = checkChar(buf, pos); 3530 } 3531 switch (c) { 3532 case '&': 3533 /* 3534 * U+0026 AMPERSAND (&) Switch to the character 3535 * reference in RCDATA state. 
3536 */ 3537 flushChars(buf, pos); 3538 clearStrBufAndAppend(c); 3539 additional = '\u0000'; 3540 returnState = state; 3541 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 3542 continue stateloop; 3543 case '<': 3544 /* 3545 * U+003C LESS-THAN SIGN (<) Switch to the 3546 * RCDATA less-than sign state. 3547 */ 3548 flushChars(buf, pos); 3549 3550 returnState = state; 3551 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); 3552 continue stateloop; 3553 case '\u0000': 3554 emitReplacementCharacter(buf, pos); 3555 continue; 3556 case '\r': 3557 emitCarriageReturn(buf, pos); 3558 break stateloop; 3559 case '\n': 3560 silentLineFeed(); 3561 default: 3562 /* 3563 * Emit the current input character as a 3564 * character token. Stay in the RCDATA state. 3565 */ 3566 continue; 3567 } 3568 } 3569 // XXX reorder point 3570 case RAWTEXT: 3571 rawtextloop: for (;;) { 3572 if (reconsume) { 3573 reconsume = false; 3574 } else { 3575 if (++pos == endPos) { 3576 break stateloop; 3577 } 3578 c = checkChar(buf, pos); 3579 } 3580 switch (c) { 3581 case '<': 3582 /* 3583 * U+003C LESS-THAN SIGN (<) Switch to the 3584 * RAWTEXT less-than sign state. 3585 */ 3586 flushChars(buf, pos); 3587 3588 returnState = state; 3589 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); 3590 break rawtextloop; 3591 // FALL THRU continue stateloop; 3592 case '\u0000': 3593 emitReplacementCharacter(buf, pos); 3594 continue; 3595 case '\r': 3596 emitCarriageReturn(buf, pos); 3597 break stateloop; 3598 case '\n': 3599 silentLineFeed(); 3600 default: 3601 /* 3602 * Emit the current input character as a 3603 * character token. Stay in the RAWTEXT state. 
3604 */ 3605 continue; 3606 } 3607 } 3608 // XXX fallthru don't reorder 3609 case RAWTEXT_RCDATA_LESS_THAN_SIGN: 3610 rawtextrcdatalessthansignloop: for (;;) { 3611 if (++pos == endPos) { 3612 break stateloop; 3613 } 3614 c = checkChar(buf, pos); 3615 switch (c) { 3616 case '/': 3617 /* 3618 * U+002F SOLIDUS (/) Set the temporary buffer 3619 * to the empty string. Switch to the script 3620 * data end tag open state. 3621 */ 3622 index = 0; 3623 clearStrBuf(); 3624 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 3625 break rawtextrcdatalessthansignloop; 3626 // FALL THRU continue stateloop; 3627 default: 3628 /* 3629 * Otherwise, emit a U+003C LESS-THAN SIGN 3630 * character token 3631 */ 3632 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 3633 /* 3634 * and reconsume the current input character in 3635 * the data state. 3636 */ 3637 cstart = pos; 3638 state = transition(state, returnState, reconsume, pos); 3639 reconsume = true; 3640 continue stateloop; 3641 } 3642 } 3643 // XXX fall thru. don't reorder. 3644 case NON_DATA_END_TAG_NAME: 3645 for (;;) { 3646 if (++pos == endPos) { 3647 break stateloop; 3648 } 3649 c = checkChar(buf, pos); 3650 /* 3651 * ASSERT! when entering this state, set index to 0 and 3652 * call clearStrBuf() assert (contentModelElement != 3653 * null); Let's implement the above without lookahead. 3654 * strBuf is the 'temporary buffer'. 
3655 */ 3656 if (index < endTagExpectationAsArray.length) { 3657 char e = endTagExpectationAsArray[index]; 3658 char folded = c; 3659 if (c >= 'A' && c <= 'Z') { 3660 folded += 0x20; 3661 } 3662 if (folded != e) { 3663 // [NOCPP[ 3664 errHtml4LtSlashInRcdata(folded); 3665 // ]NOCPP] 3666 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 3667 0, 2); 3668 emitStrBuf(); 3669 cstart = pos; 3670 state = transition(state, returnState, reconsume, pos); 3671 reconsume = true; 3672 continue stateloop; 3673 } 3674 appendStrBuf(c); 3675 index++; 3676 continue; 3677 } else { 3678 endTag = true; 3679 // XXX replace contentModelElement with different 3680 // type 3681 tagName = endTagExpectation; 3682 switch (c) { 3683 case '\r': 3684 silentCarriageReturn(); 3685 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 3686 break stateloop; 3687 case '\n': 3688 silentLineFeed(); 3689 // fall thru 3690 case ' ': 3691 case '\t': 3692 case '\u000C': 3693 /* 3694 * U+0009 CHARACTER TABULATION U+000A LINE 3695 * FEED (LF) U+000C FORM FEED (FF) U+0020 3696 * SPACE If the current end tag token is an 3697 * appropriate end tag token, then switch to 3698 * the before attribute name state. 3699 */ 3700 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 3701 continue stateloop; 3702 case '/': 3703 /* 3704 * U+002F SOLIDUS (/) If the current end tag 3705 * token is an appropriate end tag token, 3706 * then switch to the self-closing start tag 3707 * state. 3708 */ 3709 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 3710 continue stateloop; 3711 case '>': 3712 /* 3713 * U+003E GREATER-THAN SIGN (>) If the 3714 * current end tag token is an appropriate 3715 * end tag token, then emit the current tag 3716 * token and switch to the data state. 
3717 */ 3718 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 3719 if (shouldSuspend) { 3720 break stateloop; 3721 } 3722 continue stateloop; 3723 default: 3724 /* 3725 * Emit a U+003C LESS-THAN SIGN character 3726 * token, a U+002F SOLIDUS character token, 3727 * a character token for each of the 3728 * characters in the temporary buffer (in 3729 * the order they were added to the buffer), 3730 * and reconsume the current input character 3731 * in the RAWTEXT state. 3732 */ 3733 // [NOCPP[ 3734 errWarnLtSlashInRcdata(); 3735 // ]NOCPP] 3736 tokenHandler.characters( 3737 Tokenizer.LT_SOLIDUS, 0, 2); 3738 emitStrBuf(); 3739 if (c == '\u0000') { 3740 emitReplacementCharacter(buf, pos); 3741 } else { 3742 cstart = pos; // don't drop the 3743 // character 3744 } 3745 state = transition(state, returnState, reconsume, pos); 3746 continue stateloop; 3747 } 3748 } 3749 } 3750 // XXX reorder point 3751 // BEGIN HOTSPOT WORKAROUND 3752 case BOGUS_COMMENT: 3753 boguscommentloop: for (;;) { 3754 if (reconsume) { 3755 reconsume = false; 3756 } else { 3757 if (++pos == endPos) { 3758 break stateloop; 3759 } 3760 c = checkChar(buf, pos); 3761 } 3762 /* 3763 * Consume every character up to and including the first 3764 * U+003E GREATER-THAN SIGN character (>) or the end of 3765 * the file (EOF), whichever comes first. Emit a comment 3766 * token whose data is the concatenation of all the 3767 * characters starting from and including the character 3768 * that caused the state machine to switch into the 3769 * bogus comment state, up to and including the 3770 * character immediately before the last consumed 3771 * character (i.e. up to the character just before the 3772 * U+003E or EOF character). (If the comment was started 3773 * by the end of the file (EOF), the token is empty.) 3774 * 3775 * Switch to the data state. 3776 * 3777 * If the end of the file was reached, reconsume the EOF 3778 * character. 
3779 */ 3780 switch (c) { 3781 case '>': 3782 emitComment(0, pos); 3783 state = transition(state, Tokenizer.DATA, reconsume, pos); 3784 continue stateloop; 3785 case '-': 3786 appendLongStrBuf(c); 3787 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos); 3788 break boguscommentloop; 3789 case '\r': 3790 appendLongStrBufCarriageReturn(); 3791 break stateloop; 3792 case '\n': 3793 appendLongStrBufLineFeed(); 3794 continue; 3795 case '\u0000': 3796 c = '\uFFFD'; 3797 // fall thru 3798 default: 3799 appendLongStrBuf(c); 3800 continue; 3801 } 3802 } 3803 // FALLTHRU DON'T REORDER 3804 case BOGUS_COMMENT_HYPHEN: 3805 boguscommenthyphenloop: for (;;) { 3806 if (++pos == endPos) { 3807 break stateloop; 3808 } 3809 c = checkChar(buf, pos); 3810 switch (c) { 3811 case '>': 3812 // [NOCPP[ 3813 maybeAppendSpaceToBogusComment(); 3814 // ]NOCPP] 3815 emitComment(0, pos); 3816 state = transition(state, Tokenizer.DATA, reconsume, pos); 3817 continue stateloop; 3818 case '-': 3819 appendSecondHyphenToBogusComment(); 3820 continue boguscommenthyphenloop; 3821 case '\r': 3822 appendLongStrBufCarriageReturn(); 3823 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3824 break stateloop; 3825 case '\n': 3826 appendLongStrBufLineFeed(); 3827 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3828 continue stateloop; 3829 case '\u0000': 3830 c = '\uFFFD'; 3831 // fall thru 3832 default: 3833 appendLongStrBuf(c); 3834 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3835 continue stateloop; 3836 } 3837 } 3838 // XXX reorder point 3839 case SCRIPT_DATA: 3840 scriptdataloop: for (;;) { 3841 if (reconsume) { 3842 reconsume = false; 3843 } else { 3844 if (++pos == endPos) { 3845 break stateloop; 3846 } 3847 c = checkChar(buf, pos); 3848 } 3849 switch (c) { 3850 case '<': 3851 /* 3852 * U+003C LESS-THAN SIGN (<) Switch to the 3853 * script data less-than sign state. 
3854 */ 3855 flushChars(buf, pos); 3856 returnState = state; 3857 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos); 3858 break scriptdataloop; // FALL THRU continue 3859 // stateloop; 3860 case '\u0000': 3861 emitReplacementCharacter(buf, pos); 3862 continue; 3863 case '\r': 3864 emitCarriageReturn(buf, pos); 3865 break stateloop; 3866 case '\n': 3867 silentLineFeed(); 3868 default: 3869 /* 3870 * Anything else Emit the current input 3871 * character as a character token. Stay in the 3872 * script data state. 3873 */ 3874 continue; 3875 } 3876 } 3877 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 3878 case SCRIPT_DATA_LESS_THAN_SIGN: 3879 scriptdatalessthansignloop: for (;;) { 3880 if (++pos == endPos) { 3881 break stateloop; 3882 } 3883 c = checkChar(buf, pos); 3884 switch (c) { 3885 case '/': 3886 /* 3887 * U+002F SOLIDUS (/) Set the temporary buffer 3888 * to the empty string. Switch to the script 3889 * data end tag open state. 3890 */ 3891 index = 0; 3892 clearStrBuf(); 3893 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 3894 continue stateloop; 3895 case '!': 3896 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 3897 cstart = pos; 3898 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos); 3899 break scriptdatalessthansignloop; // FALL THRU 3900 // continue 3901 // stateloop; 3902 default: 3903 /* 3904 * Otherwise, emit a U+003C LESS-THAN SIGN 3905 * character token 3906 */ 3907 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 3908 /* 3909 * and reconsume the current input character in 3910 * the data state. 
3911 */ 3912 cstart = pos; 3913 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 3914 reconsume = true; 3915 continue stateloop; 3916 } 3917 } 3918 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 3919 case SCRIPT_DATA_ESCAPE_START: 3920 scriptdataescapestartloop: for (;;) { 3921 if (++pos == endPos) { 3922 break stateloop; 3923 } 3924 c = checkChar(buf, pos); 3925 /* 3926 * Consume the next input character: 3927 */ 3928 switch (c) { 3929 case '-': 3930 /* 3931 * U+002D HYPHEN-MINUS (-) Emit a U+002D 3932 * HYPHEN-MINUS character token. Switch to the 3933 * script data escape start dash state. 3934 */ 3935 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos); 3936 break scriptdataescapestartloop; // FALL THRU 3937 // continue 3938 // stateloop; 3939 default: 3940 /* 3941 * Anything else Reconsume the current input 3942 * character in the script data state. 3943 */ 3944 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 3945 reconsume = true; 3946 continue stateloop; 3947 } 3948 } 3949 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 3950 case SCRIPT_DATA_ESCAPE_START_DASH: 3951 scriptdataescapestartdashloop: for (;;) { 3952 if (++pos == endPos) { 3953 break stateloop; 3954 } 3955 c = checkChar(buf, pos); 3956 /* 3957 * Consume the next input character: 3958 */ 3959 switch (c) { 3960 case '-': 3961 /* 3962 * U+002D HYPHEN-MINUS (-) Emit a U+002D 3963 * HYPHEN-MINUS character token. Switch to the 3964 * script data escaped dash dash state. 3965 */ 3966 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); 3967 break scriptdataescapestartdashloop; 3968 // continue stateloop; 3969 default: 3970 /* 3971 * Anything else Reconsume the current input 3972 * character in the script data state. 
3973 */ 3974 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 3975 reconsume = true; 3976 continue stateloop; 3977 } 3978 } 3979 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 3980 case SCRIPT_DATA_ESCAPED_DASH_DASH: 3981 scriptdataescapeddashdashloop: for (;;) { 3982 if (++pos == endPos) { 3983 break stateloop; 3984 } 3985 c = checkChar(buf, pos); 3986 /* 3987 * Consume the next input character: 3988 */ 3989 switch (c) { 3990 case '-': 3991 /* 3992 * U+002D HYPHEN-MINUS (-) Emit a U+002D 3993 * HYPHEN-MINUS character token. Stay in the 3994 * script data escaped dash dash state. 3995 */ 3996 continue; 3997 case '<': 3998 /* 3999 * U+003C LESS-THAN SIGN (<) Switch to the 4000 * script data escaped less-than sign state. 4001 */ 4002 flushChars(buf, pos); 4003 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4004 continue stateloop; 4005 case '>': 4006 /* 4007 * U+003E GREATER-THAN SIGN (>) Emit a U+003E 4008 * GREATER-THAN SIGN character token. Switch to 4009 * the script data state. 4010 */ 4011 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4012 continue stateloop; 4013 case '\u0000': 4014 emitReplacementCharacter(buf, pos); 4015 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4016 break scriptdataescapeddashdashloop; 4017 case '\r': 4018 emitCarriageReturn(buf, pos); 4019 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4020 break stateloop; 4021 case '\n': 4022 silentLineFeed(); 4023 default: 4024 /* 4025 * Anything else Emit the current input 4026 * character as a character token. Switch to the 4027 * script data escaped state. 
4028 */ 4029 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4030 break scriptdataescapeddashdashloop; 4031 // continue stateloop; 4032 } 4033 } 4034 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4035 case SCRIPT_DATA_ESCAPED: 4036 scriptdataescapedloop: for (;;) { 4037 if (reconsume) { 4038 reconsume = false; 4039 } else { 4040 if (++pos == endPos) { 4041 break stateloop; 4042 } 4043 c = checkChar(buf, pos); 4044 } 4045 /* 4046 * Consume the next input character: 4047 */ 4048 switch (c) { 4049 case '-': 4050 /* 4051 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4052 * HYPHEN-MINUS character token. Switch to the 4053 * script data escaped dash state. 4054 */ 4055 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos); 4056 break scriptdataescapedloop; // FALL THRU 4057 // continue 4058 // stateloop; 4059 case '<': 4060 /* 4061 * U+003C LESS-THAN SIGN (<) Switch to the 4062 * script data escaped less-than sign state. 4063 */ 4064 flushChars(buf, pos); 4065 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4066 continue stateloop; 4067 case '\u0000': 4068 emitReplacementCharacter(buf, pos); 4069 continue; 4070 case '\r': 4071 emitCarriageReturn(buf, pos); 4072 break stateloop; 4073 case '\n': 4074 silentLineFeed(); 4075 default: 4076 /* 4077 * Anything else Emit the current input 4078 * character as a character token. Stay in the 4079 * script data escaped state. 4080 */ 4081 continue; 4082 } 4083 } 4084 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4085 case SCRIPT_DATA_ESCAPED_DASH: 4086 scriptdataescapeddashloop: for (;;) { 4087 if (++pos == endPos) { 4088 break stateloop; 4089 } 4090 c = checkChar(buf, pos); 4091 /* 4092 * Consume the next input character: 4093 */ 4094 switch (c) { 4095 case '-': 4096 /* 4097 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4098 * HYPHEN-MINUS character token. Switch to the 4099 * script data escaped dash dash state. 
4100 */ 4101 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); 4102 continue stateloop; 4103 case '<': 4104 /* 4105 * U+003C LESS-THAN SIGN (<) Switch to the 4106 * script data escaped less-than sign state. 4107 */ 4108 flushChars(buf, pos); 4109 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4110 break scriptdataescapeddashloop; 4111 // continue stateloop; 4112 case '\u0000': 4113 emitReplacementCharacter(buf, pos); 4114 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4115 continue stateloop; 4116 case '\r': 4117 emitCarriageReturn(buf, pos); 4118 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4119 break stateloop; 4120 case '\n': 4121 silentLineFeed(); 4122 default: 4123 /* 4124 * Anything else Emit the current input 4125 * character as a character token. Switch to the 4126 * script data escaped state. 4127 */ 4128 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4129 continue stateloop; 4130 } 4131 } 4132 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4133 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: 4134 scriptdataescapedlessthanloop: for (;;) { 4135 if (++pos == endPos) { 4136 break stateloop; 4137 } 4138 c = checkChar(buf, pos); 4139 /* 4140 * Consume the next input character: 4141 */ 4142 switch (c) { 4143 case '/': 4144 /* 4145 * U+002F SOLIDUS (/) Set the temporary buffer 4146 * to the empty string. Switch to the script 4147 * data escaped end tag open state. 4148 */ 4149 index = 0; 4150 clearStrBuf(); 4151 returnState = Tokenizer.SCRIPT_DATA_ESCAPED; 4152 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 4153 continue stateloop; 4154 case 'S': 4155 case 's': 4156 /* 4157 * U+0041 LATIN CAPITAL LETTER A through to 4158 * U+005A LATIN CAPITAL LETTER Z Emit a U+003C 4159 * LESS-THAN SIGN character token and the 4160 * current input character as a character token. 
4161 */ 4162 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4163 cstart = pos; 4164 index = 1; 4165 /* 4166 * Set the temporary buffer to the empty string. 4167 * Append the lowercase version of the current 4168 * input character (add 0x0020 to the 4169 * character's code point) to the temporary 4170 * buffer. Switch to the script data double 4171 * escape start state. 4172 */ 4173 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos); 4174 break scriptdataescapedlessthanloop; 4175 // continue stateloop; 4176 default: 4177 /* 4178 * Anything else Emit a U+003C LESS-THAN SIGN 4179 * character token and reconsume the current 4180 * input character in the script data escaped 4181 * state. 4182 */ 4183 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4184 cstart = pos; 4185 reconsume = true; 4186 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4187 continue stateloop; 4188 } 4189 } 4190 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4191 case SCRIPT_DATA_DOUBLE_ESCAPE_START: 4192 scriptdatadoubleescapestartloop: for (;;) { 4193 if (++pos == endPos) { 4194 break stateloop; 4195 } 4196 c = checkChar(buf, pos); 4197 assert (index > 0); 4198 if (index < 6) { // SCRIPT_ARR.length 4199 char folded = c; 4200 if (c >= 'A' && c <= 'Z') { 4201 folded += 0x20; 4202 } 4203 if (folded != Tokenizer.SCRIPT_ARR[index]) { 4204 reconsume = true; 4205 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4206 continue stateloop; 4207 } 4208 index++; 4209 continue; 4210 } 4211 switch (c) { 4212 case '\r': 4213 emitCarriageReturn(buf, pos); 4214 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4215 break stateloop; 4216 case '\n': 4217 silentLineFeed(); 4218 case ' ': 4219 case '\t': 4220 case '\u000C': 4221 case '/': 4222 case '>': 4223 /* 4224 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4225 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4226 * U+002F SOLIDUS (/) U+003E 
GREATER-THAN SIGN 4227 * (>) Emit the current input character as a 4228 * character token. If the temporary buffer is 4229 * the string "script", then switch to the 4230 * script data double escaped state. 4231 */ 4232 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4233 break scriptdatadoubleescapestartloop; 4234 // continue stateloop; 4235 default: 4236 /* 4237 * Anything else Reconsume the current input 4238 * character in the script data escaped state. 4239 */ 4240 reconsume = true; 4241 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4242 continue stateloop; 4243 } 4244 } 4245 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4246 case SCRIPT_DATA_DOUBLE_ESCAPED: 4247 scriptdatadoubleescapedloop: for (;;) { 4248 if (reconsume) { 4249 reconsume = false; 4250 } else { 4251 if (++pos == endPos) { 4252 break stateloop; 4253 } 4254 c = checkChar(buf, pos); 4255 } 4256 /* 4257 * Consume the next input character: 4258 */ 4259 switch (c) { 4260 case '-': 4261 /* 4262 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4263 * HYPHEN-MINUS character token. Switch to the 4264 * script data double escaped dash state. 4265 */ 4266 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos); 4267 break scriptdatadoubleescapedloop; // FALL THRU 4268 // continue 4269 // stateloop; 4270 case '<': 4271 /* 4272 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4273 * LESS-THAN SIGN character token. Switch to the 4274 * script data double escaped less-than sign 4275 * state. 4276 */ 4277 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4278 continue stateloop; 4279 case '\u0000': 4280 emitReplacementCharacter(buf, pos); 4281 continue; 4282 case '\r': 4283 emitCarriageReturn(buf, pos); 4284 break stateloop; 4285 case '\n': 4286 silentLineFeed(); 4287 default: 4288 /* 4289 * Anything else Emit the current input 4290 * character as a character token. 
Stay in the 4291 * script data double escaped state. 4292 */ 4293 continue; 4294 } 4295 } 4296 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4297 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: 4298 scriptdatadoubleescapeddashloop: for (;;) { 4299 if (++pos == endPos) { 4300 break stateloop; 4301 } 4302 c = checkChar(buf, pos); 4303 /* 4304 * Consume the next input character: 4305 */ 4306 switch (c) { 4307 case '-': 4308 /* 4309 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4310 * HYPHEN-MINUS character token. Switch to the 4311 * script data double escaped dash dash state. 4312 */ 4313 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos); 4314 break scriptdatadoubleescapeddashloop; 4315 // continue stateloop; 4316 case '<': 4317 /* 4318 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4319 * LESS-THAN SIGN character token. Switch to the 4320 * script data double escaped less-than sign 4321 * state. 4322 */ 4323 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4324 continue stateloop; 4325 case '\u0000': 4326 emitReplacementCharacter(buf, pos); 4327 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4328 continue stateloop; 4329 case '\r': 4330 emitCarriageReturn(buf, pos); 4331 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4332 break stateloop; 4333 case '\n': 4334 silentLineFeed(); 4335 default: 4336 /* 4337 * Anything else Emit the current input 4338 * character as a character token. Switch to the 4339 * script data double escaped state. 
4340 */ 4341 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4342 continue stateloop; 4343 } 4344 } 4345 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4346 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: 4347 scriptdatadoubleescapeddashdashloop: for (;;) { 4348 if (++pos == endPos) { 4349 break stateloop; 4350 } 4351 c = checkChar(buf, pos); 4352 /* 4353 * Consume the next input character: 4354 */ 4355 switch (c) { 4356 case '-': 4357 /* 4358 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4359 * HYPHEN-MINUS character token. Stay in the 4360 * script data double escaped dash dash state. 4361 */ 4362 continue; 4363 case '<': 4364 /* 4365 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4366 * LESS-THAN SIGN character token. Switch to the 4367 * script data double escaped less-than sign 4368 * state. 4369 */ 4370 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4371 break scriptdatadoubleescapeddashdashloop; 4372 case '>': 4373 /* 4374 * U+003E GREATER-THAN SIGN (>) Emit a U+003E 4375 * GREATER-THAN SIGN character token. Switch to 4376 * the script data state. 4377 */ 4378 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4379 continue stateloop; 4380 case '\u0000': 4381 emitReplacementCharacter(buf, pos); 4382 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4383 continue stateloop; 4384 case '\r': 4385 emitCarriageReturn(buf, pos); 4386 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4387 break stateloop; 4388 case '\n': 4389 silentLineFeed(); 4390 default: 4391 /* 4392 * Anything else Emit the current input 4393 * character as a character token. Switch to the 4394 * script data double escaped state. 
4395 */ 4396 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4397 continue stateloop; 4398 } 4399 } 4400 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4401 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: 4402 scriptdatadoubleescapedlessthanloop: for (;;) { 4403 if (++pos == endPos) { 4404 break stateloop; 4405 } 4406 c = checkChar(buf, pos); 4407 /* 4408 * Consume the next input character: 4409 */ 4410 switch (c) { 4411 case '/': 4412 /* 4413 * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS 4414 * character token. Set the temporary buffer to 4415 * the empty string. Switch to the script data 4416 * double escape end state. 4417 */ 4418 index = 0; 4419 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos); 4420 break scriptdatadoubleescapedlessthanloop; 4421 default: 4422 /* 4423 * Anything else Reconsume the current input 4424 * character in the script data double escaped 4425 * state. 4426 */ 4427 reconsume = true; 4428 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4429 continue stateloop; 4430 } 4431 } 4432 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 4433 case SCRIPT_DATA_DOUBLE_ESCAPE_END: 4434 scriptdatadoubleescapeendloop: for (;;) { 4435 if (++pos == endPos) { 4436 break stateloop; 4437 } 4438 c = checkChar(buf, pos); 4439 if (index < 6) { // SCRIPT_ARR.length 4440 char folded = c; 4441 if (c >= 'A' && c <= 'Z') { 4442 folded += 0x20; 4443 } 4444 if (folded != Tokenizer.SCRIPT_ARR[index]) { 4445 reconsume = true; 4446 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4447 continue stateloop; 4448 } 4449 index++; 4450 continue; 4451 } 4452 switch (c) { 4453 case '\r': 4454 emitCarriageReturn(buf, pos); 4455 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4456 break stateloop; 4457 case '\n': 4458 silentLineFeed(); 4459 case ' ': 4460 case '\t': 4461 case '\u000C': 4462 case '/': 4463 case '>': 4464 /* 
4465 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4466 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4467 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN 4468 * (>) Emit the current input character as a 4469 * character token. If the temporary buffer is 4470 * the string "script", then switch to the 4471 * script data escaped state. 4472 */ 4473 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4474 continue stateloop; 4475 default: 4476 /* 4477 * Reconsume the current input character in the 4478 * script data double escaped state. 4479 */ 4480 reconsume = true; 4481 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4482 continue stateloop; 4483 } 4484 } 4485 // XXX reorder point 4486 case MARKUP_DECLARATION_OCTYPE: 4487 markupdeclarationdoctypeloop: for (;;) { 4488 if (++pos == endPos) { 4489 break stateloop; 4490 } 4491 c = checkChar(buf, pos); 4492 if (index < 6) { // OCTYPE.length 4493 char folded = c; 4494 if (c >= 'A' && c <= 'Z') { 4495 folded += 0x20; 4496 } 4497 if (folded == Tokenizer.OCTYPE[index]) { 4498 appendLongStrBuf(c); 4499 } else { 4500 errBogusComment(); 4501 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 4502 reconsume = true; 4503 continue stateloop; 4504 } 4505 index++; 4506 continue; 4507 } else { 4508 state = transition(state, Tokenizer.DOCTYPE, reconsume, pos); 4509 reconsume = true; 4510 break markupdeclarationdoctypeloop; 4511 // continue stateloop; 4512 } 4513 } 4514 // FALLTHRU DON'T REORDER 4515 case DOCTYPE: 4516 doctypeloop: for (;;) { 4517 if (reconsume) { 4518 reconsume = false; 4519 } else { 4520 if (++pos == endPos) { 4521 break stateloop; 4522 } 4523 c = checkChar(buf, pos); 4524 } 4525 initDoctypeFields(); 4526 /* 4527 * Consume the next input character: 4528 */ 4529 switch (c) { 4530 case '\r': 4531 silentCarriageReturn(); 4532 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 4533 break stateloop; 4534 case '\n': 4535 
silentLineFeed(); 4536 // fall thru 4537 case ' ': 4538 case '\t': 4539 case '\u000C': 4540 /* 4541 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4542 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4543 * Switch to the before DOCTYPE name state. 4544 */ 4545 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 4546 break doctypeloop; 4547 // continue stateloop; 4548 default: 4549 /* 4550 * Anything else Parse error. 4551 */ 4552 errMissingSpaceBeforeDoctypeName(); 4553 /* 4554 * Reconsume the current character in the before 4555 * DOCTYPE name state. 4556 */ 4557 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 4558 reconsume = true; 4559 break doctypeloop; 4560 // continue stateloop; 4561 } 4562 } 4563 // FALLTHRU DON'T REORDER 4564 case BEFORE_DOCTYPE_NAME: 4565 beforedoctypenameloop: for (;;) { 4566 if (reconsume) { 4567 reconsume = false; 4568 } else { 4569 if (++pos == endPos) { 4570 break stateloop; 4571 } 4572 c = checkChar(buf, pos); 4573 } 4574 /* 4575 * Consume the next input character: 4576 */ 4577 switch (c) { 4578 case '\r': 4579 silentCarriageReturn(); 4580 break stateloop; 4581 case '\n': 4582 silentLineFeed(); 4583 // fall thru 4584 case ' ': 4585 case '\t': 4586 case '\u000C': 4587 /* 4588 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4589 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 4590 * in the before DOCTYPE name state. 4591 */ 4592 continue; 4593 case '>': 4594 /* 4595 * U+003E GREATER-THAN SIGN (>) Parse error. 4596 */ 4597 errNamelessDoctype(); 4598 /* 4599 * Create a new DOCTYPE token. Set its 4600 * force-quirks flag to on. 4601 */ 4602 forceQuirks = true; 4603 /* 4604 * Emit the token. 4605 */ 4606 emitDoctypeToken(pos); 4607 /* 4608 * Switch to the data state. 
4609 */ 4610 state = transition(state, Tokenizer.DATA, reconsume, pos); 4611 continue stateloop; 4612 case '\u0000': 4613 c = '\uFFFD'; 4614 // fall thru 4615 default: 4616 if (c >= 'A' && c <= 'Z') { 4617 /* 4618 * U+0041 LATIN CAPITAL LETTER A through to 4619 * U+005A LATIN CAPITAL LETTER Z Create a 4620 * new DOCTYPE token. Set the token's name 4621 * to the lowercase version of the input 4622 * character (add 0x0020 to the character's 4623 * code point). 4624 */ 4625 c += 0x20; 4626 } 4627 /* Anything else Create a new DOCTYPE token. */ 4628 /* 4629 * Set the token's name name to the current 4630 * input character. 4631 */ 4632 clearStrBufAndAppend(c); 4633 /* 4634 * Switch to the DOCTYPE name state. 4635 */ 4636 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos); 4637 break beforedoctypenameloop; 4638 // continue stateloop; 4639 } 4640 } 4641 // FALLTHRU DON'T REORDER 4642 case DOCTYPE_NAME: 4643 doctypenameloop: for (;;) { 4644 if (++pos == endPos) { 4645 break stateloop; 4646 } 4647 c = checkChar(buf, pos); 4648 /* 4649 * Consume the next input character: 4650 */ 4651 switch (c) { 4652 case '\r': 4653 silentCarriageReturn(); 4654 strBufToDoctypeName(); 4655 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); 4656 break stateloop; 4657 case '\n': 4658 silentLineFeed(); 4659 // fall thru 4660 case ' ': 4661 case '\t': 4662 case '\u000C': 4663 /* 4664 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4665 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4666 * Switch to the after DOCTYPE name state. 4667 */ 4668 strBufToDoctypeName(); 4669 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); 4670 break doctypenameloop; 4671 // continue stateloop; 4672 case '>': 4673 /* 4674 * U+003E GREATER-THAN SIGN (>) Emit the current 4675 * DOCTYPE token. 4676 */ 4677 strBufToDoctypeName(); 4678 emitDoctypeToken(pos); 4679 /* 4680 * Switch to the data state. 
4681 */ 4682 state = transition(state, Tokenizer.DATA, reconsume, pos); 4683 continue stateloop; 4684 case '\u0000': 4685 c = '\uFFFD'; 4686 // fall thru 4687 default: 4688 /* 4689 * U+0041 LATIN CAPITAL LETTER A through to 4690 * U+005A LATIN CAPITAL LETTER Z Append the 4691 * lowercase version of the input character (add 4692 * 0x0020 to the character's code point) to the 4693 * current DOCTYPE token's name. 4694 */ 4695 if (c >= 'A' && c <= 'Z') { 4696 c += 0x0020; 4697 } 4698 /* 4699 * Anything else Append the current input 4700 * character to the current DOCTYPE token's 4701 * name. 4702 */ 4703 appendStrBuf(c); 4704 /* 4705 * Stay in the DOCTYPE name state. 4706 */ 4707 continue; 4708 } 4709 } 4710 // FALLTHRU DON'T REORDER 4711 case AFTER_DOCTYPE_NAME: 4712 afterdoctypenameloop: for (;;) { 4713 if (++pos == endPos) { 4714 break stateloop; 4715 } 4716 c = checkChar(buf, pos); 4717 /* 4718 * Consume the next input character: 4719 */ 4720 switch (c) { 4721 case '\r': 4722 silentCarriageReturn(); 4723 break stateloop; 4724 case '\n': 4725 silentLineFeed(); 4726 // fall thru 4727 case ' ': 4728 case '\t': 4729 case '\u000C': 4730 /* 4731 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4732 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 4733 * in the after DOCTYPE name state. 4734 */ 4735 continue; 4736 case '>': 4737 /* 4738 * U+003E GREATER-THAN SIGN (>) Emit the current 4739 * DOCTYPE token. 4740 */ 4741 emitDoctypeToken(pos); 4742 /* 4743 * Switch to the data state. 4744 */ 4745 state = transition(state, Tokenizer.DATA, reconsume, pos); 4746 continue stateloop; 4747 case 'p': 4748 case 'P': 4749 index = 0; 4750 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos); 4751 break afterdoctypenameloop; 4752 // continue stateloop; 4753 case 's': 4754 case 'S': 4755 index = 0; 4756 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos); 4757 continue stateloop; 4758 default: 4759 /* 4760 * Otherwise, this is the parse error. 
4761 */ 4762 bogusDoctype(); 4763 4764 /* 4765 * Set the DOCTYPE token's force-quirks flag to 4766 * on. 4767 */ 4768 // done by bogusDoctype(); 4769 /* 4770 * Switch to the bogus DOCTYPE state. 4771 */ 4772 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 4773 continue stateloop; 4774 } 4775 } 4776 // FALLTHRU DON'T REORDER 4777 case DOCTYPE_UBLIC: 4778 doctypeublicloop: for (;;) { 4779 if (++pos == endPos) { 4780 break stateloop; 4781 } 4782 c = checkChar(buf, pos); 4783 /* 4784 * If the six characters starting from the current input 4785 * character are an ASCII case-insensitive match for the 4786 * word "PUBLIC", then consume those characters and 4787 * switch to the before DOCTYPE public identifier state. 4788 */ 4789 if (index < 5) { // UBLIC.length 4790 char folded = c; 4791 if (c >= 'A' && c <= 'Z') { 4792 folded += 0x20; 4793 } 4794 if (folded != Tokenizer.UBLIC[index]) { 4795 bogusDoctype(); 4796 // forceQuirks = true; 4797 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 4798 reconsume = true; 4799 continue stateloop; 4800 } 4801 index++; 4802 continue; 4803 } else { 4804 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos); 4805 reconsume = true; 4806 break doctypeublicloop; 4807 // continue stateloop; 4808 } 4809 } 4810 // FALLTHRU DON'T REORDER 4811 case AFTER_DOCTYPE_PUBLIC_KEYWORD: 4812 afterdoctypepublickeywordloop: for (;;) { 4813 if (reconsume) { 4814 reconsume = false; 4815 } else { 4816 if (++pos == endPos) { 4817 break stateloop; 4818 } 4819 c = checkChar(buf, pos); 4820 } 4821 /* 4822 * Consume the next input character: 4823 */ 4824 switch (c) { 4825 case '\r': 4826 silentCarriageReturn(); 4827 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 4828 break stateloop; 4829 case '\n': 4830 silentLineFeed(); 4831 // fall thru 4832 case ' ': 4833 case '\t': 4834 case '\u000C': 4835 /* 4836 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4837 * 
(LF) U+000C FORM FEED (FF) U+0020 SPACE 4838 * Switch to the before DOCTYPE public 4839 * identifier state. 4840 */ 4841 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 4842 break afterdoctypepublickeywordloop; 4843 // FALL THROUGH continue stateloop 4844 case '"': 4845 /* 4846 * U+0022 QUOTATION MARK (") Parse Error. 4847 */ 4848 errNoSpaceBetweenDoctypePublicKeywordAndQuote(); 4849 /* 4850 * Set the DOCTYPE token's public identifier to 4851 * the empty string (not missing), 4852 */ 4853 clearLongStrBuf(); 4854 /* 4855 * then switch to the DOCTYPE public identifier 4856 * (double-quoted) state. 4857 */ 4858 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 4859 continue stateloop; 4860 case '\'': 4861 /* 4862 * U+0027 APOSTROPHE (') Parse Error. 4863 */ 4864 errNoSpaceBetweenDoctypePublicKeywordAndQuote(); 4865 /* 4866 * Set the DOCTYPE token's public identifier to 4867 * the empty string (not missing), 4868 */ 4869 clearLongStrBuf(); 4870 /* 4871 * then switch to the DOCTYPE public identifier 4872 * (single-quoted) state. 4873 */ 4874 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 4875 continue stateloop; 4876 case '>': 4877 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 4878 errExpectedPublicId(); 4879 /* 4880 * Set the DOCTYPE token's force-quirks flag to 4881 * on. 4882 */ 4883 forceQuirks = true; 4884 /* 4885 * Emit that DOCTYPE token. 4886 */ 4887 emitDoctypeToken(pos); 4888 /* 4889 * Switch to the data state. 4890 */ 4891 state = transition(state, Tokenizer.DATA, reconsume, pos); 4892 continue stateloop; 4893 default: 4894 bogusDoctype(); 4895 /* 4896 * Set the DOCTYPE token's force-quirks flag to 4897 * on. 4898 */ 4899 // done by bogusDoctype(); 4900 /* 4901 * Switch to the bogus DOCTYPE state. 
4902 */ 4903 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 4904 continue stateloop; 4905 } 4906 } 4907 // FALLTHRU DON'T REORDER 4908 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: 4909 beforedoctypepublicidentifierloop: for (;;) { 4910 if (++pos == endPos) { 4911 break stateloop; 4912 } 4913 c = checkChar(buf, pos); 4914 /* 4915 * Consume the next input character: 4916 */ 4917 switch (c) { 4918 case '\r': 4919 silentCarriageReturn(); 4920 break stateloop; 4921 case '\n': 4922 silentLineFeed(); 4923 // fall thru 4924 case ' ': 4925 case '\t': 4926 case '\u000C': 4927 /* 4928 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4929 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 4930 * in the before DOCTYPE public identifier 4931 * state. 4932 */ 4933 continue; 4934 case '"': 4935 /* 4936 * U+0022 QUOTATION MARK (") Set the DOCTYPE 4937 * token's public identifier to the empty string 4938 * (not missing), 4939 */ 4940 clearLongStrBuf(); 4941 /* 4942 * then switch to the DOCTYPE public identifier 4943 * (double-quoted) state. 4944 */ 4945 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 4946 break beforedoctypepublicidentifierloop; 4947 // continue stateloop; 4948 case '\'': 4949 /* 4950 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 4951 * public identifier to the empty string (not 4952 * missing), 4953 */ 4954 clearLongStrBuf(); 4955 /* 4956 * then switch to the DOCTYPE public identifier 4957 * (single-quoted) state. 4958 */ 4959 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 4960 continue stateloop; 4961 case '>': 4962 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 4963 errExpectedPublicId(); 4964 /* 4965 * Set the DOCTYPE token's force-quirks flag to 4966 * on. 4967 */ 4968 forceQuirks = true; 4969 /* 4970 * Emit that DOCTYPE token. 4971 */ 4972 emitDoctypeToken(pos); 4973 /* 4974 * Switch to the data state. 
4975 */ 4976 state = transition(state, Tokenizer.DATA, reconsume, pos); 4977 continue stateloop; 4978 default: 4979 bogusDoctype(); 4980 /* 4981 * Set the DOCTYPE token's force-quirks flag to 4982 * on. 4983 */ 4984 // done by bogusDoctype(); 4985 /* 4986 * Switch to the bogus DOCTYPE state. 4987 */ 4988 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 4989 continue stateloop; 4990 } 4991 } 4992 // FALLTHRU DON'T REORDER 4993 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: 4994 doctypepublicidentifierdoublequotedloop: for (;;) { 4995 if (++pos == endPos) { 4996 break stateloop; 4997 } 4998 c = checkChar(buf, pos); 4999 /* 5000 * Consume the next input character: 5001 */ 5002 switch (c) { 5003 case '"': 5004 /* 5005 * U+0022 QUOTATION MARK (") Switch to the after 5006 * DOCTYPE public identifier state. 5007 */ 5008 publicIdentifier = longStrBufToString(); 5009 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5010 break doctypepublicidentifierdoublequotedloop; 5011 // continue stateloop; 5012 case '>': 5013 /* 5014 * U+003E GREATER-THAN SIGN (>) Parse error. 5015 */ 5016 errGtInPublicId(); 5017 /* 5018 * Set the DOCTYPE token's force-quirks flag to 5019 * on. 5020 */ 5021 forceQuirks = true; 5022 /* 5023 * Emit that DOCTYPE token. 5024 */ 5025 publicIdentifier = longStrBufToString(); 5026 emitDoctypeToken(pos); 5027 /* 5028 * Switch to the data state. 5029 */ 5030 state = transition(state, Tokenizer.DATA, reconsume, pos); 5031 continue stateloop; 5032 case '\r': 5033 appendLongStrBufCarriageReturn(); 5034 break stateloop; 5035 case '\n': 5036 appendLongStrBufLineFeed(); 5037 continue; 5038 case '\u0000': 5039 c = '\uFFFD'; 5040 // fall thru 5041 default: 5042 /* 5043 * Anything else Append the current input 5044 * character to the current DOCTYPE token's 5045 * public identifier. 5046 */ 5047 appendLongStrBuf(c); 5048 /* 5049 * Stay in the DOCTYPE public identifier 5050 * (double-quoted) state. 
5051 */ 5052 continue; 5053 } 5054 } 5055 // FALLTHRU DON'T REORDER 5056 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: 5057 afterdoctypepublicidentifierloop: for (;;) { 5058 if (++pos == endPos) { 5059 break stateloop; 5060 } 5061 c = checkChar(buf, pos); 5062 /* 5063 * Consume the next input character: 5064 */ 5065 switch (c) { 5066 case '\r': 5067 silentCarriageReturn(); 5068 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); 5069 break stateloop; 5070 case '\n': 5071 silentLineFeed(); 5072 // fall thru 5073 case ' ': 5074 case '\t': 5075 case '\u000C': 5076 /* 5077 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5078 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5079 * Switch to the between DOCTYPE public and 5080 * system identifiers state. 5081 */ 5082 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); 5083 break afterdoctypepublicidentifierloop; 5084 // continue stateloop; 5085 case '>': 5086 /* 5087 * U+003E GREATER-THAN SIGN (>) Emit the current 5088 * DOCTYPE token. 5089 */ 5090 emitDoctypeToken(pos); 5091 /* 5092 * Switch to the data state. 5093 */ 5094 state = transition(state, Tokenizer.DATA, reconsume, pos); 5095 continue stateloop; 5096 case '"': 5097 /* 5098 * U+0022 QUOTATION MARK (") Parse error. 5099 */ 5100 errNoSpaceBetweenPublicAndSystemIds(); 5101 /* 5102 * Set the DOCTYPE token's system identifier to 5103 * the empty string (not missing), 5104 */ 5105 clearLongStrBuf(); 5106 /* 5107 * then switch to the DOCTYPE system identifier 5108 * (double-quoted) state. 5109 */ 5110 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5111 continue stateloop; 5112 case '\'': 5113 /* 5114 * U+0027 APOSTROPHE (') Parse error. 
5115 */ 5116 errNoSpaceBetweenPublicAndSystemIds(); 5117 /* 5118 * Set the DOCTYPE token's system identifier to 5119 * the empty string (not missing), 5120 */ 5121 clearLongStrBuf(); 5122 /* 5123 * then switch to the DOCTYPE system identifier 5124 * (single-quoted) state. 5125 */ 5126 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5127 continue stateloop; 5128 default: 5129 bogusDoctype(); 5130 /* 5131 * Set the DOCTYPE token's force-quirks flag to 5132 * on. 5133 */ 5134 // done by bogusDoctype(); 5135 /* 5136 * Switch to the bogus DOCTYPE state. 5137 */ 5138 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5139 continue stateloop; 5140 } 5141 } 5142 // FALLTHRU DON'T REORDER 5143 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: 5144 betweendoctypepublicandsystemidentifiersloop: for (;;) { 5145 if (++pos == endPos) { 5146 break stateloop; 5147 } 5148 c = checkChar(buf, pos); 5149 /* 5150 * Consume the next input character: 5151 */ 5152 switch (c) { 5153 case '\r': 5154 silentCarriageReturn(); 5155 break stateloop; 5156 case '\n': 5157 silentLineFeed(); 5158 // fall thru 5159 case ' ': 5160 case '\t': 5161 case '\u000C': 5162 /* 5163 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5164 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5165 * in the between DOCTYPE public and system 5166 * identifiers state. 5167 */ 5168 continue; 5169 case '>': 5170 /* 5171 * U+003E GREATER-THAN SIGN (>) Emit the current 5172 * DOCTYPE token. 5173 */ 5174 emitDoctypeToken(pos); 5175 /* 5176 * Switch to the data state. 5177 */ 5178 state = transition(state, Tokenizer.DATA, reconsume, pos); 5179 continue stateloop; 5180 case '"': 5181 /* 5182 * U+0022 QUOTATION MARK (") Set the DOCTYPE 5183 * token's system identifier to the empty string 5184 * (not missing), 5185 */ 5186 clearLongStrBuf(); 5187 /* 5188 * then switch to the DOCTYPE system identifier 5189 * (double-quoted) state. 
5190 */ 5191 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5192 break betweendoctypepublicandsystemidentifiersloop; 5193 // continue stateloop; 5194 case '\'': 5195 /* 5196 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 5197 * system identifier to the empty string (not 5198 * missing), 5199 */ 5200 clearLongStrBuf(); 5201 /* 5202 * then switch to the DOCTYPE system identifier 5203 * (single-quoted) state. 5204 */ 5205 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5206 continue stateloop; 5207 default: 5208 bogusDoctype(); 5209 /* 5210 * Set the DOCTYPE token's force-quirks flag to 5211 * on. 5212 */ 5213 // done by bogusDoctype(); 5214 /* 5215 * Switch to the bogus DOCTYPE state. 5216 */ 5217 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5218 continue stateloop; 5219 } 5220 } 5221 // FALLTHRU DON'T REORDER 5222 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: 5223 doctypesystemidentifierdoublequotedloop: for (;;) { 5224 if (++pos == endPos) { 5225 break stateloop; 5226 } 5227 c = checkChar(buf, pos); 5228 /* 5229 * Consume the next input character: 5230 */ 5231 switch (c) { 5232 case '"': 5233 /* 5234 * U+0022 QUOTATION MARK (") Switch to the after 5235 * DOCTYPE system identifier state. 5236 */ 5237 systemIdentifier = longStrBufToString(); 5238 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5239 continue stateloop; 5240 case '>': 5241 /* 5242 * U+003E GREATER-THAN SIGN (>) Parse error. 5243 */ 5244 errGtInSystemId(); 5245 /* 5246 * Set the DOCTYPE token's force-quirks flag to 5247 * on. 5248 */ 5249 forceQuirks = true; 5250 /* 5251 * Emit that DOCTYPE token. 5252 */ 5253 systemIdentifier = longStrBufToString(); 5254 emitDoctypeToken(pos); 5255 /* 5256 * Switch to the data state. 
5257 */ 5258 state = transition(state, Tokenizer.DATA, reconsume, pos); 5259 continue stateloop; 5260 case '\r': 5261 appendLongStrBufCarriageReturn(); 5262 break stateloop; 5263 case '\n': 5264 appendLongStrBufLineFeed(); 5265 continue; 5266 case '\u0000': 5267 c = '\uFFFD'; 5268 // fall thru 5269 default: 5270 /* 5271 * Anything else Append the current input 5272 * character to the current DOCTYPE token's 5273 * system identifier. 5274 */ 5275 appendLongStrBuf(c); 5276 /* 5277 * Stay in the DOCTYPE system identifier 5278 * (double-quoted) state. 5279 */ 5280 continue; 5281 } 5282 } 5283 // FALLTHRU DON'T REORDER 5284 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: 5285 afterdoctypesystemidentifierloop: for (;;) { 5286 if (++pos == endPos) { 5287 break stateloop; 5288 } 5289 c = checkChar(buf, pos); 5290 /* 5291 * Consume the next input character: 5292 */ 5293 switch (c) { 5294 case '\r': 5295 silentCarriageReturn(); 5296 break stateloop; 5297 case '\n': 5298 silentLineFeed(); 5299 // fall thru 5300 case ' ': 5301 case '\t': 5302 case '\u000C': 5303 /* 5304 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5305 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5306 * in the after DOCTYPE system identifier state. 5307 */ 5308 continue; 5309 case '>': 5310 /* 5311 * U+003E GREATER-THAN SIGN (>) Emit the current 5312 * DOCTYPE token. 5313 */ 5314 emitDoctypeToken(pos); 5315 /* 5316 * Switch to the data state. 5317 */ 5318 state = transition(state, Tokenizer.DATA, reconsume, pos); 5319 continue stateloop; 5320 default: 5321 /* 5322 * Switch to the bogus DOCTYPE state. (This does 5323 * not set the DOCTYPE token's force-quirks flag 5324 * to on.) 
5325 */ 5326 bogusDoctypeWithoutQuirks(); 5327 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5328 break afterdoctypesystemidentifierloop; 5329 // continue stateloop; 5330 } 5331 } 5332 // FALLTHRU DON'T REORDER 5333 case BOGUS_DOCTYPE: 5334 for (;;) { 5335 if (reconsume) { 5336 reconsume = false; 5337 } else { 5338 if (++pos == endPos) { 5339 break stateloop; 5340 } 5341 c = checkChar(buf, pos); 5342 } 5343 /* 5344 * Consume the next input character: 5345 */ 5346 switch (c) { 5347 case '>': 5348 /* 5349 * U+003E GREATER-THAN SIGN (>) Emit that 5350 * DOCTYPE token. 5351 */ 5352 emitDoctypeToken(pos); 5353 /* 5354 * Switch to the data state. 5355 */ 5356 state = transition(state, Tokenizer.DATA, reconsume, pos); 5357 continue stateloop; 5358 case '\r': 5359 silentCarriageReturn(); 5360 break stateloop; 5361 case '\n': 5362 silentLineFeed(); 5363 // fall thru 5364 default: 5365 /* 5366 * Anything else Stay in the bogus DOCTYPE 5367 * state. 5368 */ 5369 continue; 5370 } 5371 } 5372 // XXX reorder point 5373 case DOCTYPE_YSTEM: 5374 doctypeystemloop: for (;;) { 5375 if (++pos == endPos) { 5376 break stateloop; 5377 } 5378 c = checkChar(buf, pos); 5379 /* 5380 * Otherwise, if the six characters starting from the 5381 * current input character are an ASCII case-insensitive 5382 * match for the word "SYSTEM", then consume those 5383 * characters and switch to the before DOCTYPE system 5384 * identifier state. 
5385 */ 5386 if (index < 5) { // YSTEM.length 5387 char folded = c; 5388 if (c >= 'A' && c <= 'Z') { 5389 folded += 0x20; 5390 } 5391 if (folded != Tokenizer.YSTEM[index]) { 5392 bogusDoctype(); 5393 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5394 reconsume = true; 5395 continue stateloop; 5396 } 5397 index++; 5398 continue stateloop; 5399 } else { 5400 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos); 5401 reconsume = true; 5402 break doctypeystemloop; 5403 // continue stateloop; 5404 } 5405 } 5406 // FALLTHRU DON'T REORDER 5407 case AFTER_DOCTYPE_SYSTEM_KEYWORD: 5408 afterdoctypesystemkeywordloop: for (;;) { 5409 if (reconsume) { 5410 reconsume = false; 5411 } else { 5412 if (++pos == endPos) { 5413 break stateloop; 5414 } 5415 c = checkChar(buf, pos); 5416 } 5417 /* 5418 * Consume the next input character: 5419 */ 5420 switch (c) { 5421 case '\r': 5422 silentCarriageReturn(); 5423 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5424 break stateloop; 5425 case '\n': 5426 silentLineFeed(); 5427 // fall thru 5428 case ' ': 5429 case '\t': 5430 case '\u000C': 5431 /* 5432 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5433 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5434 * Switch to the before DOCTYPE public 5435 * identifier state. 5436 */ 5437 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5438 break afterdoctypesystemkeywordloop; 5439 // FALL THROUGH continue stateloop 5440 case '"': 5441 /* 5442 * U+0022 QUOTATION MARK (") Parse Error. 5443 */ 5444 errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); 5445 /* 5446 * Set the DOCTYPE token's system identifier to 5447 * the empty string (not missing), 5448 */ 5449 clearLongStrBuf(); 5450 /* 5451 * then switch to the DOCTYPE public identifier 5452 * (double-quoted) state. 
5453 */ 5454 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5455 continue stateloop; 5456 case '\'': 5457 /* 5458 * U+0027 APOSTROPHE (') Parse Error. 5459 */ 5460 errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); 5461 /* 5462 * Set the DOCTYPE token's public identifier to 5463 * the empty string (not missing), 5464 */ 5465 clearLongStrBuf(); 5466 /* 5467 * then switch to the DOCTYPE public identifier 5468 * (single-quoted) state. 5469 */ 5470 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5471 continue stateloop; 5472 case '>': 5473 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5474 errExpectedPublicId(); 5475 /* 5476 * Set the DOCTYPE token's force-quirks flag to 5477 * on. 5478 */ 5479 forceQuirks = true; 5480 /* 5481 * Emit that DOCTYPE token. 5482 */ 5483 emitDoctypeToken(pos); 5484 /* 5485 * Switch to the data state. 5486 */ 5487 state = transition(state, Tokenizer.DATA, reconsume, pos); 5488 continue stateloop; 5489 default: 5490 bogusDoctype(); 5491 /* 5492 * Set the DOCTYPE token's force-quirks flag to 5493 * on. 5494 */ 5495 // done by bogusDoctype(); 5496 /* 5497 * Switch to the bogus DOCTYPE state. 5498 */ 5499 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5500 continue stateloop; 5501 } 5502 } 5503 // FALLTHRU DON'T REORDER 5504 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: 5505 beforedoctypesystemidentifierloop: for (;;) { 5506 if (++pos == endPos) { 5507 break stateloop; 5508 } 5509 c = checkChar(buf, pos); 5510 /* 5511 * Consume the next input character: 5512 */ 5513 switch (c) { 5514 case '\r': 5515 silentCarriageReturn(); 5516 break stateloop; 5517 case '\n': 5518 silentLineFeed(); 5519 // fall thru 5520 case ' ': 5521 case '\t': 5522 case '\u000C': 5523 /* 5524 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5525 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5526 * in the before DOCTYPE system identifier 5527 * state. 
5528 */ 5529 continue; 5530 case '"': 5531 /* 5532 * U+0022 QUOTATION MARK (") Set the DOCTYPE 5533 * token's system identifier to the empty string 5534 * (not missing), 5535 */ 5536 clearLongStrBuf(); 5537 /* 5538 * then switch to the DOCTYPE system identifier 5539 * (double-quoted) state. 5540 */ 5541 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5542 continue stateloop; 5543 case '\'': 5544 /* 5545 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 5546 * system identifier to the empty string (not 5547 * missing), 5548 */ 5549 clearLongStrBuf(); 5550 /* 5551 * then switch to the DOCTYPE system identifier 5552 * (single-quoted) state. 5553 */ 5554 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5555 break beforedoctypesystemidentifierloop; 5556 // continue stateloop; 5557 case '>': 5558 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5559 errExpectedSystemId(); 5560 /* 5561 * Set the DOCTYPE token's force-quirks flag to 5562 * on. 5563 */ 5564 forceQuirks = true; 5565 /* 5566 * Emit that DOCTYPE token. 5567 */ 5568 emitDoctypeToken(pos); 5569 /* 5570 * Switch to the data state. 5571 */ 5572 state = transition(state, Tokenizer.DATA, reconsume, pos); 5573 continue stateloop; 5574 default: 5575 bogusDoctype(); 5576 /* 5577 * Set the DOCTYPE token's force-quirks flag to 5578 * on. 5579 */ 5580 // done by bogusDoctype(); 5581 /* 5582 * Switch to the bogus DOCTYPE state. 5583 */ 5584 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5585 continue stateloop; 5586 } 5587 } 5588 // FALLTHRU DON'T REORDER 5589 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: 5590 for (;;) { 5591 if (++pos == endPos) { 5592 break stateloop; 5593 } 5594 c = checkChar(buf, pos); 5595 /* 5596 * Consume the next input character: 5597 */ 5598 switch (c) { 5599 case '\'': 5600 /* 5601 * U+0027 APOSTROPHE (') Switch to the after 5602 * DOCTYPE system identifier state. 
5603 */ 5604 systemIdentifier = longStrBufToString(); 5605 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5606 continue stateloop; 5607 case '>': 5608 errGtInSystemId(); 5609 /* 5610 * Set the DOCTYPE token's force-quirks flag to 5611 * on. 5612 */ 5613 forceQuirks = true; 5614 /* 5615 * Emit that DOCTYPE token. 5616 */ 5617 systemIdentifier = longStrBufToString(); 5618 emitDoctypeToken(pos); 5619 /* 5620 * Switch to the data state. 5621 */ 5622 state = transition(state, Tokenizer.DATA, reconsume, pos); 5623 continue stateloop; 5624 case '\r': 5625 appendLongStrBufCarriageReturn(); 5626 break stateloop; 5627 case '\n': 5628 appendLongStrBufLineFeed(); 5629 continue; 5630 case '\u0000': 5631 c = '\uFFFD'; 5632 // fall thru 5633 default: 5634 /* 5635 * Anything else Append the current input 5636 * character to the current DOCTYPE token's 5637 * system identifier. 5638 */ 5639 appendLongStrBuf(c); 5640 /* 5641 * Stay in the DOCTYPE system identifier 5642 * (double-quoted) state. 5643 */ 5644 continue; 5645 } 5646 } 5647 // XXX reorder point 5648 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: 5649 for (;;) { 5650 if (++pos == endPos) { 5651 break stateloop; 5652 } 5653 c = checkChar(buf, pos); 5654 /* 5655 * Consume the next input character: 5656 */ 5657 switch (c) { 5658 case '\'': 5659 /* 5660 * U+0027 APOSTROPHE (') Switch to the after 5661 * DOCTYPE public identifier state. 5662 */ 5663 publicIdentifier = longStrBufToString(); 5664 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5665 continue stateloop; 5666 case '>': 5667 errGtInPublicId(); 5668 /* 5669 * Set the DOCTYPE token's force-quirks flag to 5670 * on. 5671 */ 5672 forceQuirks = true; 5673 /* 5674 * Emit that DOCTYPE token. 5675 */ 5676 publicIdentifier = longStrBufToString(); 5677 emitDoctypeToken(pos); 5678 /* 5679 * Switch to the data state. 
*/
                                state = transition(state, Tokenizer.DATA, reconsume, pos);
                                continue stateloop;
                            case '\r':
                                appendLongStrBufCarriageReturn();
                                break stateloop;
                            case '\n':
                                appendLongStrBufLineFeed();
                                continue;
                            case '\u0000':
                                c = '\uFFFD';
                                // fall thru
                            default:
                                /*
                                 * Anything else Append the current input
                                 * character to the current DOCTYPE token's
                                 * public identifier.
                                 */
                                appendLongStrBuf(c);
                                /*
                                 * Stay in the DOCTYPE public identifier
                                 * (single-quoted) state.
                                 */
                                continue;
                        }
                    }
                    // END HOTSPOT WORKAROUND
            }
        }
        flushChars(buf, pos);
        /*
         * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
         */
        // Save locals
        stateSave = state;
        returnStateSave = returnState;
        return pos;
    }

    // HOTSPOT WORKAROUND INSERTION POINT

    // [NOCPP[

    /**
     * Hook called by the state loop whenever it assigns a new state.
     *
     * This implementation ignores <code>from</code>, <code>reconsume</code>
     * and <code>pos</code> and simply returns <code>to</code>.
     * NOTE(review): being <code>protected</code>, this is presumably meant to
     * be overridden by subclasses that want to observe transitions -- confirm.
     *
     * @param from the state being left
     * @param to the state being entered
     * @param reconsume the caller's current reconsume flag
     * @param pos the caller's current buffer position
     * @return the state the caller should store; here always <code>to</code>
     * @throws SAXException declared for overriders; never thrown here
     */
    protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
        return to;
    }

    // ]NOCPP]

    /**
     * Resets the per-DOCTYPE fields: the name becomes the empty string, the
     * system and public identifiers are released through
     * <code>Portability.releaseString</code> and nulled when present, and the
     * force-quirks flag is cleared.
     */
    private void initDoctypeFields() {
        doctypeName = "";
        if (systemIdentifier != null) {
            Portability.releaseString(systemIdentifier);
            systemIdentifier = null;
        }
        if (publicIdentifier != null) {
            Portability.releaseString(publicIdentifier);
            publicIdentifier = null;
        }
        forceQuirks = false;
    }

    /**
     * Records a carriage return for line counting and then hands a line feed
     * (not the CR itself) to
     * <code>adjustDoubleHyphenAndAppendToLongStrBufAndErr</code>.
     *
     * @throws SAXException propagated from the append/err helper
     */
    @Inline private void adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn()
            throws SAXException {
        silentCarriageReturn();
        adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
    }

    /**
     * Records a line feed for line counting and then hands a line feed to
     * <code>adjustDoubleHyphenAndAppendToLongStrBufAndErr</code>.
     *
     * @throws SAXException propagated from the append/err helper
     */
    @Inline private void adjustDoubleHyphenAndAppendToLongStrBufLineFeed()
            throws SAXException {
        silentLineFeed();
        adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
    }

    /**
     * Records a line feed for line counting and appends a line feed to the
     * long string buffer.
     */
    @Inline private void appendLongStrBufLineFeed() {
        silentLineFeed();
        appendLongStrBuf('\n');
    }

    /**
     * Records a carriage return for line counting and appends a line feed
     * (not the CR itself) to the long string buffer.
     */
    @Inline private void appendLongStrBufCarriageReturn() {
        silentCarriageReturn();
        appendLongStrBuf('\n');
    }

    /**
     * Advances the line counter for a carriage return and sets
     * <code>lastCR</code> so that the CR is remembered.
     * NOTE(review): <code>lastCR</code> is presumably consulted elsewhere to
     * avoid counting the LF of a CRLF pair twice -- confirm at its read site.
     */
    @Inline protected void silentCarriageReturn() {
        ++line;
        lastCR = true;
    }

    /**
     * Advances the line counter for a line feed.
     */
    @Inline protected void silentLineFeed() {
        ++line;
    }

    /**
     * Handles a carriage return in character data: records the CR, flushes
     * the characters pending up to <code>pos</code>, and reports one
     * character taken from <code>Tokenizer.LF</code> to the token handler.
     *
     * @param buf the buffer currently being tokenized
     * @param pos the position of the carriage return in <code>buf</code>
     * @throws SAXException propagated from flushing or from the token handler
     */
    private void emitCarriageReturn(@NoLength char[] buf, int pos)
            throws SAXException {
        silentCarriageReturn();
        flushChars(buf, pos);
        tokenHandler.characters(Tokenizer.LF, 0, 1);
        // NOTE(review): MAX_VALUE presumably marks "no pending character run";
        // the caller is expected to set cstart again -- confirm.
        cstart = Integer.MAX_VALUE;
    }

    /**
     * Flushes the characters pending up to <code>pos</code>, asks the token
     * handler for a zero-originating replacement character, and moves
     * <code>cstart</code> to just past <code>pos</code>.
     *
     * @param buf the buffer currently being tokenized
     * @param pos the position of the character being replaced
     * @throws SAXException propagated from flushing or from the token handler
     */
    private void emitReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.zeroOriginatingReplacementCharacter();
        cstart = pos + 1;
    }

    /**
     * Flushes the characters pending up to <code>pos</code>, reports one
     * character taken from <code>REPLACEMENT_CHARACTER</code> as ordinary
     * character data, and moves <code>cstart</code> to just past
     * <code>pos</code>.
     *
     * @param buf the buffer currently being tokenized
     * @param pos the position of the character being replaced
     * @throws SAXException propagated from flushing or from the token handler
     */
    private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
        cstart = pos + 1;
    }

    /**
     * Stores <code>add</code> in <code>additional</code> and, in the
     * NOCPP-marked part, snapshots the current location into
     * <code>ampersandLocation</code>.
     *
     * @param add the value to store in <code>additional</code>
     */
    private void setAdditionalAndRememberAmpersandLocation(char add) {
        additional = add;
        // [NOCPP[
        ampersandLocation = new LocatorImpl(this);
        // ]NOCPP]
    }

    /**
     * Reports a bogus DOCTYPE and turns the force-quirks flag on.
     *
     * @throws SAXException propagated from the error reporter
     */
    private void bogusDoctype() throws SAXException {
        errBogusDoctype();
        forceQuirks = true;
    }

    /**
     * Reports a bogus DOCTYPE and turns the force-quirks flag off.
     *
     * @throws SAXException propagated from the error reporter
     */
    private void bogusDoctypeWithoutQuirks() throws SAXException {
        errBogusDoctype();
        forceQuirks = false;
    }

    /**
     * Appends the string buffer to the long string buffer when
     * <code>returnState</code> has any bit of
     * <code>DATA_AND_RCDATA_MASK</code> set; otherwise emits the string
     * buffer.
     * NOTE(review): which return states carry those bits is defined with the
     * state constants, not here -- confirm before relying on the mask's name.
     *
     * @param returnState the state the tokenizer will return to
     * @throws SAXException propagated from emitting
     */
    private void emitOrAppendStrBuf(int returnState) throws SAXException {
        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
            appendStrBufToLongStrBuf();
        } else {
            emitStrBuf();
        }
    }

    /**
     * Emits or appends the character(s) for the numeric character reference
     * whose number is held in the <code>value</code> field.
     *
     * Values 0x80 to 0x9F are mapped through
     * <code>NamedCharacters.WINDOWS_1252</code>; zero, surrogates and values
     * above 0x10FFFF yield <code>Tokenizer.REPLACEMENT_CHARACTER</code>; other
     * values up to 0xFFFF are emitted as one <code>char</code> and values up
     * to 0x10FFFF as two. Every path except the form-feed one reports through
     * <code>emitOrAppendOne</code> or <code>emitOrAppendTwo</code>. The
     * NOCPP-marked parts add the form-feed policy handling and extra
     * diagnostics.
     *
     * @param returnState passed through to the emit-or-append helpers
     * @throws SAXException propagated from error reporting or emitting
     */
    private void handleNcrValue(int returnState) throws SAXException {
        /*
         * If one or more characters match the range, then take them all and
         * interpret the string of characters as a number (either hexadecimal or
         * decimal as appropriate).
         */
        if (value <= 0xFFFF) {
            if (value >= 0x80 && value <= 0x9f) {
                /*
                 * If that number is one of the numbers in the first column of
                 * the following table, then this is a parse error.
                 */
                errNcrInC1Range();
                /*
                 * Find the row with that number in the first column, and return
                 * a character token for the Unicode character given in the
                 * second column of that row.
                 */
                @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
                emitOrAppendOne(val, returnState);
                // [NOCPP[
            } else if (value == 0xC
                    && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
                // Form feed under a non-ALLOW policy: substitute a space or
                // fail. Any other policy value emits nothing on this path.
                if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
                    emitOrAppendOne(Tokenizer.SPACE, returnState);
                } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
                    fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
                }
                // ]NOCPP]
            } else if (value == 0x0) {
                errNcrZero();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else if ((value & 0xF800) == 0xD800) {
                // 0xD800 to 0xDFFF: a lone surrogate is replaced.
                errNcrSurrogate();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else {
                /*
                 * Otherwise, return a character token for the Unicode character
                 * whose code point is that number.
                 */
                char ch = (char) value;
                // [NOCPP[
                if (value == 0x0D) {
                    errNcrCr();
                } else if ((value <= 0x0008) || (value == 0x000B)
                        || (value >= 0x000E && value <= 0x001F)) {
                    ch = errNcrControlChar(ch);
                } else if (value >= 0xFDD0 && value <= 0xFDEF) {
                    errNcrUnassigned();
                } else if ((value & 0xFFFE) == 0xFFFE) {
                    ch = errNcrNonCharacter(ch);
                } else if (value >= 0x007F && value <= 0x009F) {
                    // 0x80 to 0x9F was consumed by the first branch above, so
                    // only 0x7F can reach this test.
                    errNcrControlChar();
                } else {
                    maybeWarnPrivateUse(ch);
                }
                // ]NOCPP]
                bmpChar[0] = ch;
                emitOrAppendOne(bmpChar, returnState);
            }
        } else if (value <= 0x10FFFF) {
            // [NOCPP[
            maybeWarnPrivateUseAstral();
            if ((value & 0xFFFE) == 0xFFFE) {
                errAstralNonCharacter(value);
            }
            // ]NOCPP]
            // Split into two chars: the low ten bits go on 0xDC00.
            // NOTE(review): LEAD_OFFSET is presumably the high-surrogate base
            // adjusted for the 0x10000 bias -- confirm at its definition.
            astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
            astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
            emitOrAppendTwo(astralChar, returnState);
        } else {
            errNcrOutOfRange();
            emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
        }
    }

    public void eof() throws SAXException {
        int state = stateSave;
        int returnState = returnStateSave;

        eofloop: for (;;) {
            switch (state) {
                case SCRIPT_DATA_LESS_THAN_SIGN:
                case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
                    /*
                     * Otherwise, emit a U+003C LESS-THAN SIGN character token
                     */
                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
                    /*
                     * and reconsume the current input character in the data
                     * state.
                     */
                    break eofloop;
                case TAG_OPEN:
                    /*
                     * The behavior of this state depends on the content model
                     * flag.
                     */
                    /*
                     * Anything else Parse error.
                     */
                    errEofAfterLt();
                    /*
                     * Emit a U+003C LESS-THAN SIGN character token
                     */
                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
                    /*
                     * and reconsume the current input character in the data
                     * state.
5929 */ 5930 break eofloop; 5931 case RAWTEXT_RCDATA_LESS_THAN_SIGN: 5932 /* 5933 * Emit a U+003C LESS-THAN SIGN character token 5934 */ 5935 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 5936 /* 5937 * and reconsume the current input character in the RCDATA 5938 * state. 5939 */ 5940 break eofloop; 5941 case NON_DATA_END_TAG_NAME: 5942 /* 5943 * Emit a U+003C LESS-THAN SIGN character token, a U+002F 5944 * SOLIDUS character token, 5945 */ 5946 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2); 5947 /* 5948 * a character token for each of the characters in the 5949 * temporary buffer (in the order they were added to the 5950 * buffer), 5951 */ 5952 emitStrBuf(); 5953 /* 5954 * and reconsume the current input character in the RCDATA 5955 * state. 5956 */ 5957 break eofloop; 5958 case CLOSE_TAG_OPEN: 5959 /* EOF Parse error. */ 5960 errEofAfterLt(); 5961 /* 5962 * Emit a U+003C LESS-THAN SIGN character token and a U+002F 5963 * SOLIDUS character token. 5964 */ 5965 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2); 5966 /* 5967 * Reconsume the EOF character in the data state. 5968 */ 5969 break eofloop; 5970 case TAG_NAME: 5971 /* 5972 * EOF Parse error. 5973 */ 5974 errEofInTagName(); 5975 /* 5976 * Reconsume the EOF character in the data state. 5977 */ 5978 break eofloop; 5979 case BEFORE_ATTRIBUTE_NAME: 5980 case AFTER_ATTRIBUTE_VALUE_QUOTED: 5981 case SELF_CLOSING_START_TAG: 5982 /* EOF Parse error. */ 5983 errEofWithoutGt(); 5984 /* 5985 * Reconsume the EOF character in the data state. 5986 */ 5987 break eofloop; 5988 case ATTRIBUTE_NAME: 5989 /* 5990 * EOF Parse error. 5991 */ 5992 errEofInAttributeName(); 5993 /* 5994 * Reconsume the EOF character in the data state. 5995 */ 5996 break eofloop; 5997 case AFTER_ATTRIBUTE_NAME: 5998 case BEFORE_ATTRIBUTE_VALUE: 5999 /* EOF Parse error. */ 6000 errEofWithoutGt(); 6001 /* 6002 * Reconsume the EOF character in the data state. 
6003 */ 6004 break eofloop; 6005 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 6006 case ATTRIBUTE_VALUE_SINGLE_QUOTED: 6007 case ATTRIBUTE_VALUE_UNQUOTED: 6008 /* EOF Parse error. */ 6009 errEofInAttributeValue(); 6010 /* 6011 * Reconsume the EOF character in the data state. 6012 */ 6013 break eofloop; 6014 case BOGUS_COMMENT: 6015 emitComment(0, 0); 6016 break eofloop; 6017 case BOGUS_COMMENT_HYPHEN: 6018 // [NOCPP[ 6019 maybeAppendSpaceToBogusComment(); 6020 // ]NOCPP] 6021 emitComment(0, 0); 6022 break eofloop; 6023 case MARKUP_DECLARATION_OPEN: 6024 errBogusComment(); 6025 clearLongStrBuf(); 6026 emitComment(0, 0); 6027 break eofloop; 6028 case MARKUP_DECLARATION_HYPHEN: 6029 errBogusComment(); 6030 emitComment(0, 0); 6031 break eofloop; 6032 case MARKUP_DECLARATION_OCTYPE: 6033 if (index < 6) { 6034 errBogusComment(); 6035 emitComment(0, 0); 6036 } else { 6037 /* EOF Parse error. */ 6038 errEofInDoctype(); 6039 /* 6040 * Create a new DOCTYPE token. Set its force-quirks flag 6041 * to on. 6042 */ 6043 doctypeName = ""; 6044 if (systemIdentifier != null) { 6045 Portability.releaseString(systemIdentifier); 6046 systemIdentifier = null; 6047 } 6048 if (publicIdentifier != null) { 6049 Portability.releaseString(publicIdentifier); 6050 publicIdentifier = null; 6051 } 6052 forceQuirks = true; 6053 /* 6054 * Emit the token. 6055 */ 6056 emitDoctypeToken(0); 6057 /* 6058 * Reconsume the EOF character in the data state. 6059 */ 6060 break eofloop; 6061 } 6062 break eofloop; 6063 case COMMENT_START: 6064 case COMMENT: 6065 /* 6066 * EOF Parse error. 6067 */ 6068 errEofInComment(); 6069 /* Emit the comment token. */ 6070 emitComment(0, 0); 6071 /* 6072 * Reconsume the EOF character in the data state. 6073 */ 6074 break eofloop; 6075 case COMMENT_END: 6076 errEofInComment(); 6077 /* Emit the comment token. */ 6078 emitComment(2, 0); 6079 /* 6080 * Reconsume the EOF character in the data state. 
6081 */ 6082 break eofloop; 6083 case COMMENT_END_DASH: 6084 case COMMENT_START_DASH: 6085 errEofInComment(); 6086 /* Emit the comment token. */ 6087 emitComment(1, 0); 6088 /* 6089 * Reconsume the EOF character in the data state. 6090 */ 6091 break eofloop; 6092 case COMMENT_END_BANG: 6093 errEofInComment(); 6094 /* Emit the comment token. */ 6095 emitComment(3, 0); 6096 /* 6097 * Reconsume the EOF character in the data state. 6098 */ 6099 break eofloop; 6100 case DOCTYPE: 6101 case BEFORE_DOCTYPE_NAME: 6102 errEofInDoctype(); 6103 /* 6104 * Create a new DOCTYPE token. Set its force-quirks flag to 6105 * on. 6106 */ 6107 forceQuirks = true; 6108 /* 6109 * Emit the token. 6110 */ 6111 emitDoctypeToken(0); 6112 /* 6113 * Reconsume the EOF character in the data state. 6114 */ 6115 break eofloop; 6116 case DOCTYPE_NAME: 6117 errEofInDoctype(); 6118 strBufToDoctypeName(); 6119 /* 6120 * Set the DOCTYPE token's force-quirks flag to on. 6121 */ 6122 forceQuirks = true; 6123 /* 6124 * Emit that DOCTYPE token. 6125 */ 6126 emitDoctypeToken(0); 6127 /* 6128 * Reconsume the EOF character in the data state. 6129 */ 6130 break eofloop; 6131 case DOCTYPE_UBLIC: 6132 case DOCTYPE_YSTEM: 6133 case AFTER_DOCTYPE_NAME: 6134 case AFTER_DOCTYPE_PUBLIC_KEYWORD: 6135 case AFTER_DOCTYPE_SYSTEM_KEYWORD: 6136 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: 6137 errEofInDoctype(); 6138 /* 6139 * Set the DOCTYPE token's force-quirks flag to on. 6140 */ 6141 forceQuirks = true; 6142 /* 6143 * Emit that DOCTYPE token. 6144 */ 6145 emitDoctypeToken(0); 6146 /* 6147 * Reconsume the EOF character in the data state. 6148 */ 6149 break eofloop; 6150 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: 6151 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: 6152 /* EOF Parse error. */ 6153 errEofInPublicId(); 6154 /* 6155 * Set the DOCTYPE token's force-quirks flag to on. 6156 */ 6157 forceQuirks = true; 6158 /* 6159 * Emit that DOCTYPE token. 
6160 */ 6161 publicIdentifier = longStrBufToString(); 6162 emitDoctypeToken(0); 6163 /* 6164 * Reconsume the EOF character in the data state. 6165 */ 6166 break eofloop; 6167 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: 6168 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: 6169 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: 6170 errEofInDoctype(); 6171 /* 6172 * Set the DOCTYPE token's force-quirks flag to on. 6173 */ 6174 forceQuirks = true; 6175 /* 6176 * Emit that DOCTYPE token. 6177 */ 6178 emitDoctypeToken(0); 6179 /* 6180 * Reconsume the EOF character in the data state. 6181 */ 6182 break eofloop; 6183 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: 6184 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: 6185 /* EOF Parse error. */ 6186 errEofInSystemId(); 6187 /* 6188 * Set the DOCTYPE token's force-quirks flag to on. 6189 */ 6190 forceQuirks = true; 6191 /* 6192 * Emit that DOCTYPE token. 6193 */ 6194 systemIdentifier = longStrBufToString(); 6195 emitDoctypeToken(0); 6196 /* 6197 * Reconsume the EOF character in the data state. 6198 */ 6199 break eofloop; 6200 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: 6201 errEofInDoctype(); 6202 /* 6203 * Set the DOCTYPE token's force-quirks flag to on. 6204 */ 6205 forceQuirks = true; 6206 /* 6207 * Emit that DOCTYPE token. 6208 */ 6209 emitDoctypeToken(0); 6210 /* 6211 * Reconsume the EOF character in the data state. 6212 */ 6213 break eofloop; 6214 case BOGUS_DOCTYPE: 6215 /* 6216 * Emit that DOCTYPE token. 6217 */ 6218 emitDoctypeToken(0); 6219 /* 6220 * Reconsume the EOF character in the data state. 6221 */ 6222 break eofloop; 6223 case CONSUME_CHARACTER_REFERENCE: 6224 /* 6225 * Unlike the definition is the spec, this state does not 6226 * return a value and never requires the caller to 6227 * backtrack. This state takes care of emitting characters 6228 * or appending to the current attribute value. It also 6229 * takes care of that in the case when consuming the entity 6230 * fails. 
6231 */ 6232 /* 6233 * This section defines how to consume an entity. This 6234 * definition is used when parsing entities in text and in 6235 * attributes. 6236 * 6237 * The behavior depends on the identity of the next 6238 * character (the one immediately after the U+0026 AMPERSAND 6239 * character): 6240 */ 6241 6242 emitOrAppendStrBuf(returnState); 6243 state = returnState; 6244 continue; 6245 case CHARACTER_REFERENCE_HILO_LOOKUP: 6246 errNoNamedCharacterMatch(); 6247 emitOrAppendStrBuf(returnState); 6248 state = returnState; 6249 continue; 6250 case CHARACTER_REFERENCE_TAIL: 6251 outer: for (;;) { 6252 char c = '\u0000'; 6253 entCol++; 6254 /* 6255 * Consume the maximum number of characters possible, 6256 * with the consumed characters matching one of the 6257 * identifiers in the first column of the named 6258 * character references table (in a case-sensitive 6259 * manner). 6260 */ 6261 hiloop: for (;;) { 6262 if (hi == -1) { 6263 break hiloop; 6264 } 6265 if (entCol == NamedCharacters.NAMES[hi].length()) { 6266 break hiloop; 6267 } 6268 if (entCol > NamedCharacters.NAMES[hi].length()) { 6269 break outer; 6270 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { 6271 hi--; 6272 } else { 6273 break hiloop; 6274 } 6275 } 6276 6277 loloop: for (;;) { 6278 if (hi < lo) { 6279 break outer; 6280 } 6281 if (entCol == NamedCharacters.NAMES[lo].length()) { 6282 candidate = lo; 6283 strBufMark = strBufLen; 6284 lo++; 6285 } else if (entCol > NamedCharacters.NAMES[lo].length()) { 6286 break outer; 6287 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { 6288 lo++; 6289 } else { 6290 break loloop; 6291 } 6292 } 6293 if (hi < lo) { 6294 break outer; 6295 } 6296 continue; 6297 } 6298 6299 if (candidate == -1) { 6300 /* 6301 * If no match can be made, then this is a parse error. 
6302 */ 6303 errNoNamedCharacterMatch(); 6304 emitOrAppendStrBuf(returnState); 6305 state = returnState; 6306 continue eofloop; 6307 } else { 6308 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; 6309 if (candidateName.length() == 0 6310 || candidateName.charAt(candidateName.length() - 1) != ';') { 6311 /* 6312 * If the last character matched is not a U+003B 6313 * SEMICOLON (;), there is a parse error. 6314 */ 6315 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 6316 /* 6317 * If the entity is being consumed as part of an 6318 * attribute, and the last character matched is 6319 * not a U+003B SEMICOLON (;), 6320 */ 6321 char ch; 6322 if (strBufMark == strBufLen) { 6323 ch = '\u0000'; 6324 } else { 6325 ch = strBuf[strBufMark]; 6326 } 6327 if ((ch >= '0' && ch <= '9') 6328 || (ch >= 'A' && ch <= 'Z') 6329 || (ch >= 'a' && ch <= 'z')) { 6330 /* 6331 * and the next character is in the range 6332 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, 6333 * U+0041 LATIN CAPITAL LETTER A to U+005A 6334 * LATIN CAPITAL LETTER Z, or U+0061 LATIN 6335 * SMALL LETTER A to U+007A LATIN SMALL 6336 * LETTER Z, then, for historical reasons, 6337 * all the characters that were matched 6338 * after the U+0026 AMPERSAND (&) must be 6339 * unconsumed, and nothing is returned. 6340 */ 6341 errNoNamedCharacterMatch(); 6342 appendStrBufToLongStrBuf(); 6343 state = returnState; 6344 continue eofloop; 6345 } 6346 } 6347 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 6348 errUnescapedAmpersandInterpretedAsCharacterReference(); 6349 } else { 6350 errNotSemicolonTerminated(); 6351 } 6352 } 6353 6354 /* 6355 * Otherwise, return a character token for the character 6356 * corresponding to the entity name (as given by the 6357 * second column of the named character references 6358 * table). 
6359 */ 6360 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; 6361 if ( 6362 // [NOCPP[ 6363 val.length == 1 6364 // ]NOCPP] 6365 // CPPONLY: val[1] == 0 6366 ) { 6367 emitOrAppendOne(val, returnState); 6368 } else { 6369 emitOrAppendTwo(val, returnState); 6370 } 6371 // this is so complicated! 6372 if (strBufMark < strBufLen) { 6373 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 6374 for (int i = strBufMark; i < strBufLen; i++) { 6375 appendLongStrBuf(strBuf[i]); 6376 } 6377 } else { 6378 tokenHandler.characters(strBuf, strBufMark, 6379 strBufLen - strBufMark); 6380 } 6381 } 6382 state = returnState; 6383 continue eofloop; 6384 /* 6385 * If the markup contains I'm ¬it; I tell you, the 6386 * entity is parsed as "not", as in, I'm ¬it; I tell 6387 * you. But if the markup was I'm ∉ I tell you, 6388 * the entity would be parsed as "notin;", resulting in 6389 * I'm ∉ I tell you. 6390 */ 6391 } 6392 case CONSUME_NCR: 6393 case DECIMAL_NRC_LOOP: 6394 case HEX_NCR_LOOP: 6395 /* 6396 * If no characters match the range, then don't consume any 6397 * characters (and unconsume the U+0023 NUMBER SIGN 6398 * character and, if appropriate, the X character). This is 6399 * a parse error; nothing is returned. 6400 * 6401 * Otherwise, if the next character is a U+003B SEMICOLON, 6402 * consume that too. If it isn't, there is a parse error. 
6403 */ 6404 if (!seenDigits) { 6405 errNoDigitsInNCR(); 6406 emitOrAppendStrBuf(returnState); 6407 state = returnState; 6408 continue; 6409 } else { 6410 errCharRefLacksSemicolon(); 6411 } 6412 // WARNING previous state sets reconsume 6413 handleNcrValue(returnState); 6414 state = returnState; 6415 continue; 6416 case CDATA_RSQB: 6417 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); 6418 break eofloop; 6419 case CDATA_RSQB_RSQB: 6420 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); 6421 break eofloop; 6422 case DATA: 6423 default: 6424 break eofloop; 6425 } 6426 } 6427 // case DATA: 6428 /* 6429 * EOF Emit an end-of-file token. 6430 */ 6431 tokenHandler.eof(); 6432 return; 6433 } 6434 6435 private void emitDoctypeToken(int pos) throws SAXException { 6436 cstart = pos + 1; 6437 tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier, 6438 forceQuirks); 6439 // It is OK and sufficient to release these here, since 6440 // there's no way out of the doctype states than through paths 6441 // that call this method. 6442 doctypeName = null; 6443 Portability.releaseString(publicIdentifier); 6444 publicIdentifier = null; 6445 Portability.releaseString(systemIdentifier); 6446 systemIdentifier = null; 6447 } 6448 6449 @Inline protected char checkChar(@NoLength char[] buf, int pos) 6450 throws SAXException { 6451 return buf[pos]; 6452 } 6453 6454 // [NOCPP[ 6455 6456 /** 6457 * Returns the alreadyComplainedAboutNonAscii. 
*
     * @return the alreadyComplainedAboutNonAscii; this implementation always
     *         returns <code>true</code>
     */
    public boolean isAlreadyComplainedAboutNonAscii() {
        return true;
    }

    // ]NOCPP]

    /**
     * Forwards an internal encoding declaration to the
     * <code>encodingDeclarationHandler</code>, if one has been set.
     *
     * @param internalCharset
     *            the declared charset name, passed through unchanged
     * @return the handler's return value, or <code>false</code> when no
     *         handler is set
     * @throws SAXException
     *             if the handler throws
     */
    public boolean internalEncodingDeclaration(String internalCharset)
            throws SAXException {
        if (encodingDeclarationHandler != null) {
            return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
        }
        return false;
    }

    /**
     * Emits or appends the first two <code>char</code>s of <code>val</code>.
     *
     * When <code>returnState</code> has any bit of
     * <code>DATA_AND_RCDATA_MASK</code> set (the same test that
     * <code>eof()</code> pairs with "consumed as part of an attribute"), the
     * two characters are appended to the long string buffer; otherwise they
     * are reported to the token handler as character data.
     *
     * @param val
     *            the characters; <code>val[0]</code> and <code>val[1]</code>
     *            are read
     * @param returnState
     *            the state the character reference returns to
     * @throws SAXException
     *             if the token handler throws
     */
    private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
            throws SAXException {
        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
            appendLongStrBuf(val[0]);
            appendLongStrBuf(val[1]);
        } else {
            tokenHandler.characters(val, 0, 2);
        }
    }

    /**
     * Emits or appends the first <code>char</code> of <code>val</code>, using
     * the same <code>DATA_AND_RCDATA_MASK</code> test on
     * <code>returnState</code> as <code>emitOrAppendTwo</code>: append to the
     * long string buffer when the masked value is non-zero, otherwise report
     * the character to the token handler.
     *
     * @param val
     *            the characters; only <code>val[0]</code> is read
     * @param returnState
     *            the state the character reference returns to
     * @throws SAXException
     *             if the token handler throws
     */
    private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
            throws SAXException {
        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
            appendLongStrBuf(val[0]);
        } else {
            tokenHandler.characters(val, 0, 1);
        }
    }

    /**
     * Ends tokenization: drops the buffers, releases the doctype strings, the
     * tag name and the attribute name, calls
     * <code>tokenHandler.endTokenization()</code> and finally clears and
     * deletes the attributes holder.
     *
     * @throws SAXException
     *             if the token handler throws
     */
    public void end() throws SAXException {
        strBuf = null;
        longStrBuf = null;
        doctypeName = null;
        if (systemIdentifier != null) {
            Portability.releaseString(systemIdentifier);
            systemIdentifier = null;
        }
        if (publicIdentifier != null) {
            Portability.releaseString(publicIdentifier);
            publicIdentifier = null;
        }
        if (tagName != null) {
            tagName.release();
            tagName = null;
        }
        if (attributeName != null) {
            attributeName.release();
            attributeName = null;
        }
        // The token handler is notified before the attributes holder is
        // cleared and deleted.
        tokenHandler.endTokenization();
        if (attributes != null) {
            attributes.clear(mappingLangToXmlLang);
            Portability.delete(attributes);
            attributes = null;
        }
    }

    /**
     * Sets the <code>shouldSuspend</code> flag.
     */
    public void requestSuspension() {
        shouldSuspend = true;
    }

    // [NOCPP[

    /**
     * Sets the <code>confident</code> flag.
     */
    public void becomeConfident() {
        confident = true;
    }

    /**
     * Returns the
* nextCharOnNewLine.
     *
     * @return the nextCharOnNewLine; this implementation always returns
     *         <code>false</code>
     */
    public boolean isNextCharOnNewLine() {
        return false;
    }

    /**
     * Returns the <code>lastCR</code> flag.
     *
     * @return the current value of <code>lastCR</code>
     */
    public boolean isPrevCR() {
        return lastCR;
    }

    /**
     * Returns the line.
     *
     * @return the line; this implementation always returns <code>-1</code>
     */
    public int getLine() {
        return -1;
    }

    /**
     * Returns the col.
     *
     * @return the col; this implementation always returns <code>-1</code>
     */
    public int getCol() {
        return -1;
    }

    // ]NOCPP]

    /**
     * Tells whether the saved tokenizer state is the data state.
     *
     * @return <code>true</code> if <code>stateSave</code> equals
     *         <code>DATA</code>
     */
    public boolean isInDataState() {
        return (stateSave == DATA);
    }

    /**
     * Resets the tokenizer to the data state: empties both string buffers,
     * resets the character reference and tag bookkeeping fields to their
     * initial values, reinitializes the doctype fields and releases the
     * current tag name and attribute name. The attributes holder is deleted
     * as well; in the Java version only when
     * <code>newAttributesEachTime</code> is set.
     */
    public void resetToDataState() {
        strBufLen = 0;
        longStrBufLen = 0;
        stateSave = Tokenizer.DATA;
        // line = 1; XXX line numbers
        lastCR = false;
        index = 0;
        forceQuirks = false;
        additional = '\u0000';
        entCol = -1;
        firstCharKey = -1;
        lo = 0;
        hi = 0; // will always be overwritten before use anyway
        candidate = -1;
        strBufMark = 0;
        prevValue = -1;
        value = 0;
        seenDigits = false;
        endTag = false;
        shouldSuspend = false;
        initDoctypeFields();
        if (tagName != null) {
            tagName.release();
            tagName = null;
        }
        if (attributeName != null) {
            attributeName.release();
            attributeName = null;
        }
        // The NOCPP markers below wrap only the newAttributesEachTime test,
        // so the inner block is unconditional outside the Java version.
        // [NOCPP[
        if (newAttributesEachTime) {
            // ]NOCPP]
            if (attributes != null) {
                Portability.delete(attributes);
                attributes = null;
            }
            // [NOCPP[
        }
        // ]NOCPP]
    }

    /**
     * Copies the tokenization state of another tokenizer into this one.
     *
     * The used parts of both string buffers are copied (growing this
     * tokenizer's buffers when they are too small), the scalar state fields
     * are assigned from <code>other</code>, and the doctype strings, tag name,
     * attribute name and attributes are replaced by copies of the ones held
     * by <code>other</code>, after releasing the ones held here.
     * <code>shouldSuspend</code> is not copied; it is set to
     * <code>false</code>.
     *
     * @param other
     *            the tokenizer to copy from
     * @throws SAXException
     *             if cloning the tag name, attribute name or attributes
     *             throws
     */
    public void loadState(Tokenizer other) throws SAXException {
        strBufLen = other.strBufLen;
        if (strBufLen > strBuf.length) {
            strBuf = new char[strBufLen];
        }
        System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);

        longStrBufLen = other.longStrBufLen;
        if (longStrBufLen > longStrBuf.length) {
            longStrBuf = new char[longStrBufLen];
        }
        System.arraycopy(other.longStrBuf, 0, longStrBuf, 0, longStrBufLen);

        stateSave = other.stateSave;
        returnStateSave = other.returnStateSave;
        endTagExpectation = other.endTagExpectation;
        endTagExpectationAsArray = other.endTagExpectationAsArray;
        // line = 1; XXX line numbers
        lastCR = other.lastCR;
        index = other.index;
        forceQuirks = other.forceQuirks;
        additional = other.additional;
        entCol = other.entCol;
        firstCharKey = other.firstCharKey;
        lo = other.lo;
        hi = other.hi;
        candidate = other.candidate;
        strBufMark = other.strBufMark;
        prevValue = other.prevValue;
        value = other.value;
        seenDigits = other.seenDigits;
        endTag = other.endTag;
        shouldSuspend = false;

        if (other.doctypeName == null) {
            doctypeName = null;
        } else {
            doctypeName = Portability.newLocalFromLocal(other.doctypeName,
                    interner);
        }

        // For each of the following fields the old value is released first
        // and then replaced by a copy of the other tokenizer's value.
        Portability.releaseString(systemIdentifier);
        if (other.systemIdentifier == null) {
            systemIdentifier = null;
        } else {
            systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
        }

        Portability.releaseString(publicIdentifier);
        if (other.publicIdentifier == null) {
            publicIdentifier = null;
        } else {
            publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
        }

        if (tagName != null) {
            tagName.release();
        }
        if (other.tagName == null) {
            tagName = null;
        } else {
            tagName = other.tagName.cloneElementName(interner);
        }

        if (attributeName != null) {
            attributeName.release();
        }
        if (other.attributeName == null) {
            attributeName = null;
        } else {
            attributeName = other.attributeName.cloneAttributeName(interner);
        }

        if (attributes != null) {
            Portability.delete(attributes);
        }
        if (other.attributes == null) {
            attributes = null;
        } else {
            attributes = other.attributes.cloneAttributes(interner);
        }
    }

    /**
     * Prepares the tokenizer for use: allocates the string buffer (64 chars)
     * and the long string buffer (1024 chars), sets <code>line</code> to 1 and
     * <code>confident</code> to <code>false</code>, and calls
     * <code>resetToDataState()</code>. In the Java version it also resets
     * <code>html4</code> and <code>metaBoundaryPassed</code>, asks the token
     * handler whether it wants comments, and creates the attributes holder
     * up front unless <code>newAttributesEachTime</code> is set.
     *
     * @throws SAXException
     *             declared for callers; NOTE(review): whether
     *             <code>tokenHandler.wantsComments()</code> can throw is not
     *             visible here -- confirm
     */
    public void initializeWithoutStarting() throws SAXException {
        confident = false;
        strBuf = new char[64];
        longStrBuf = new char[1024];
        line = 1;
        // [NOCPP[
        html4 = false;
        metaBoundaryPassed = false;
        wantsComments = tokenHandler.wantsComments();
        if (!newAttributesEachTime) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }
        // ]NOCPP]
        resetToDataState();
    }

    // The err*, maybe* and note* methods from here on have no effect in this
    // class: their bodies are empty, and the two that return a char return
    // their argument unchanged. The tokenizer calls them at the corresponding
    // parse error, warning or note conditions.
    // NOTE(review): presumably subclasses override them to do the actual
    // reporting -- confirm.

    protected void errGarbageAfterLtSlash() throws SAXException {
    }

    protected void errLtSlashGt() throws SAXException {
    }

    protected void errWarnLtSlashInRcdata() throws SAXException {
    }

    protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
    }

    protected void errCharRefLacksSemicolon() throws SAXException {
    }

    protected void errNoDigitsInNCR() throws SAXException {
    }

    protected void errGtInSystemId() throws SAXException {
    }

    protected void errGtInPublicId() throws SAXException {
    }

    protected void errNamelessDoctype() throws SAXException {
    }

    protected void errConsecutiveHyphens() throws SAXException {
    }

    protected void errPrematureEndOfComment() throws SAXException {
    }

    protected void errBogusComment() throws SAXException {
    }

    protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
    }

    protected void errSlashNotFollowedByGt() throws SAXException {
    }

    protected void errHtml4XmlVoidSyntax() throws SAXException {
    }

    protected void errNoSpaceBetweenAttributes() throws SAXException {
    }

    protected void errHtml4NonNameInUnquotedAttribute(char c)
            throws SAXException {
    }

    protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
            throws SAXException {
    }

    protected void errAttributeValueMissing() throws SAXException {
    }

    protected void errBadCharBeforeAttributeNameOrNull(char c)
            throws SAXException {
    }

    protected void errEqualsSignBeforeAttributeName() throws SAXException {
    }

    protected void errBadCharAfterLt(char c) throws SAXException {
    }

    protected void errLtGt() throws SAXException {
    }

    protected void errProcessingInstruction() throws SAXException {
    }

    protected void errUnescapedAmpersandInterpretedAsCharacterReference()
            throws SAXException {
    }

    protected void errNotSemicolonTerminated() throws SAXException {
    }

    protected void errNoNamedCharacterMatch() throws SAXException {
    }

    protected void errQuoteBeforeAttributeName(char c) throws SAXException {
    }

    protected void errQuoteOrLtInAttributeNameOrNull(char c)
            throws SAXException {
    }

    protected void errExpectedPublicId() throws SAXException {
    }

    protected void errBogusDoctype() throws SAXException {
    }

    protected void maybeWarnPrivateUseAstral() throws SAXException {
    }

    protected void maybeWarnPrivateUse(char ch) throws SAXException {
    }

    protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
            throws SAXException {
    }

    protected void maybeErrSlashInEndTag(boolean selfClosing)
            throws SAXException {
    }

    // Returns ch unchanged; the caller assigns the result back to the
    // character it is about to emit.
    protected char errNcrNonCharacter(char ch) throws SAXException {
        return ch;
    }

    protected void errAstralNonCharacter(int ch) throws SAXException {
    }

    protected void errNcrSurrogate() throws SAXException {
    }

    // Returns ch unchanged; the caller assigns the result back to the
    // character it is about to emit.
    protected char errNcrControlChar(char ch) throws SAXException {
        return ch;
    }

    protected void errNcrCr() throws SAXException {
    }

    protected void errNcrInC1Range() throws SAXException {
    }

    protected void errEofInPublicId() throws SAXException {
    }

    protected void errEofInComment() throws SAXException {
    }

    protected void errEofInDoctype() throws SAXException {
    }

    protected void errEofInAttributeValue() throws SAXException {
    }

    protected void errEofInAttributeName() throws SAXException {
    }

    protected void errEofWithoutGt() throws SAXException {
    }

    protected void errEofInTagName() throws SAXException {
    }

    protected void errEofInEndTag() throws SAXException {
    }

    protected void errEofAfterLt() throws SAXException {
    }

    protected void errNcrOutOfRange() throws SAXException {
    }

    protected void errNcrUnassigned() throws SAXException {
    }

    protected void errDuplicateAttribute() throws SAXException {
    }

    protected void errEofInSystemId() throws SAXException {
    }

    protected void errExpectedSystemId() throws SAXException {
    }

    protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
    }

    protected void errHyphenHyphenBang() throws SAXException {
    }

    protected void errNcrControlChar() throws SAXException {
    }

    protected void errNcrZero() throws SAXException {
    }

    protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
            throws SAXException {
    }

    protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
    }

    protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
            throws SAXException {
    }

    protected void noteAttributeWithoutValue() throws SAXException {
    }

    protected void noteUnquotedAttributeValue() throws SAXException {
    }

    /**
     * Sets the encodingDeclarationHandler.
*
     * @param encodingDeclarationHandler
     *            the encodingDeclarationHandler to set
     */
    public void setEncodingDeclarationHandler(
            EncodingDeclarationHandler encodingDeclarationHandler) {
        this.encodingDeclarationHandler = encodingDeclarationHandler;
    }

    /**
     * Intentionally empty in the Java source; the body is a placeholder for
     * code written by the translator.
     */
    void destructor() {
        // The translator will write refcount tracing stuff here
    }

    // [NOCPP[

    /**
     * Sets an offset to be added to the position reported to
     * <code>TransitionHandler</code>.
     *
     * This implementation has an empty body and ignores the offset.
     * NOTE(review): presumably a subclass that reports transitions overrides
     * it -- confirm.
     *
     * @param offset the offset
     */
    public void setTransitionBaseOffset(int offset) {

    }

    // ]NOCPP]

}