001 /*
002 * Copyright (c) 2005-2007 Henri Sivonen
003 * Copyright (c) 2007-2010 Mozilla Foundation
004 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
005 * Foundation, and Opera Software ASA.
006 *
007 * Permission is hereby granted, free of charge, to any person obtaining a
008 * copy of this software and associated documentation files (the "Software"),
009 * to deal in the Software without restriction, including without limitation
010 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
011 * and/or sell copies of the Software, and to permit persons to whom the
012 * Software is furnished to do so, subject to the following conditions:
013 *
014 * The above copyright notice and this permission notice shall be included in
015 * all copies or substantial portions of the Software.
016 *
017 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
018 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
019 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
020 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
021 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
022 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
023 * DEALINGS IN THE SOFTWARE.
024 */
025
026 /*
027 * The comments following this one that use the same comment syntax as this
028 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
029 * amended as of June 18 2008 and May 31 2010.
030 * That document came with this statement:
031 * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
032 * Opera Software ASA. You are granted a license to use, reproduce and
033 * create derivative works of this document."
034 */
035
036 package nu.validator.htmlparser.impl;
037
038 import nu.validator.htmlparser.annotation.Auto;
039 import nu.validator.htmlparser.annotation.CharacterName;
040 import nu.validator.htmlparser.annotation.Const;
041 import nu.validator.htmlparser.annotation.Inline;
042 import nu.validator.htmlparser.annotation.Local;
043 import nu.validator.htmlparser.annotation.NoLength;
044 import nu.validator.htmlparser.common.EncodingDeclarationHandler;
045 import nu.validator.htmlparser.common.Interner;
046 import nu.validator.htmlparser.common.TokenHandler;
047 import nu.validator.htmlparser.common.XmlViolationPolicy;
048
049 import org.xml.sax.ErrorHandler;
050 import org.xml.sax.Locator;
051 import org.xml.sax.SAXException;
052 import org.xml.sax.SAXParseException;
053
054 /**
055 * An implementation of
056 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
057 *
058 * This class implements the <code>Locator</code> interface. This is not an
059 * incidental implementation detail: Users of this class are encouraged to make
060 * use of the <code>Locator</code> nature.
061 *
062 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
063 * can be configured to treat these conditions as fatal or to coerce the infoset
064 * to something that XML 1.0 allows.
065 *
066 * @version $Id$
067 * @author hsivonen
068 */
069 public class Tokenizer implements Locator {
070
071 private static final int DATA_AND_RCDATA_MASK = ~1;
072
073 public static final int DATA = 0;
074
075 public static final int RCDATA = 1;
076
077 public static final int SCRIPT_DATA = 2;
078
079 public static final int RAWTEXT = 3;
080
081 public static final int SCRIPT_DATA_ESCAPED = 4;
082
083 public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
084
085 public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
086
087 public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
088
089 public static final int PLAINTEXT = 8;
090
091 public static final int TAG_OPEN = 9;
092
093 public static final int CLOSE_TAG_OPEN = 10;
094
095 public static final int TAG_NAME = 11;
096
097 public static final int BEFORE_ATTRIBUTE_NAME = 12;
098
099 public static final int ATTRIBUTE_NAME = 13;
100
101 public static final int AFTER_ATTRIBUTE_NAME = 14;
102
103 public static final int BEFORE_ATTRIBUTE_VALUE = 15;
104
105 public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
106
107 public static final int BOGUS_COMMENT = 17;
108
109 public static final int MARKUP_DECLARATION_OPEN = 18;
110
111 public static final int DOCTYPE = 19;
112
113 public static final int BEFORE_DOCTYPE_NAME = 20;
114
115 public static final int DOCTYPE_NAME = 21;
116
117 public static final int AFTER_DOCTYPE_NAME = 22;
118
119 public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
120
121 public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
122
123 public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
124
125 public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
126
127 public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
128
129 public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
130
131 public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
132
133 public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
134
135 public static final int BOGUS_DOCTYPE = 31;
136
137 public static final int COMMENT_START = 32;
138
139 public static final int COMMENT_START_DASH = 33;
140
141 public static final int COMMENT = 34;
142
143 public static final int COMMENT_END_DASH = 35;
144
145 public static final int COMMENT_END = 36;
146
147 public static final int COMMENT_END_BANG = 37;
148
149 public static final int NON_DATA_END_TAG_NAME = 38;
150
151 public static final int MARKUP_DECLARATION_HYPHEN = 39;
152
153 public static final int MARKUP_DECLARATION_OCTYPE = 40;
154
155 public static final int DOCTYPE_UBLIC = 41;
156
157 public static final int DOCTYPE_YSTEM = 42;
158
159 public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
160
161 public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
162
163 public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
164
165 public static final int CONSUME_CHARACTER_REFERENCE = 46;
166
167 public static final int CONSUME_NCR = 47;
168
169 public static final int CHARACTER_REFERENCE_TAIL = 48;
170
171 public static final int HEX_NCR_LOOP = 49;
172
173 public static final int DECIMAL_NRC_LOOP = 50;
174
175 public static final int HANDLE_NCR_VALUE = 51;
176
177 public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
178
179 public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
180
181 public static final int SELF_CLOSING_START_TAG = 54;
182
183 public static final int CDATA_START = 55;
184
185 public static final int CDATA_SECTION = 56;
186
187 public static final int CDATA_RSQB = 57;
188
189 public static final int CDATA_RSQB_RSQB = 58;
190
191 public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
192
193 public static final int SCRIPT_DATA_ESCAPE_START = 60;
194
195 public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
196
197 public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
198
199 public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
200
201 public static final int BOGUS_COMMENT_HYPHEN = 64;
202
203 public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
204
205 public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
206
207 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
208
209 public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
210
211 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
212
213 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
214
215 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
216
217 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
218
219 /**
220 * Magic value for UTF-16 operations.
221 */
222 private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
223
224 /**
225 * UTF-16 code unit array containing less than and greater than for emitting
226 * those characters on certain parse errors.
227 */
228 private static final @NoLength char[] LT_GT = { '<', '>' };
229
230 /**
231 * UTF-16 code unit array containing less than and solidus for emitting
232 * those characters on certain parse errors.
233 */
234 private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
235
236 /**
237 * UTF-16 code unit array containing ]] for emitting those characters on
238 * state transitions.
239 */
240 private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
241
242 /**
243 * Array version of U+FFFD.
244 */
245 private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
246
247 // [NOCPP[
248
249 /**
250 * Array version of space.
251 */
252 private static final @NoLength char[] SPACE = { ' ' };
253
254 // ]NOCPP]
255
256 /**
257 * Array version of line feed.
258 */
259 private static final @NoLength char[] LF = { '\n' };
260
261 /**
262 * Buffer growth parameter.
263 */
264 private static final int BUFFER_GROW_BY = 1024;
265
266 /**
267 * "CDATA[" as <code>char[]</code>
268 */
269 private static final @NoLength char[] CDATA_LSQB = "CDATA[".toCharArray();
270
271 /**
272 * "octype" as <code>char[]</code>
273 */
274 private static final @NoLength char[] OCTYPE = "octype".toCharArray();
275
276 /**
277 * "ublic" as <code>char[]</code>
278 */
279 private static final @NoLength char[] UBLIC = "ublic".toCharArray();
280
281 /**
282 * "ystem" as <code>char[]</code>
283 */
284 private static final @NoLength char[] YSTEM = "ystem".toCharArray();
285
286 private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
287
288 private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
289
290 private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
291
292 private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
293 'e', 'x', 't' };
294
295 private static final char[] XMP_ARR = { 'x', 'm', 'p' };
296
297 private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
298 'e', 'a' };
299
300 private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
301
302 private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
303 'd' };
304
305 private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
306 'p', 't' };
307
308 private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
309 'e', 's' };
310
311 /**
312 * The token handler.
313 */
314 protected final TokenHandler tokenHandler;
315
316 protected EncodingDeclarationHandler encodingDeclarationHandler;
317
318 // [NOCPP[
319
320 /**
321 * The error handler.
322 */
323 protected ErrorHandler errorHandler;
324
325 // ]NOCPP]
326
327 /**
328 * Whether the previous char read was CR.
329 */
330 protected boolean lastCR;
331
332 protected int stateSave;
333
334 private int returnStateSave;
335
336 protected int index;
337
338 private boolean forceQuirks;
339
340 private char additional;
341
342 private int entCol;
343
344 private int firstCharKey;
345
346 private int lo;
347
348 private int hi;
349
350 private int candidate;
351
352 private int strBufMark;
353
354 private int prevValue;
355
356 protected int value;
357
358 private boolean seenDigits;
359
360 protected int cstart;
361
362 /**
363 * The SAX public id for the resource being tokenized. (Only passed to back
364 * as part of locator data.)
365 */
366 private String publicId;
367
368 /**
369 * The SAX system id for the resource being tokenized. (Only passed to back
370 * as part of locator data.)
371 */
372 private String systemId;
373
374 /**
375 * Buffer for short identifiers.
376 */
377 private @Auto char[] strBuf;
378
379 /**
380 * Number of significant <code>char</code>s in <code>strBuf</code>.
381 */
382 private int strBufLen;
383
384 /**
385 * <code>-1</code> to indicate that <code>strBuf</code> is used or otherwise
386 * an offset to the main buffer.
387 */
388 // private int strBufOffset = -1;
389 /**
390 * Buffer for long strings.
391 */
392 private @Auto char[] longStrBuf;
393
394 /**
395 * Number of significant <code>char</code>s in <code>longStrBuf</code>.
396 */
397 private int longStrBufLen;
398
399 /**
400 * <code>-1</code> to indicate that <code>longStrBuf</code> is used or
401 * otherwise an offset to the main buffer.
402 */
403 // private int longStrBufOffset = -1;
404
405 /**
406 * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
407 */
408 private final @Auto char[] bmpChar;
409
410 /**
411 * Buffer for expanding astral NCRs.
412 */
413 private final @Auto char[] astralChar;
414
415 /**
416 * The element whose end tag closes the current CDATA or RCDATA element.
417 */
418 protected ElementName endTagExpectation = null;
419
420 private char[] endTagExpectationAsArray; // not @Auto!
421
422 /**
423 * <code>true</code> if tokenizing an end tag
424 */
425 protected boolean endTag;
426
427 /**
428 * The current tag token name.
429 */
430 private ElementName tagName = null;
431
432 /**
433 * The current attribute name.
434 */
435 protected AttributeName attributeName = null;
436
437 // [NOCPP[
438
439 /**
440 * Whether comment tokens are emitted.
441 */
442 private boolean wantsComments = false;
443
444 /**
445 * <code>true</code> when HTML4-specific additional errors are requested.
446 */
447 protected boolean html4;
448
449 /**
450 * Whether the stream is past the first 512 bytes.
451 */
452 private boolean metaBoundaryPassed;
453
454 // ]NOCPP]
455
456 /**
457 * The name of the current doctype token.
458 */
459 private @Local String doctypeName;
460
461 /**
462 * The public id of the current doctype token.
463 */
464 private String publicIdentifier;
465
466 /**
467 * The system id of the current doctype token.
468 */
469 private String systemIdentifier;
470
471 /**
472 * The attribute holder.
473 */
474 private HtmlAttributes attributes;
475
476 // [NOCPP[
477
478 /**
479 * The policy for vertical tab and form feed.
480 */
481 private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
482
483 /**
484 * The policy for comments.
485 */
486 private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
487
488 private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
489
490 private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
491
492 private boolean html4ModeCompatibleWithXhtml1Schemata;
493
494 private final boolean newAttributesEachTime;
495
496 // ]NOCPP]
497
498 private int mappingLangToXmlLang;
499
500 private boolean shouldSuspend;
501
502 protected boolean confident;
503
504 private int line;
505
506 private Interner interner;
507
508 // [NOCPP[
509
510 protected LocatorImpl ampersandLocation;
511
/**
 * Creates a tokenizer and records how attribute holders are to be managed.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 * @param newAttributesEachTime
 *            stored unchanged; <code>emptyAttributes()</code> and
 *            <code>resetAttributes()</code> branch on it
 */
public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
    // Collaborators and configuration.
    this.tokenHandler = tokenHandler;
    this.newAttributesEachTime = newAttributesEachTime;
    this.encodingDeclarationHandler = null;

    // Fixed-size scratch arrays: one code unit and two code units.
    this.bmpChar = new char[1];
    this.astralChar = new char[2];

    // No token is under construction yet.
    this.attributes = null;
    this.attributeName = null;
    this.tagName = null;
    this.doctypeName = null;
    this.publicIdentifier = null;
    this.systemIdentifier = null;
}
525
526 // ]NOCPP]
527
/**
 * Creates a tokenizer with <code>newAttributesEachTime</code> set to
 * <code>false</code>.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 */
public Tokenizer(TokenHandler tokenHandler) {
    // Collaborators and configuration.
    this.tokenHandler = tokenHandler;
    this.encodingDeclarationHandler = null;
    // [NOCPP[
    this.newAttributesEachTime = false;
    // ]NOCPP]

    // Fixed-size scratch arrays: one code unit and two code units.
    this.bmpChar = new char[1];
    this.astralChar = new char[2];

    // No token is under construction yet.
    this.tagName = null;
    this.attributeName = null;
    this.doctypeName = null;
    this.publicIdentifier = null;
    this.systemIdentifier = null;
    this.attributes = null;
}
549
550 public void setInterner(Interner interner) {
551 this.interner = interner;
552 }
553
554 public void initLocation(String newPublicId, String newSystemId) {
555 this.systemId = newSystemId;
556 this.publicId = newPublicId;
557
558 }
559
560 // [NOCPP[
561
562 /**
563 * Returns the mappingLangToXmlLang.
564 *
565 * @return the mappingLangToXmlLang
566 */
567 public boolean isMappingLangToXmlLang() {
568 return mappingLangToXmlLang == AttributeName.HTML_LANG;
569 }
570
571 /**
572 * Sets the mappingLangToXmlLang.
573 *
574 * @param mappingLangToXmlLang
575 * the mappingLangToXmlLang to set
576 */
577 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
578 this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
579 : AttributeName.HTML;
580 }
581
582 /**
583 * Sets the error handler.
584 *
585 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
586 */
587 public void setErrorHandler(ErrorHandler eh) {
588 this.errorHandler = eh;
589 }
590
591 public ErrorHandler getErrorHandler() {
592 return this.errorHandler;
593 }
594
595 /**
596 * Sets the commentPolicy.
597 *
598 * @param commentPolicy
599 * the commentPolicy to set
600 */
601 public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
602 this.commentPolicy = commentPolicy;
603 }
604
605 /**
606 * Sets the contentNonXmlCharPolicy.
607 *
608 * @param contentNonXmlCharPolicy
609 * the contentNonXmlCharPolicy to set
610 */
611 public void setContentNonXmlCharPolicy(
612 XmlViolationPolicy contentNonXmlCharPolicy) {
613 if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
614 throw new IllegalArgumentException(
615 "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
616 }
617 }
618
619 /**
620 * Sets the contentSpacePolicy.
621 *
622 * @param contentSpacePolicy
623 * the contentSpacePolicy to set
624 */
625 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
626 this.contentSpacePolicy = contentSpacePolicy;
627 }
628
629 /**
630 * Sets the xmlnsPolicy.
631 *
632 * @param xmlnsPolicy
633 * the xmlnsPolicy to set
634 */
635 public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
636 if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
637 throw new IllegalArgumentException("Can't use FATAL here.");
638 }
639 this.xmlnsPolicy = xmlnsPolicy;
640 }
641
642 public void setNamePolicy(XmlViolationPolicy namePolicy) {
643 this.namePolicy = namePolicy;
644 }
645
646 /**
647 * Sets the html4ModeCompatibleWithXhtml1Schemata.
648 *
649 * @param html4ModeCompatibleWithXhtml1Schemata
650 * the html4ModeCompatibleWithXhtml1Schemata to set
651 */
652 public void setHtml4ModeCompatibleWithXhtml1Schemata(
653 boolean html4ModeCompatibleWithXhtml1Schemata) {
654 this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
655 }
656
657 // ]NOCPP]
658
659 // For the token handler to call
660 /**
661 * Sets the tokenizer state and the associated element name. This should
662 * only ever used to put the tokenizer into one of the states that have
663 * a special end tag expectation.
664 *
665 * @param specialTokenizerState
666 * the tokenizer state to set
667 * @param endTagExpectation
668 * the expected end tag for transitioning back to normal
669 */
670 public void setStateAndEndTagExpectation(int specialTokenizerState,
671 @Local String endTagExpectation) {
672 this.stateSave = specialTokenizerState;
673 if (specialTokenizerState == Tokenizer.DATA) {
674 return;
675 }
676 @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
677 this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
678 asArray.length, interner);
679 endTagExpectationToArray();
680 }
681
682 /**
683 * Sets the tokenizer state and the associated element name. This should
684 * only ever used to put the tokenizer into one of the states that have
685 * a special end tag expectation.
686 *
687 * @param specialTokenizerState
688 * the tokenizer state to set
689 * @param endTagExpectation
690 * the expected end tag for transitioning back to normal
691 */
692 public void setStateAndEndTagExpectation(int specialTokenizerState,
693 ElementName endTagExpectation) {
694 this.stateSave = specialTokenizerState;
695 this.endTagExpectation = endTagExpectation;
696 endTagExpectationToArray();
697 }
698
699 private void endTagExpectationToArray() {
700 switch (endTagExpectation.getGroup()) {
701 case TreeBuilder.TITLE:
702 endTagExpectationAsArray = TITLE_ARR;
703 return;
704 case TreeBuilder.SCRIPT:
705 endTagExpectationAsArray = SCRIPT_ARR;
706 return;
707 case TreeBuilder.STYLE:
708 endTagExpectationAsArray = STYLE_ARR;
709 return;
710 case TreeBuilder.PLAINTEXT:
711 endTagExpectationAsArray = PLAINTEXT_ARR;
712 return;
713 case TreeBuilder.XMP:
714 endTagExpectationAsArray = XMP_ARR;
715 return;
716 case TreeBuilder.TEXTAREA:
717 endTagExpectationAsArray = TEXTAREA_ARR;
718 return;
719 case TreeBuilder.IFRAME:
720 endTagExpectationAsArray = IFRAME_ARR;
721 return;
722 case TreeBuilder.NOEMBED:
723 endTagExpectationAsArray = NOEMBED_ARR;
724 return;
725 case TreeBuilder.NOSCRIPT:
726 endTagExpectationAsArray = NOSCRIPT_ARR;
727 return;
728 case TreeBuilder.NOFRAMES:
729 endTagExpectationAsArray = NOFRAMES_ARR;
730 return;
731 default:
732 assert false: "Bad end tag expectation.";
733 return;
734 }
735 }
736
737 /**
738 * For C++ use only.
739 */
740 public void setLineNumber(int line) {
741 this.line = line;
742 }
743
744 // start Locator impl
745
746 /**
747 * @see org.xml.sax.Locator#getLineNumber()
748 */
749 @Inline public int getLineNumber() {
750 return line;
751 }
752
753 // [NOCPP[
754
755 /**
756 * @see org.xml.sax.Locator#getColumnNumber()
757 */
758 @Inline public int getColumnNumber() {
759 return -1;
760 }
761
762 /**
763 * @see org.xml.sax.Locator#getPublicId()
764 */
765 public String getPublicId() {
766 return publicId;
767 }
768
769 /**
770 * @see org.xml.sax.Locator#getSystemId()
771 */
772 public String getSystemId() {
773 return systemId;
774 }
775
776 // end Locator impl
777
778 // end public API
779
780 public void notifyAboutMetaBoundary() {
781 metaBoundaryPassed = true;
782 }
783
784 void turnOnAdditionalHtml4Errors() {
785 html4 = true;
786 }
787
788 // ]NOCPP]
789
790 HtmlAttributes emptyAttributes() {
791 // [NOCPP[
792 if (newAttributesEachTime) {
793 return new HtmlAttributes(mappingLangToXmlLang);
794 } else {
795 // ]NOCPP]
796 return HtmlAttributes.EMPTY_ATTRIBUTES;
797 // [NOCPP[
798 }
799 // ]NOCPP]
800 }
801
802 @Inline private void clearStrBufAndAppend(char c) {
803 strBuf[0] = c;
804 strBufLen = 1;
805 }
806
807 @Inline private void clearStrBuf() {
808 strBufLen = 0;
809 }
810
811 /**
812 * Appends to the smaller buffer.
813 *
814 * @param c
815 * the UTF-16 code unit to append
816 */
817 private void appendStrBuf(char c) {
818 if (strBufLen == strBuf.length) {
819 char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY];
820 System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
821 strBuf = newBuf;
822 }
823 strBuf[strBufLen++] = c;
824 }
825
826 /**
827 * The smaller buffer as a String. Currently only used for error reporting.
828 *
829 * <p>
830 * C++ memory note: The return value must be released.
831 *
832 * @return the smaller buffer as a string
833 */
834 protected String strBufToString() {
835 return Portability.newStringFromBuffer(strBuf, 0, strBufLen);
836 }
837
838 /**
839 * Returns the short buffer as a local name. The return value is released in
840 * emitDoctypeToken().
841 *
842 * @return the smaller buffer as local name
843 */
844 private void strBufToDoctypeName() {
845 doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
846 interner);
847 }
848
849 /**
850 * Emits the smaller buffer as character tokens.
851 *
852 * @throws SAXException
853 * if the token handler threw
854 */
855 private void emitStrBuf() throws SAXException {
856 if (strBufLen > 0) {
857 tokenHandler.characters(strBuf, 0, strBufLen);
858 }
859 }
860
861 @Inline private void clearLongStrBuf() {
862 longStrBufLen = 0;
863 }
864
865 @Inline private void clearLongStrBufAndAppend(char c) {
866 longStrBuf[0] = c;
867 longStrBufLen = 1;
868 }
869
870 /**
871 * Appends to the larger buffer.
872 *
873 * @param c
874 * the UTF-16 code unit to append
875 */
876 private void appendLongStrBuf(char c) {
877 if (longStrBufLen == longStrBuf.length) {
878 char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)];
879 System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
880 longStrBuf = newBuf;
881 }
882 longStrBuf[longStrBufLen++] = c;
883 }
884
885 @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
886 // [NOCPP[
887 switch (commentPolicy) {
888 case ALTER_INFOSET:
889 // detachLongStrBuf();
890 appendLongStrBuf(' ');
891 // FALLTHROUGH
892 case ALLOW:
893 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
894 // ]NOCPP]
895 appendLongStrBuf('-');
896 // [NOCPP[
897 break;
898 case FATAL:
899 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
900 break;
901 }
902 // ]NOCPP]
903 }
904
905 // [NOCPP[
906 private void maybeAppendSpaceToBogusComment() throws SAXException {
907 switch (commentPolicy) {
908 case ALTER_INFOSET:
909 // detachLongStrBuf();
910 appendLongStrBuf(' ');
911 // FALLTHROUGH
912 case ALLOW:
913 warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
914 break;
915 case FATAL:
916 fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
917 break;
918 }
919 }
920
921 // ]NOCPP]
922
923 @Inline private void adjustDoubleHyphenAndAppendToLongStrBufAndErr(char c)
924 throws SAXException {
925 errConsecutiveHyphens();
926 // [NOCPP[
927 switch (commentPolicy) {
928 case ALTER_INFOSET:
929 // detachLongStrBuf();
930 longStrBufLen--;
931 appendLongStrBuf(' ');
932 appendLongStrBuf('-');
933 // FALLTHROUGH
934 case ALLOW:
935 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
936 // ]NOCPP]
937 appendLongStrBuf(c);
938 // [NOCPP[
939 break;
940 case FATAL:
941 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
942 break;
943 }
944 // ]NOCPP]
945 }
946
947 private void appendLongStrBuf(@NoLength char[] buffer, int offset, int length) {
948 int reqLen = longStrBufLen + length;
949 if (longStrBuf.length < reqLen) {
950 char[] newBuf = new char[reqLen + (reqLen >> 1)];
951 System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
952 longStrBuf = newBuf;
953 }
954 System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length);
955 longStrBufLen = reqLen;
956 }
957
958 /**
959 * Append the contents of the smaller buffer to the larger one.
960 */
961 @Inline private void appendStrBufToLongStrBuf() {
962 appendLongStrBuf(strBuf, 0, strBufLen);
963 }
964
965 /**
966 * The larger buffer as a string.
967 *
968 * <p>
969 * C++ memory note: The return value must be released.
970 *
971 * @return the larger buffer as a string
972 */
973 private String longStrBufToString() {
974 return Portability.newStringFromBuffer(longStrBuf, 0, longStrBufLen);
975 }
976
977 /**
978 * Emits the current comment token.
979 *
980 * @param pos
981 * TODO
982 *
983 * @throws SAXException
984 */
985 private void emitComment(int provisionalHyphens, int pos)
986 throws SAXException {
987 // [NOCPP[
988 if (wantsComments) {
989 // ]NOCPP]
990 // if (longStrBufOffset != -1) {
991 // tokenHandler.comment(buf, longStrBufOffset, longStrBufLen
992 // - provisionalHyphens);
993 // } else {
994 tokenHandler.comment(longStrBuf, 0, longStrBufLen
995 - provisionalHyphens);
996 // }
997 // [NOCPP[
998 }
999 // ]NOCPP]
1000 cstart = pos + 1;
1001 }
1002
1003 /**
1004 * Flushes coalesced character tokens.
1005 *
1006 * @param buf
1007 * TODO
1008 * @param pos
1009 * TODO
1010 *
1011 * @throws SAXException
1012 */
1013 protected void flushChars(@NoLength char[] buf, int pos)
1014 throws SAXException {
1015 if (pos > cstart) {
1016 tokenHandler.characters(buf, cstart, pos - cstart);
1017 }
1018 cstart = Integer.MAX_VALUE;
1019 }
1020
1021 /**
1022 * Reports an condition that would make the infoset incompatible with XML
1023 * 1.0 as fatal.
1024 *
1025 * @param message
1026 * the message
1027 * @throws SAXException
1028 * @throws SAXParseException
1029 */
1030 public void fatal(String message) throws SAXException {
1031 SAXParseException spe = new SAXParseException(message, this);
1032 if (errorHandler != null) {
1033 errorHandler.fatalError(spe);
1034 }
1035 throw spe;
1036 }
1037
1038 /**
1039 * Reports a Parse Error.
1040 *
1041 * @param message
1042 * the message
1043 * @throws SAXException
1044 */
1045 public void err(String message) throws SAXException {
1046 if (errorHandler == null) {
1047 return;
1048 }
1049 SAXParseException spe = new SAXParseException(message, this);
1050 errorHandler.error(spe);
1051 }
1052
1053 public void errTreeBuilder(String message) throws SAXException {
1054 ErrorHandler eh = null;
1055 if (tokenHandler instanceof TreeBuilder<?>) {
1056 TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
1057 eh = treeBuilder.getErrorHandler();
1058 }
1059 if (eh == null) {
1060 eh = errorHandler;
1061 }
1062 if (eh == null) {
1063 return;
1064 }
1065 SAXParseException spe = new SAXParseException(message, this);
1066 eh.error(spe);
1067 }
1068
1069 /**
1070 * Reports a warning
1071 *
1072 * @param message
1073 * the message
1074 * @throws SAXException
1075 */
1076 public void warn(String message) throws SAXException {
1077 if (errorHandler == null) {
1078 return;
1079 }
1080 SAXParseException spe = new SAXParseException(message, this);
1081 errorHandler.warning(spe);
1082 }
1083
1084 /**
1085 *
1086 */
1087 private void resetAttributes() {
1088 // [NOCPP[
1089 if (newAttributesEachTime) {
1090 // ]NOCPP]
1091 attributes = null;
1092 // [NOCPP[
1093 } else {
1094 attributes.clear(mappingLangToXmlLang);
1095 }
1096 // ]NOCPP]
1097 }
1098
1099 private void strBufToElementNameString() {
1100 // if (strBufOffset != -1) {
1101 // return ElementName.elementNameByBuffer(buf, strBufOffset, strBufLen);
1102 // } else {
1103 tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
1104 interner);
1105 // }
1106 }
1107
1108 private int emitCurrentTagToken(boolean selfClosing, int pos)
1109 throws SAXException {
1110 cstart = pos + 1;
1111 maybeErrSlashInEndTag(selfClosing);
1112 stateSave = Tokenizer.DATA;
1113 HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
1114 : attributes);
1115 if (endTag) {
1116 /*
1117 * When an end tag token is emitted, the content model flag must be
1118 * switched to the PCDATA state.
1119 */
1120 maybeErrAttributesOnEndTag(attrs);
1121 tokenHandler.endTag(tagName);
1122 Portability.delete(attributes);
1123 } else {
1124 tokenHandler.startTag(tagName, attrs, selfClosing);
1125 }
1126 tagName.release();
1127 tagName = null;
1128 resetAttributes();
1129 /*
1130 * The token handler may have called setStateAndEndTagExpectation
1131 * and changed stateSave since the start of this method.
1132 */
1133 return stateSave;
1134 }
1135
1136 private void attributeNameComplete() throws SAXException {
1137 // if (strBufOffset != -1) {
1138 // attributeName = AttributeName.nameByBuffer(buf, strBufOffset,
1139 // strBufLen, namePolicy != XmlViolationPolicy.ALLOW);
1140 // } else {
1141 attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
1142 // [NOCPP[
1143 , namePolicy != XmlViolationPolicy.ALLOW
1144 // ]NOCPP]
1145 , interner);
1146 // }
1147
1148 if (attributes == null) {
1149 attributes = new HtmlAttributes(mappingLangToXmlLang);
1150 }
1151
1152 /*
1153 * When the user agent leaves the attribute name state (and before
1154 * emitting the tag token, if appropriate), the complete attribute's
1155 * name must be compared to the other attributes on the same token; if
1156 * there is already an attribute on the token with the exact same name,
1157 * then this is a parse error and the new attribute must be dropped,
1158 * along with the value that gets associated with it (if any).
1159 */
1160 if (attributes.contains(attributeName)) {
1161 errDuplicateAttribute();
1162 attributeName.release();
1163 attributeName = null;
1164 }
1165 }
1166
1167 private void addAttributeWithoutValue() throws SAXException {
1168 noteAttributeWithoutValue();
1169
1170 // [NOCPP[
1171 if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
1172 && ElementName.META == tagName) {
1173 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
1174 }
1175 // ]NOCPP]
1176 if (attributeName != null) {
1177 // [NOCPP[
1178 if (html4) {
1179 if (attributeName.isBoolean()) {
1180 if (html4ModeCompatibleWithXhtml1Schemata) {
1181 attributes.addAttribute(attributeName,
1182 attributeName.getLocal(AttributeName.HTML),
1183 xmlnsPolicy);
1184 } else {
1185 attributes.addAttribute(attributeName, "", xmlnsPolicy);
1186 }
1187 } else {
1188 err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
1189 attributes.addAttribute(attributeName, "", xmlnsPolicy);
1190 }
1191 } else {
1192 if (AttributeName.SRC == attributeName
1193 || AttributeName.HREF == attributeName) {
1194 warn("Attribute \u201C"
1195 + attributeName.getLocal(AttributeName.HTML)
1196 + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
1197 }
1198 // ]NOCPP]
1199 attributes.addAttribute(attributeName,
1200 Portability.newEmptyString()
1201 // [NOCPP[
1202 , xmlnsPolicy
1203 // ]NOCPP]
1204 );
1205 // [NOCPP[
1206 }
1207 // ]NOCPP]
1208 attributeName = null; // attributeName has been adopted by the
1209 // |attributes| object
1210 }
1211 }
1212
1213 private void addAttributeWithValue() throws SAXException {
1214 // [NOCPP[
1215 if (metaBoundaryPassed && ElementName.META == tagName
1216 && AttributeName.CHARSET == attributeName) {
1217 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
1218 }
1219 // ]NOCPP]
1220 if (attributeName != null) {
1221 String val = longStrBufToString(); // Ownership transferred to
1222 // HtmlAttributes
1223 // [NOCPP[
1224 if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
1225 && attributeName.isCaseFolded()) {
1226 val = newAsciiLowerCaseStringFromString(val);
1227 }
1228 // ]NOCPP]
1229 attributes.addAttribute(attributeName, val
1230 // [NOCPP[
1231 , xmlnsPolicy
1232 // ]NOCPP]
1233 );
1234 attributeName = null; // attributeName has been adopted by the
1235 // |attributes| object
1236 }
1237 }
1238
1239 // [NOCPP[
1240
1241 private static String newAsciiLowerCaseStringFromString(String str) {
1242 if (str == null) {
1243 return null;
1244 }
1245 char[] buf = new char[str.length()];
1246 for (int i = 0; i < str.length(); i++) {
1247 char c = str.charAt(i);
1248 if (c >= 'A' && c <= 'Z') {
1249 c += 0x20;
1250 }
1251 buf[i] = c;
1252 }
1253 return new String(buf);
1254 }
1255
1256 protected void startErrorReporting() throws SAXException {
1257
1258 }
1259
1260 // ]NOCPP]
1261
1262 public void start() throws SAXException {
1263 initializeWithoutStarting();
1264 tokenHandler.startTokenization(this);
1265 // [NOCPP[
1266 startErrorReporting();
1267 // ]NOCPP]
1268 }
1269
1270 public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
1271 int state = stateSave;
1272 int returnState = returnStateSave;
1273 char c = '\u0000';
1274 shouldSuspend = false;
1275 lastCR = false;
1276
1277 int start = buffer.getStart();
1278 /**
1279 * The index of the last <code>char</code> read from <code>buf</code>.
1280 */
1281 int pos = start - 1;
1282
1283 /**
1284 * The index of the first <code>char</code> in <code>buf</code> that is
1285 * part of a coalesced run of character tokens or
1286 * <code>Integer.MAX_VALUE</code> if there is not a current run being
1287 * coalesced.
1288 */
1289 switch (state) {
1290 case DATA:
1291 case RCDATA:
1292 case SCRIPT_DATA:
1293 case PLAINTEXT:
1294 case RAWTEXT:
1295 case CDATA_SECTION:
1296 case SCRIPT_DATA_ESCAPED:
1297 case SCRIPT_DATA_ESCAPE_START:
1298 case SCRIPT_DATA_ESCAPE_START_DASH:
1299 case SCRIPT_DATA_ESCAPED_DASH:
1300 case SCRIPT_DATA_ESCAPED_DASH_DASH:
1301 case SCRIPT_DATA_DOUBLE_ESCAPE_START:
1302 case SCRIPT_DATA_DOUBLE_ESCAPED:
1303 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
1304 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
1305 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
1306 case SCRIPT_DATA_DOUBLE_ESCAPE_END:
1307 cstart = start;
1308 break;
1309 default:
1310 cstart = Integer.MAX_VALUE;
1311 break;
1312 }
1313
1314 /**
1315 * The number of <code>char</code>s in <code>buf</code> that have
1316 * meaning. (The rest of the array is garbage and should not be
1317 * examined.)
1318 */
1319 pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
1320 buffer.getEnd());
1321 if (pos == buffer.getEnd()) {
1322 // exiting due to end of buffer
1323 buffer.setStart(pos);
1324 } else {
1325 buffer.setStart(pos + 1);
1326 }
1327 return lastCR;
1328 }
1329
1330 @SuppressWarnings("unused") private int stateLoop(int state, char c,
1331 int pos, @NoLength char[] buf, boolean reconsume, int returnState,
1332 int endPos) throws SAXException {
1333 /*
1334 * Idioms used in this code:
1335 *
1336 *
1337 * Consuming the next input character
1338 *
1339 * To consume the next input character, the code does this: if (++pos ==
1340 * endPos) { break stateloop; } c = checkChar(buf, pos);
1341 *
1342 *
1343 * Staying in a state
1344 *
1345 * When there's a state that the tokenizer may stay in over multiple
1346 * input characters, the state has a wrapper |for(;;)| loop and staying
1347 * in the state continues the loop.
1348 *
1349 *
1350 * Switching to another state
1351 *
1352 * To switch to another state, the code sets the state variable to the
1353 * magic number of the new state. Then it either continues stateloop or
1354 * breaks out of the state's own wrapper loop if the target state is
1355 * right after the current state in source order. (This is a partial
1356 * workaround for Java's lack of goto.)
1357 *
1358 *
1359 * Reconsume support
1360 *
1361 * The spec sometimes says that an input character is reconsumed in
1362 * another state. If a state can ever be entered so that an input
1363 * character can be reconsumed in it, the state's code starts with an
1364 * |if (reconsume)| that sets reconsume to false and skips over the
1365 * normal code for consuming a new character.
1366 *
1367 * To reconsume the current character in another state, the code sets
1368 * |reconsume| to true and then switches to the other state.
1369 *
1370 *
1371 * Emitting character tokens
1372 *
1373 * This method emits character tokens lazily. Whenever a new range of
1374 * character tokens starts, the field cstart must be set to the start
1375 * index of the range. The flushChars() method must be called at the end
1376 * of a range to flush it.
1377 *
1378 *
1379 * U+0000 handling
1380 *
1381 * The various states have to handle the replacement of U+0000 with
1382 * U+FFFD. However, if U+0000 would be reconsumed in another state, the
1383 * replacement doesn't need to happen, because it's handled by the
1384 * reconsuming state.
1385 *
1386 *
1387 * LF handling
1388 *
1389 * Every state needs to increment the line number upon LF unless the LF
1390 * gets reconsumed by another state which increments the line number.
1391 *
1392 *
1393 * CR handling
1394 *
1395 * Every state needs to handle CR unless the CR gets reconsumed and is
1396 * handled by the reconsuming state. The CR needs to be handled as if it
 * were an LF, the lastCR field must be set to true and then this
1398 * method must return. The IO driver will then swallow the next
1399 * character if it is an LF to coalesce CRLF.
1400 */
1401 stateloop: for (;;) {
1402 switch (state) {
1403 case DATA:
1404 dataloop: for (;;) {
1405 if (reconsume) {
1406 reconsume = false;
1407 } else {
1408 if (++pos == endPos) {
1409 break stateloop;
1410 }
1411 c = checkChar(buf, pos);
1412 }
1413 switch (c) {
1414 case '&':
1415 /*
1416 * U+0026 AMPERSAND (&) Switch to the character
1417 * reference in data state.
1418 */
1419 flushChars(buf, pos);
1420 clearStrBufAndAppend(c);
1421 setAdditionalAndRememberAmpersandLocation('\u0000');
1422 returnState = state;
1423 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
1424 continue stateloop;
1425 case '<':
1426 /*
1427 * U+003C LESS-THAN SIGN (<) Switch to the tag
1428 * open state.
1429 */
1430 flushChars(buf, pos);
1431
1432 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
1433 break dataloop; // FALL THROUGH continue
1434 // stateloop;
1435 case '\u0000':
1436 emitReplacementCharacter(buf, pos);
1437 continue;
1438 case '\r':
1439 emitCarriageReturn(buf, pos);
1440 break stateloop;
1441 case '\n':
1442 silentLineFeed();
1443 default:
1444 /*
1445 * Anything else Emit the input character as a
1446 * character token.
1447 *
1448 * Stay in the data state.
1449 */
1450 continue;
1451 }
1452 }
1453 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
1454 case TAG_OPEN:
1455 tagopenloop: for (;;) {
1456 /*
1457 * The behavior of this state depends on the content
1458 * model flag.
1459 */
1460 if (++pos == endPos) {
1461 break stateloop;
1462 }
1463 c = checkChar(buf, pos);
1464 /*
1465 * If the content model flag is set to the PCDATA state
1466 * Consume the next input character:
1467 */
1468 if (c >= 'A' && c <= 'Z') {
1469 /*
1470 * U+0041 LATIN CAPITAL LETTER A through to U+005A
1471 * LATIN CAPITAL LETTER Z Create a new start tag
1472 * token,
1473 */
1474 endTag = false;
1475 /*
1476 * set its tag name to the lowercase version of the
1477 * input character (add 0x0020 to the character's
1478 * code point),
1479 */
1480 clearStrBufAndAppend((char) (c + 0x20));
1481 /* then switch to the tag name state. */
1482 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1483 /*
1484 * (Don't emit the token yet; further details will
1485 * be filled in before it is emitted.)
1486 */
1487 break tagopenloop;
1488 // continue stateloop;
1489 } else if (c >= 'a' && c <= 'z') {
1490 /*
1491 * U+0061 LATIN SMALL LETTER A through to U+007A
1492 * LATIN SMALL LETTER Z Create a new start tag
1493 * token,
1494 */
1495 endTag = false;
1496 /*
1497 * set its tag name to the input character,
1498 */
1499 clearStrBufAndAppend(c);
1500 /* then switch to the tag name state. */
1501 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1502 /*
1503 * (Don't emit the token yet; further details will
1504 * be filled in before it is emitted.)
1505 */
1506 break tagopenloop;
1507 // continue stateloop;
1508 }
1509 switch (c) {
1510 case '!':
1511 /*
1512 * U+0021 EXCLAMATION MARK (!) Switch to the
1513 * markup declaration open state.
1514 */
1515 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
1516 continue stateloop;
1517 case '/':
1518 /*
1519 * U+002F SOLIDUS (/) Switch to the close tag
1520 * open state.
1521 */
1522 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
1523 continue stateloop;
1524 case '?':
1525 /*
1526 * U+003F QUESTION MARK (?) Parse error.
1527 */
1528 errProcessingInstruction();
1529 /*
1530 * Switch to the bogus comment state.
1531 */
1532 clearLongStrBufAndAppend(c);
1533 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
1534 continue stateloop;
1535 case '>':
1536 /*
1537 * U+003E GREATER-THAN SIGN (>) Parse error.
1538 */
1539 errLtGt();
1540 /*
1541 * Emit a U+003C LESS-THAN SIGN character token
1542 * and a U+003E GREATER-THAN SIGN character
1543 * token.
1544 */
1545 tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
1546 /* Switch to the data state. */
1547 cstart = pos + 1;
1548 state = transition(state, Tokenizer.DATA, reconsume, pos);
1549 continue stateloop;
1550 default:
1551 /*
1552 * Anything else Parse error.
1553 */
1554 errBadCharAfterLt(c);
1555 /*
1556 * Emit a U+003C LESS-THAN SIGN character token
1557 */
1558 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
1559 /*
1560 * and reconsume the current input character in
1561 * the data state.
1562 */
1563 cstart = pos;
1564 state = transition(state, Tokenizer.DATA, reconsume, pos);
1565 reconsume = true;
1566 continue stateloop;
1567 }
1568 }
1569 // FALL THROUGH DON'T REORDER
1570 case TAG_NAME:
1571 tagnameloop: for (;;) {
1572 if (++pos == endPos) {
1573 break stateloop;
1574 }
1575 c = checkChar(buf, pos);
1576 /*
1577 * Consume the next input character:
1578 */
1579 switch (c) {
1580 case '\r':
1581 silentCarriageReturn();
1582 strBufToElementNameString();
1583 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
1584 break stateloop;
1585 case '\n':
1586 silentLineFeed();
1587 case ' ':
1588 case '\t':
1589 case '\u000C':
1590 /*
1591 * U+0009 CHARACTER TABULATION U+000A LINE FEED
1592 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
1593 * Switch to the before attribute name state.
1594 */
1595 strBufToElementNameString();
1596 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
1597 break tagnameloop;
1598 // continue stateloop;
1599 case '/':
1600 /*
1601 * U+002F SOLIDUS (/) Switch to the self-closing
1602 * start tag state.
1603 */
1604 strBufToElementNameString();
1605 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1606 continue stateloop;
1607 case '>':
1608 /*
1609 * U+003E GREATER-THAN SIGN (>) Emit the current
1610 * tag token.
1611 */
1612 strBufToElementNameString();
1613 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1614 if (shouldSuspend) {
1615 break stateloop;
1616 }
1617 /*
1618 * Switch to the data state.
1619 */
1620 continue stateloop;
1621 case '\u0000':
1622 c = '\uFFFD';
1623 // fall thru
1624 default:
1625 if (c >= 'A' && c <= 'Z') {
1626 /*
1627 * U+0041 LATIN CAPITAL LETTER A through to
1628 * U+005A LATIN CAPITAL LETTER Z Append the
1629 * lowercase version of the current input
1630 * character (add 0x0020 to the character's
1631 * code point) to the current tag token's
1632 * tag name.
1633 */
1634 c += 0x20;
1635 }
1636 /*
1637 * Anything else Append the current input
1638 * character to the current tag token's tag
1639 * name.
1640 */
1641 appendStrBuf(c);
1642 /*
1643 * Stay in the tag name state.
1644 */
1645 continue;
1646 }
1647 }
1648 // FALLTHRU DON'T REORDER
1649 case BEFORE_ATTRIBUTE_NAME:
1650 beforeattributenameloop: for (;;) {
1651 if (reconsume) {
1652 reconsume = false;
1653 } else {
1654 if (++pos == endPos) {
1655 break stateloop;
1656 }
1657 c = checkChar(buf, pos);
1658 }
1659 /*
1660 * Consume the next input character:
1661 */
1662 switch (c) {
1663 case '\r':
1664 silentCarriageReturn();
1665 break stateloop;
1666 case '\n':
1667 silentLineFeed();
1668 // fall thru
1669 case ' ':
1670 case '\t':
1671 case '\u000C':
1672 /*
1673 * U+0009 CHARACTER TABULATION U+000A LINE FEED
1674 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
1675 * in the before attribute name state.
1676 */
1677 continue;
1678 case '/':
1679 /*
1680 * U+002F SOLIDUS (/) Switch to the self-closing
1681 * start tag state.
1682 */
1683 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1684 continue stateloop;
1685 case '>':
1686 /*
1687 * U+003E GREATER-THAN SIGN (>) Emit the current
1688 * tag token.
1689 */
1690 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1691 if (shouldSuspend) {
1692 break stateloop;
1693 }
1694 /*
1695 * Switch to the data state.
1696 */
1697 continue stateloop;
1698 case '\u0000':
1699 c = '\uFFFD';
1700 // fall thru
1701 case '\"':
1702 case '\'':
1703 case '<':
1704 case '=':
1705 /*
1706 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
1707 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
1708 * SIGN (=) Parse error.
1709 */
1710 errBadCharBeforeAttributeNameOrNull(c);
1711 /*
1712 * Treat it as per the "anything else" entry
1713 * below.
1714 */
1715 default:
1716 /*
1717 * Anything else Start a new attribute in the
1718 * current tag token.
1719 */
1720 if (c >= 'A' && c <= 'Z') {
1721 /*
1722 * U+0041 LATIN CAPITAL LETTER A through to
1723 * U+005A LATIN CAPITAL LETTER Z Set that
1724 * attribute's name to the lowercase version
1725 * of the current input character (add
1726 * 0x0020 to the character's code point)
1727 */
1728 c += 0x20;
1729 }
1730 /*
1731 * Set that attribute's name to the current
1732 * input character,
1733 */
1734 clearStrBufAndAppend(c);
1735 /*
1736 * and its value to the empty string.
1737 */
1738 // Will do later.
1739 /*
1740 * Switch to the attribute name state.
1741 */
1742 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
1743 break beforeattributenameloop;
1744 // continue stateloop;
1745 }
1746 }
1747 // FALLTHRU DON'T REORDER
1748 case ATTRIBUTE_NAME:
1749 attributenameloop: for (;;) {
1750 if (++pos == endPos) {
1751 break stateloop;
1752 }
1753 c = checkChar(buf, pos);
1754 /*
1755 * Consume the next input character:
1756 */
1757 switch (c) {
1758 case '\r':
1759 silentCarriageReturn();
1760 attributeNameComplete();
1761 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
1762 break stateloop;
1763 case '\n':
1764 silentLineFeed();
1765 // fall thru
1766 case ' ':
1767 case '\t':
1768 case '\u000C':
1769 /*
1770 * U+0009 CHARACTER TABULATION U+000A LINE FEED
1771 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
1772 * Switch to the after attribute name state.
1773 */
1774 attributeNameComplete();
1775 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
1776 continue stateloop;
1777 case '/':
1778 /*
1779 * U+002F SOLIDUS (/) Switch to the self-closing
1780 * start tag state.
1781 */
1782 attributeNameComplete();
1783 addAttributeWithoutValue();
1784 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1785 continue stateloop;
1786 case '=':
1787 /*
1788 * U+003D EQUALS SIGN (=) Switch to the before
1789 * attribute value state.
1790 */
1791 attributeNameComplete();
1792 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
1793 break attributenameloop;
1794 // continue stateloop;
1795 case '>':
1796 /*
1797 * U+003E GREATER-THAN SIGN (>) Emit the current
1798 * tag token.
1799 */
1800 attributeNameComplete();
1801 addAttributeWithoutValue();
1802 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1803 if (shouldSuspend) {
1804 break stateloop;
1805 }
1806 /*
1807 * Switch to the data state.
1808 */
1809 continue stateloop;
1810 case '\u0000':
1811 c = '\uFFFD';
1812 // fall thru
1813 case '\"':
1814 case '\'':
1815 case '<':
1816 /*
1817 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
1818 * (') U+003C LESS-THAN SIGN (<) Parse error.
1819 */
1820 errQuoteOrLtInAttributeNameOrNull(c);
1821 /*
1822 * Treat it as per the "anything else" entry
1823 * below.
1824 */
1825 default:
1826 if (c >= 'A' && c <= 'Z') {
1827 /*
1828 * U+0041 LATIN CAPITAL LETTER A through to
1829 * U+005A LATIN CAPITAL LETTER Z Append the
1830 * lowercase version of the current input
1831 * character (add 0x0020 to the character's
1832 * code point) to the current attribute's
1833 * name.
1834 */
1835 c += 0x20;
1836 }
1837 /*
1838 * Anything else Append the current input
1839 * character to the current attribute's name.
1840 */
1841 appendStrBuf(c);
1842 /*
1843 * Stay in the attribute name state.
1844 */
1845 continue;
1846 }
1847 }
1848 // FALLTHRU DON'T REORDER
1849 case BEFORE_ATTRIBUTE_VALUE:
1850 beforeattributevalueloop: for (;;) {
1851 if (++pos == endPos) {
1852 break stateloop;
1853 }
1854 c = checkChar(buf, pos);
1855 /*
1856 * Consume the next input character:
1857 */
1858 switch (c) {
1859 case '\r':
1860 silentCarriageReturn();
1861 break stateloop;
1862 case '\n':
1863 silentLineFeed();
1864 // fall thru
1865 case ' ':
1866 case '\t':
1867 case '\u000C':
1868 /*
1869 * U+0009 CHARACTER TABULATION U+000A LINE FEED
1870 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
1871 * in the before attribute value state.
1872 */
1873 continue;
1874 case '"':
1875 /*
1876 * U+0022 QUOTATION MARK (") Switch to the
1877 * attribute value (double-quoted) state.
1878 */
1879 clearLongStrBuf();
1880 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
1881 break beforeattributevalueloop;
1882 // continue stateloop;
1883 case '&':
1884 /*
1885 * U+0026 AMPERSAND (&) Switch to the attribute
1886 * value (unquoted) state and reconsume this
1887 * input character.
1888 */
1889 clearLongStrBuf();
1890 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
1891 noteUnquotedAttributeValue();
1892 reconsume = true;
1893 continue stateloop;
1894 case '\'':
1895 /*
1896 * U+0027 APOSTROPHE (') Switch to the attribute
1897 * value (single-quoted) state.
1898 */
1899 clearLongStrBuf();
1900 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
1901 continue stateloop;
1902 case '>':
1903 /*
1904 * U+003E GREATER-THAN SIGN (>) Parse error.
1905 */
1906 errAttributeValueMissing();
1907 /*
1908 * Emit the current tag token.
1909 */
1910 addAttributeWithoutValue();
1911 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1912 if (shouldSuspend) {
1913 break stateloop;
1914 }
1915 /*
1916 * Switch to the data state.
1917 */
1918 continue stateloop;
1919 case '\u0000':
1920 c = '\uFFFD';
1921 // fall thru
1922 case '<':
1923 case '=':
1924 case '`':
1925 /*
1926 * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
1927 * (=) U+0060 GRAVE ACCENT (`)
1928 */
1929 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
1930 /*
1931 * Treat it as per the "anything else" entry
1932 * below.
1933 */
1934 default:
1935 // [NOCPP[
1936 errHtml4NonNameInUnquotedAttribute(c);
1937 // ]NOCPP]
1938 /*
1939 * Anything else Append the current input
1940 * character to the current attribute's value.
1941 */
1942 clearLongStrBufAndAppend(c);
1943 /*
1944 * Switch to the attribute value (unquoted)
1945 * state.
1946 */
1947
1948 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
1949 noteUnquotedAttributeValue();
1950 continue stateloop;
1951 }
1952 }
1953 // FALLTHRU DON'T REORDER
1954 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
1955 attributevaluedoublequotedloop: for (;;) {
1956 if (reconsume) {
1957 reconsume = false;
1958 } else {
1959 if (++pos == endPos) {
1960 break stateloop;
1961 }
1962 c = checkChar(buf, pos);
1963 }
1964 /*
1965 * Consume the next input character:
1966 */
1967 switch (c) {
1968 case '"':
1969 /*
1970 * U+0022 QUOTATION MARK (") Switch to the after
1971 * attribute value (quoted) state.
1972 */
1973 addAttributeWithValue();
1974
1975 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
1976 break attributevaluedoublequotedloop;
1977 // continue stateloop;
1978 case '&':
1979 /*
1980 * U+0026 AMPERSAND (&) Switch to the character
1981 * reference in attribute value state, with the
1982 * additional allowed character being U+0022
1983 * QUOTATION MARK (").
1984 */
1985 clearStrBufAndAppend(c);
1986 setAdditionalAndRememberAmpersandLocation('\"');
1987 returnState = state;
1988 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
1989 continue stateloop;
1990 case '\r':
1991 appendLongStrBufCarriageReturn();
1992 break stateloop;
1993 case '\n':
1994 appendLongStrBufLineFeed();
1995 continue;
1996 case '\u0000':
1997 c = '\uFFFD';
1998 // fall thru
1999 default:
2000 /*
2001 * Anything else Append the current input
2002 * character to the current attribute's value.
2003 */
2004 appendLongStrBuf(c);
2005 /*
2006 * Stay in the attribute value (double-quoted)
2007 * state.
2008 */
2009 continue;
2010 }
2011 }
2012 // FALLTHRU DON'T REORDER
2013 case AFTER_ATTRIBUTE_VALUE_QUOTED:
2014 afterattributevaluequotedloop: for (;;) {
2015 if (++pos == endPos) {
2016 break stateloop;
2017 }
2018 c = checkChar(buf, pos);
2019 /*
2020 * Consume the next input character:
2021 */
2022 switch (c) {
2023 case '\r':
2024 silentCarriageReturn();
2025 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2026 break stateloop;
2027 case '\n':
2028 silentLineFeed();
2029 // fall thru
2030 case ' ':
2031 case '\t':
2032 case '\u000C':
2033 /*
2034 * U+0009 CHARACTER TABULATION U+000A LINE FEED
2035 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
2036 * Switch to the before attribute name state.
2037 */
2038 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2039 continue stateloop;
2040 case '/':
2041 /*
2042 * U+002F SOLIDUS (/) Switch to the self-closing
2043 * start tag state.
2044 */
2045 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
2046 break afterattributevaluequotedloop;
2047 // continue stateloop;
2048 case '>':
2049 /*
2050 * U+003E GREATER-THAN SIGN (>) Emit the current
2051 * tag token.
2052 */
2053 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2054 if (shouldSuspend) {
2055 break stateloop;
2056 }
2057 /*
2058 * Switch to the data state.
2059 */
2060 continue stateloop;
2061 default:
2062 /*
2063 * Anything else Parse error.
2064 */
2065 errNoSpaceBetweenAttributes();
2066 /*
2067 * Reconsume the character in the before
2068 * attribute name state.
2069 */
2070 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2071 reconsume = true;
2072 continue stateloop;
2073 }
2074 }
2075 // FALLTHRU DON'T REORDER
2076 case SELF_CLOSING_START_TAG:
2077 if (++pos == endPos) {
2078 break stateloop;
2079 }
2080 c = checkChar(buf, pos);
2081 /*
2082 * Consume the next input character:
2083 */
2084 switch (c) {
2085 case '>':
2086 /*
2087 * U+003E GREATER-THAN SIGN (>) Set the self-closing
2088 * flag of the current tag token. Emit the current
2089 * tag token.
2090 */
2091 // [NOCPP[
2092 errHtml4XmlVoidSyntax();
2093 // ]NOCPP]
2094 state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
2095 if (shouldSuspend) {
2096 break stateloop;
2097 }
2098 /*
2099 * Switch to the data state.
2100 */
2101 continue stateloop;
2102 default:
2103 /* Anything else Parse error. */
2104 errSlashNotFollowedByGt();
2105 /*
2106 * Reconsume the character in the before attribute
2107 * name state.
2108 */
2109 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2110 reconsume = true;
2111 continue stateloop;
2112 }
2113 // XXX reorder point
2114 case ATTRIBUTE_VALUE_UNQUOTED:
2115 for (;;) {
2116 if (reconsume) {
2117 reconsume = false;
2118 } else {
2119 if (++pos == endPos) {
2120 break stateloop;
2121 }
2122 c = checkChar(buf, pos);
2123 }
2124 /*
2125 * Consume the next input character:
2126 */
2127 switch (c) {
2128 case '\r':
2129 silentCarriageReturn();
2130 addAttributeWithValue();
2131 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2132 break stateloop;
2133 case '\n':
2134 silentLineFeed();
2135 // fall thru
2136 case ' ':
2137 case '\t':
2138 case '\u000C':
2139 /*
2140 * U+0009 CHARACTER TABULATION U+000A LINE FEED
2141 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
2142 * Switch to the before attribute name state.
2143 */
2144 addAttributeWithValue();
2145 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2146 continue stateloop;
2147 case '&':
2148 /*
2149 * U+0026 AMPERSAND (&) Switch to the character
2150 * reference in attribute value state, with the
2151 * additional allowed character being U+003E
2152 * GREATER-THAN SIGN (>)
2153 */
2154 clearStrBufAndAppend(c);
2155 setAdditionalAndRememberAmpersandLocation('>');
2156 returnState = state;
2157 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2158 continue stateloop;
2159 case '>':
2160 /*
2161 * U+003E GREATER-THAN SIGN (>) Emit the current
2162 * tag token.
2163 */
2164 addAttributeWithValue();
2165 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2166 if (shouldSuspend) {
2167 break stateloop;
2168 }
2169 /*
2170 * Switch to the data state.
2171 */
2172 continue stateloop;
2173 case '\u0000':
2174 c = '\uFFFD';
2175 // fall thru
2176 case '<':
2177 case '\"':
2178 case '\'':
2179 case '=':
2180 case '`':
2181 /*
2182 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
2183 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
2184 * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
2185 */
2186 errUnquotedAttributeValOrNull(c);
2187 /*
2188 * Treat it as per the "anything else" entry
2189 * below.
2190 */
2191 // fall through
2192 default:
2193 // [NOCPP]
2194 errHtml4NonNameInUnquotedAttribute(c);
2195 // ]NOCPP]
2196 /*
2197 * Anything else Append the current input
2198 * character to the current attribute's value.
2199 */
2200 appendLongStrBuf(c);
2201 /*
2202 * Stay in the attribute value (unquoted) state.
2203 */
2204 continue;
2205 }
2206 }
2207 // XXX reorder point
2208 case AFTER_ATTRIBUTE_NAME:
2209 for (;;) {
2210 if (++pos == endPos) {
2211 break stateloop;
2212 }
2213 c = checkChar(buf, pos);
2214 /*
2215 * Consume the next input character:
2216 */
2217 switch (c) {
2218 case '\r':
2219 silentCarriageReturn();
2220 break stateloop;
2221 case '\n':
2222 silentLineFeed();
2223 // fall thru
2224 case ' ':
2225 case '\t':
2226 case '\u000C':
2227 /*
2228 * U+0009 CHARACTER TABULATION U+000A LINE FEED
2229 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
2230 * in the after attribute name state.
2231 */
2232 continue;
2233 case '/':
2234 /*
2235 * U+002F SOLIDUS (/) Switch to the self-closing
2236 * start tag state.
2237 */
2238 addAttributeWithoutValue();
2239 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
2240 continue stateloop;
2241 case '=':
2242 /*
2243 * U+003D EQUALS SIGN (=) Switch to the before
2244 * attribute value state.
2245 */
2246 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
2247 continue stateloop;
2248 case '>':
2249 /*
2250 * U+003E GREATER-THAN SIGN (>) Emit the current
2251 * tag token.
2252 */
2253 addAttributeWithoutValue();
2254 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2255 if (shouldSuspend) {
2256 break stateloop;
2257 }
2258 /*
2259 * Switch to the data state.
2260 */
2261 continue stateloop;
2262 case '\u0000':
2263 c = '\uFFFD';
2264 // fall thru
2265 case '\"':
2266 case '\'':
2267 case '<':
2268 errQuoteOrLtInAttributeNameOrNull(c);
2269 /*
2270 * Treat it as per the "anything else" entry
2271 * below.
2272 */
2273 default:
2274 addAttributeWithoutValue();
2275 /*
2276 * Anything else Start a new attribute in the
2277 * current tag token.
2278 */
2279 if (c >= 'A' && c <= 'Z') {
2280 /*
2281 * U+0041 LATIN CAPITAL LETTER A through to
2282 * U+005A LATIN CAPITAL LETTER Z Set that
2283 * attribute's name to the lowercase version
2284 * of the current input character (add
2285 * 0x0020 to the character's code point)
2286 */
2287 c += 0x20;
2288 }
2289 /*
2290 * Set that attribute's name to the current
2291 * input character,
2292 */
2293 clearStrBufAndAppend(c);
2294 /*
2295 * and its value to the empty string.
2296 */
2297 // Will do later.
2298 /*
2299 * Switch to the attribute name state.
2300 */
2301 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
2302 continue stateloop;
2303 }
2304 }
2305 // XXX reorder point
2306 case MARKUP_DECLARATION_OPEN:
2307 markupdeclarationopenloop: for (;;) {
2308 if (++pos == endPos) {
2309 break stateloop;
2310 }
2311 c = checkChar(buf, pos);
2312 /*
2313 * If the next two characters are both U+002D
2314 * HYPHEN-MINUS characters (-), consume those two
2315 * characters, create a comment token whose data is the
2316 * empty string, and switch to the comment start state.
2317 *
2318 * Otherwise, if the next seven characters are an ASCII
2319 * case-insensitive match for the word "DOCTYPE", then
2320 * consume those characters and switch to the DOCTYPE
2321 * state.
2322 *
2323 * Otherwise, if the insertion mode is
2324 * "in foreign content" and the current node is not an
2325 * element in the HTML namespace and the next seven
2326 * characters are an case-sensitive match for the string
2327 * "[CDATA[" (the five uppercase letters "CDATA" with a
2328 * U+005B LEFT SQUARE BRACKET character before and
2329 * after), then consume those characters and switch to
2330 * the CDATA section state.
2331 *
2332 * Otherwise, it is a parse error. Switch to the bogus
2333 * comment state. The next character that is consumed,
2334 * if any, is the first character that will be in the
2335 * comment.
2336 */
2337 switch (c) {
2338 case '-':
2339 clearLongStrBufAndAppend(c);
2340 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
2341 break markupdeclarationopenloop;
2342 // continue stateloop;
2343 case 'd':
2344 case 'D':
2345 clearLongStrBufAndAppend(c);
2346 index = 0;
2347 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
2348 continue stateloop;
2349 case '[':
2350 if (tokenHandler.cdataSectionAllowed()) {
2351 clearLongStrBufAndAppend(c);
2352 index = 0;
2353 state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
2354 continue stateloop;
2355 }
2356 // else fall through
2357 default:
2358 errBogusComment();
2359 clearLongStrBuf();
2360 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2361 reconsume = true;
2362 continue stateloop;
2363 }
2364 }
2365 // FALLTHRU DON'T REORDER
2366 case MARKUP_DECLARATION_HYPHEN:
2367 markupdeclarationhyphenloop: for (;;) {
2368 if (++pos == endPos) {
2369 break stateloop;
2370 }
2371 c = checkChar(buf, pos);
2372 switch (c) {
2373 case '\u0000':
2374 break stateloop;
2375 case '-':
2376 clearLongStrBuf();
2377 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
2378 break markupdeclarationhyphenloop;
2379 // continue stateloop;
2380 default:
2381 errBogusComment();
2382 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2383 reconsume = true;
2384 continue stateloop;
2385 }
2386 }
2387 // FALLTHRU DON'T REORDER
2388 case COMMENT_START:
2389 commentstartloop: for (;;) {
2390 if (++pos == endPos) {
2391 break stateloop;
2392 }
2393 c = checkChar(buf, pos);
2394 /*
2395 * Comment start state
2396 *
2397 *
2398 * Consume the next input character:
2399 */
2400 switch (c) {
2401 case '-':
2402 /*
2403 * U+002D HYPHEN-MINUS (-) Switch to the comment
2404 * start dash state.
2405 */
2406 appendLongStrBuf(c);
2407 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
2408 continue stateloop;
2409 case '>':
2410 /*
2411 * U+003E GREATER-THAN SIGN (>) Parse error.
2412 */
2413 errPrematureEndOfComment();
2414 /* Emit the comment token. */
2415 emitComment(0, pos);
2416 /*
2417 * Switch to the data state.
2418 */
2419 state = transition(state, Tokenizer.DATA, reconsume, pos);
2420 continue stateloop;
2421 case '\r':
2422 appendLongStrBufCarriageReturn();
2423 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2424 break stateloop;
2425 case '\n':
2426 appendLongStrBufLineFeed();
2427 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2428 break commentstartloop;
2429 case '\u0000':
2430 c = '\uFFFD';
2431 // fall thru
2432 default:
2433 /*
2434 * Anything else Append the input character to
2435 * the comment token's data.
2436 */
2437 appendLongStrBuf(c);
2438 /*
2439 * Switch to the comment state.
2440 */
2441 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2442 break commentstartloop;
2443 // continue stateloop;
2444 }
2445 }
2446 // FALLTHRU DON'T REORDER
2447 case COMMENT:
2448 commentloop: for (;;) {
2449 if (++pos == endPos) {
2450 break stateloop;
2451 }
2452 c = checkChar(buf, pos);
2453 /*
2454 * Comment state Consume the next input character:
2455 */
2456 switch (c) {
2457 case '-':
2458 /*
2459 * U+002D HYPHEN-MINUS (-) Switch to the comment
2460 * end dash state
2461 */
2462 appendLongStrBuf(c);
2463 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2464 break commentloop;
2465 // continue stateloop;
2466 case '\r':
2467 appendLongStrBufCarriageReturn();
2468 break stateloop;
2469 case '\n':
2470 appendLongStrBufLineFeed();
2471 continue;
2472 case '\u0000':
2473 c = '\uFFFD';
2474 // fall thru
2475 default:
2476 /*
2477 * Anything else Append the input character to
2478 * the comment token's data.
2479 */
2480 appendLongStrBuf(c);
2481 /*
2482 * Stay in the comment state.
2483 */
2484 continue;
2485 }
2486 }
2487 // FALLTHRU DON'T REORDER
2488 case COMMENT_END_DASH:
2489 commentenddashloop: for (;;) {
2490 if (++pos == endPos) {
2491 break stateloop;
2492 }
2493 c = checkChar(buf, pos);
2494 /*
2495 * Comment end dash state Consume the next input
2496 * character:
2497 */
2498 switch (c) {
2499 case '-':
2500 /*
2501 * U+002D HYPHEN-MINUS (-) Switch to the comment
2502 * end state
2503 */
2504 appendLongStrBuf(c);
2505 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2506 break commentenddashloop;
2507 // continue stateloop;
2508 case '\r':
2509 appendLongStrBufCarriageReturn();
2510 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2511 break stateloop;
2512 case '\n':
2513 appendLongStrBufLineFeed();
2514 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2515 continue stateloop;
2516 case '\u0000':
2517 c = '\uFFFD';
2518 // fall thru
2519 default:
2520 /*
2521 * Anything else Append a U+002D HYPHEN-MINUS
2522 * (-) character and the input character to the
2523 * comment token's data.
2524 */
2525 appendLongStrBuf(c);
2526 /*
2527 * Switch to the comment state.
2528 */
2529 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2530 continue stateloop;
2531 }
2532 }
2533 // FALLTHRU DON'T REORDER
2534 case COMMENT_END:
2535 commentendloop: for (;;) {
2536 if (++pos == endPos) {
2537 break stateloop;
2538 }
2539 c = checkChar(buf, pos);
2540 /*
2541 * Comment end state Consume the next input
2542 * character:
2543 */
2544 switch (c) {
2545 case '>':
2546 /*
2547 * U+003E GREATER-THAN SIGN (>) Emit the comment
2548 * token.
2549 */
2550 emitComment(2, pos);
2551 /*
2552 * Switch to the data state.
2553 */
2554 state = transition(state, Tokenizer.DATA, reconsume, pos);
2555 continue stateloop;
2556 case '-':
2557 /* U+002D HYPHEN-MINUS (-) Parse error. */
2558 /*
2559 * Append a U+002D HYPHEN-MINUS (-) character to
2560 * the comment token's data.
2561 */
2562 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
2563 /*
2564 * Stay in the comment end state.
2565 */
2566 continue;
2567 case '\r':
2568 adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn();
2569 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2570 break stateloop;
2571 case '\n':
2572 adjustDoubleHyphenAndAppendToLongStrBufLineFeed();
2573 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2574 continue stateloop;
2575 case '!':
2576 errHyphenHyphenBang();
2577 appendLongStrBuf(c);
2578 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
2579 continue stateloop;
2580 case '\u0000':
2581 c = '\uFFFD';
2582 // fall thru
2583 default:
2584 /*
2585 * Append two U+002D HYPHEN-MINUS (-) characters
2586 * and the input character to the comment
2587 * token's data.
2588 */
2589 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
2590 /*
2591 * Switch to the comment state.
2592 */
2593 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2594 continue stateloop;
2595 }
2596 }
2597 // XXX reorder point
2598 case COMMENT_END_BANG:
2599 for (;;) {
2600 if (++pos == endPos) {
2601 break stateloop;
2602 }
2603 c = checkChar(buf, pos);
2604 /*
2605 * Comment end bang state
2606 *
2607 * Consume the next input character:
2608 */
2609 switch (c) {
2610 case '>':
2611 /*
2612 * U+003E GREATER-THAN SIGN (>) Emit the comment
2613 * token.
2614 */
2615 emitComment(3, pos);
2616 /*
2617 * Switch to the data state.
2618 */
2619 state = transition(state, Tokenizer.DATA, reconsume, pos);
2620 continue stateloop;
2621 case '-':
2622 /*
2623 * Append two U+002D HYPHEN-MINUS (-) characters
2624 * and a U+0021 EXCLAMATION MARK (!) character
2625 * to the comment token's data.
2626 */
2627 appendLongStrBuf(c);
2628 /*
2629 * Switch to the comment end dash state.
2630 */
2631 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2632 continue stateloop;
2633 case '\r':
2634 appendLongStrBufCarriageReturn();
2635 break stateloop;
2636 case '\n':
2637 appendLongStrBufLineFeed();
2638 continue;
2639 case '\u0000':
2640 c = '\uFFFD';
2641 // fall thru
2642 default:
2643 /*
2644 * Anything else Append two U+002D HYPHEN-MINUS
2645 * (-) characters, a U+0021 EXCLAMATION MARK (!)
2646 * character, and the input character to the
2647 * comment token's data. Switch to the comment
2648 * state.
2649 */
2650 appendLongStrBuf(c);
2651 /*
2652 * Switch to the comment state.
2653 */
2654 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2655 continue stateloop;
2656 }
2657 }
2658 // XXX reorder point
2659 case COMMENT_START_DASH:
2660 if (++pos == endPos) {
2661 break stateloop;
2662 }
2663 c = checkChar(buf, pos);
2664 /*
2665 * Comment start dash state
2666 *
2667 * Consume the next input character:
2668 */
2669 switch (c) {
2670 case '-':
2671 /*
2672 * U+002D HYPHEN-MINUS (-) Switch to the comment end
2673 * state
2674 */
2675 appendLongStrBuf(c);
2676 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2677 continue stateloop;
2678 case '>':
2679 errPrematureEndOfComment();
2680 /* Emit the comment token. */
2681 emitComment(1, pos);
2682 /*
2683 * Switch to the data state.
2684 */
2685 state = transition(state, Tokenizer.DATA, reconsume, pos);
2686 continue stateloop;
2687 case '\r':
2688 appendLongStrBufCarriageReturn();
2689 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2690 break stateloop;
2691 case '\n':
2692 appendLongStrBufLineFeed();
2693 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2694 continue stateloop;
2695 case '\u0000':
2696 c = '\uFFFD';
2697 // fall thru
2698 default:
2699 /*
2700 * Append a U+002D HYPHEN-MINUS character (-) and
2701 * the current input character to the comment
2702 * token's data.
2703 */
2704 appendLongStrBuf(c);
2705 /*
2706 * Switch to the comment state.
2707 */
2708 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2709 continue stateloop;
2710 }
2711 // XXX reorder point
2712 case CDATA_START:
2713 for (;;) {
2714 if (++pos == endPos) {
2715 break stateloop;
2716 }
2717 c = checkChar(buf, pos);
2718 if (index < 6) { // CDATA_LSQB.length
2719 if (c == Tokenizer.CDATA_LSQB[index]) {
2720 appendLongStrBuf(c);
2721 } else {
2722 errBogusComment();
2723 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2724 reconsume = true;
2725 continue stateloop;
2726 }
2727 index++;
2728 continue;
2729 } else {
2730 cstart = pos; // start coalescing
2731 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2732 reconsume = true;
2733 break; // FALL THROUGH continue stateloop;
2734 }
2735 }
2736 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2737 case CDATA_SECTION:
2738 cdatasectionloop: for (;;) {
2739 if (reconsume) {
2740 reconsume = false;
2741 } else {
2742 if (++pos == endPos) {
2743 break stateloop;
2744 }
2745 c = checkChar(buf, pos);
2746 }
2747 switch (c) {
2748 case ']':
2749 flushChars(buf, pos);
2750 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
2751 break cdatasectionloop; // FALL THROUGH
2752 case '\u0000':
2753 emitReplacementCharacter(buf, pos);
2754 continue;
2755 case '\r':
2756 emitCarriageReturn(buf, pos);
2757 break stateloop;
2758 case '\n':
2759 silentLineFeed();
2760 // fall thru
2761 default:
2762 continue;
2763 }
2764 }
2765 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2766 case CDATA_RSQB:
2767 cdatarsqb: for (;;) {
2768 if (++pos == endPos) {
2769 break stateloop;
2770 }
2771 c = checkChar(buf, pos);
2772 switch (c) {
2773 case ']':
2774 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
2775 break cdatarsqb;
2776 default:
2777 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,
2778 1);
2779 cstart = pos;
2780 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2781 reconsume = true;
2782 continue stateloop;
2783 }
2784 }
2785 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2786 case CDATA_RSQB_RSQB:
2787 if (++pos == endPos) {
2788 break stateloop;
2789 }
2790 c = checkChar(buf, pos);
2791 switch (c) {
2792 case '>':
2793 cstart = pos + 1;
2794 state = transition(state, Tokenizer.DATA, reconsume, pos);
2795 continue stateloop;
2796 default:
2797 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
2798 cstart = pos;
2799 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2800 reconsume = true;
2801 continue stateloop;
2802
2803 }
2804 // XXX reorder point
2805 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
2806 attributevaluesinglequotedloop: for (;;) {
2807 if (reconsume) {
2808 reconsume = false;
2809 } else {
2810 if (++pos == endPos) {
2811 break stateloop;
2812 }
2813 c = checkChar(buf, pos);
2814 }
2815 /*
2816 * Consume the next input character:
2817 */
2818 switch (c) {
2819 case '\'':
2820 /*
2821 * U+0027 APOSTROPHE (') Switch to the after
2822 * attribute value (quoted) state.
2823 */
2824 addAttributeWithValue();
2825
2826 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
2827 continue stateloop;
2828 case '&':
2829 /*
2830 * U+0026 AMPERSAND (&) Switch to the character
2831 * reference in attribute value state, with the
2832 * additional allowed character being U+0027
2833 * APOSTROPHE (').
2834 */
2835 clearStrBufAndAppend(c);
2836 setAdditionalAndRememberAmpersandLocation('\'');
2837 returnState = state;
2838 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2839 break attributevaluesinglequotedloop;
2840 // continue stateloop;
2841 case '\r':
2842 appendLongStrBufCarriageReturn();
2843 break stateloop;
2844 case '\n':
2845 appendLongStrBufLineFeed();
2846 continue;
2847 case '\u0000':
2848 c = '\uFFFD';
2849 // fall thru
2850 default:
2851 /*
2852 * Anything else Append the current input
2853 * character to the current attribute's value.
2854 */
2855 appendLongStrBuf(c);
2856 /*
2857 * Stay in the attribute value (single-quoted)
2858 * state.
2859 */
2860 continue;
2861 }
2862 }
2863 // FALLTHRU DON'T REORDER
2864 case CONSUME_CHARACTER_REFERENCE:
2865 if (++pos == endPos) {
2866 break stateloop;
2867 }
2868 c = checkChar(buf, pos);
2869 if (c == '\u0000') {
2870 break stateloop;
2871 }
2872 /*
2873 * Unlike the definition in the spec, this state does not
2874 * return a value and never requires the caller to
2875 * backtrack. This state takes care of emitting characters
2876 * or appending to the current attribute value. It also
2877 * takes care of that in the case when consuming the
2878 * character reference fails.
2879 */
2880 /*
2881 * This section defines how to consume a character
2882 * reference. This definition is used when parsing character
2883 * references in text and in attributes.
2884 *
2885 * The behavior depends on the identity of the next
2886 * character (the one immediately after the U+0026 AMPERSAND
2887 * character):
2888 */
2889 switch (c) {
2890 case ' ':
2891 case '\t':
2892 case '\n':
2893 case '\r': // we'll reconsume!
2894 case '\u000C':
2895 case '<':
2896 case '&':
2897 emitOrAppendStrBuf(returnState);
2898 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
2899 cstart = pos;
2900 }
2901 state = transition(state, returnState, reconsume, pos);
2902 reconsume = true;
2903 continue stateloop;
2904 case '#':
2905 /*
2906 * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
2907 * SIGN.
2908 */
2909 appendStrBuf('#');
2910 state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
2911 continue stateloop;
2912 default:
2913 if (c == additional) {
2914 emitOrAppendStrBuf(returnState);
2915 state = transition(state, returnState, reconsume, pos);
2916 reconsume = true;
2917 continue stateloop;
2918 }
2919 if (c >= 'a' && c <= 'z') {
2920 firstCharKey = c - 'a' + 26;
2921 } else if (c >= 'A' && c <= 'Z') {
2922 firstCharKey = c - 'A';
2923 } else {
2924 // No match
2925 /*
2926 * If no match can be made, then this is a parse
2927 * error.
2928 */
2929 errNoNamedCharacterMatch();
2930 emitOrAppendStrBuf(returnState);
2931 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
2932 cstart = pos;
2933 }
2934 state = transition(state, returnState, reconsume, pos);
2935 reconsume = true;
2936 continue stateloop;
2937 }
2938 // Didn't fail yet
2939 appendStrBuf(c);
2940 state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
2941 // FALL THROUGH continue stateloop;
2942 }
2943 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2944 case CHARACTER_REFERENCE_HILO_LOOKUP:
2945 {
2946 if (++pos == endPos) {
2947 break stateloop;
2948 }
2949 c = checkChar(buf, pos);
2950 if (c == '\u0000') {
2951 break stateloop;
2952 }
2953 /*
2954 * The data structure is as follows:
2955 *
2956 * HILO_ACCEL is a two-dimensional int array whose major
2957 * index corresponds to the second character of the
2958 * character reference (code point as index) and the
2959 * minor index corresponds to the first character of the
2960 * character reference (packed so that A-Z runs from 0
2961 * to 25 and a-z runs from 26 to 51). This layout makes
2962 * it easier to use the sparseness of the data structure
2963 * to omit parts of it: The second dimension of the
2964 * table is null when no character reference starts with
2965 * the character corresponding to that row.
2966 *
2967 * The int value HILO_ACCEL (by these indices) is zero
2968 * if there exists no character reference starting with
2969 * that two-letter prefix. Otherwise, the value is an
2970 * int that packs two shorts so that the higher short is
2971 * the index of the highest character reference name
2972 * with that prefix in NAMES and the lower short
2973 * corresponds to the index of the lowest character
2974 * reference name with that prefix. (It happens that the
2975 * first two character reference names share their
2976 * prefix so the packed int cannot be 0 by packing the
2977 * two shorts.)
2978 *
2979 * NAMES is an array of byte arrays where each byte
2980 * array encodes the name of a character reference as
2981 * ASCII. The names omit the first two letters of the
2982 * name. (Since storing the first two letters would be
2983 * redundant with the data contained in HILO_ACCEL.) The
2984 * entries are lexically sorted.
2985 *
2986 * For a given index in NAMES, the same index in VALUES
2987 * contains the corresponding expansion as an array of
2988 * two UTF-16 code units (either the character and
2989 * U+0000 or a surrogate pair).
2990 */
2991 int hilo = 0;
2992 if (c <= 'z') {
2993 @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
2994 if (row != null) {
2995 hilo = row[firstCharKey];
2996 }
2997 }
2998 if (hilo == 0) {
2999 /*
3000 * If no match can be made, then this is a parse
3001 * error.
3002 */
3003 errNoNamedCharacterMatch();
3004 emitOrAppendStrBuf(returnState);
3005 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3006 cstart = pos;
3007 }
3008 state = transition(state, returnState, reconsume, pos);
3009 reconsume = true;
3010 continue stateloop;
3011 }
3012 // Didn't fail yet
3013 appendStrBuf(c);
3014 lo = hilo & 0xFFFF;
3015 hi = hilo >> 16;
3016 entCol = -1;
3017 candidate = -1;
3018 strBufMark = 0;
3019 state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
3020 // FALL THROUGH continue stateloop;
3021 }
3022 case CHARACTER_REFERENCE_TAIL:
3023 outer: for (;;) {
3024 if (++pos == endPos) {
3025 break stateloop;
3026 }
3027 c = checkChar(buf, pos);
3028 if (c == '\u0000') {
3029 break stateloop;
3030 }
3031 entCol++;
3032 /*
3033 * Consume the maximum number of characters possible,
3034 * with the consumed characters matching one of the
3035 * identifiers in the first column of the named
3036 * character references table (in a case-sensitive
3037 * manner).
3038 */
3039 loloop: for (;;) {
3040 if (hi < lo) {
3041 break outer;
3042 }
3043 if (entCol == NamedCharacters.NAMES[lo].length()) {
3044 candidate = lo;
3045 strBufMark = strBufLen;
3046 lo++;
3047 } else if (entCol > NamedCharacters.NAMES[lo].length()) {
3048 break outer;
3049 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
3050 lo++;
3051 } else {
3052 break loloop;
3053 }
3054 }
3055
3056 hiloop: for (;;) {
3057 if (hi < lo) {
3058 break outer;
3059 }
3060 if (entCol == NamedCharacters.NAMES[hi].length()) {
3061 break hiloop;
3062 }
3063 if (entCol > NamedCharacters.NAMES[hi].length()) {
3064 break outer;
3065 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
3066 hi--;
3067 } else {
3068 break hiloop;
3069 }
3070 }
3071
3072 if (hi < lo) {
3073 break outer;
3074 }
3075 appendStrBuf(c);
3076 continue;
3077 }
3078
3079 if (candidate == -1) {
3080 // reconsume deals with CR, LF or nul
3081 /*
3082 * If no match can be made, then this is a parse error.
3083 */
3084 errNoNamedCharacterMatch();
3085 emitOrAppendStrBuf(returnState);
3086 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3087 cstart = pos;
3088 }
3089 state = transition(state, returnState, reconsume, pos);
3090 reconsume = true;
3091 continue stateloop;
3092 } else {
3093 // c can't be CR, LF or nul if we got here
3094 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
3095 if (candidateName.length() == 0
3096 || candidateName.charAt(candidateName.length() - 1) != ';') {
3097 /*
3098 * If the last character matched is not a U+003B
3099 * SEMICOLON (;), there is a parse error.
3100 */
3101 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3102 /*
3103 * If the entity is being consumed as part of an
3104 * attribute, and the last character matched is
3105 * not a U+003B SEMICOLON (;),
3106 */
3107 char ch;
3108 if (strBufMark == strBufLen) {
3109 ch = c;
3110 } else {
3111 // if (strBufOffset != -1) {
3112 // ch = buf[strBufOffset + strBufMark];
3113 // } else {
3114 ch = strBuf[strBufMark];
3115 // }
3116 }
3117 if (ch == '=' || (ch >= '0' && ch <= '9')
3118 || (ch >= 'A' && ch <= 'Z')
3119 || (ch >= 'a' && ch <= 'z')) {
3120 /*
3121 * and the next character is either a U+003D
3122 * EQUALS SIGN character (=) or in the range
3123 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
3124 * U+0041 LATIN CAPITAL LETTER A to U+005A
3125 * LATIN CAPITAL LETTER Z, or U+0061 LATIN
3126 * SMALL LETTER A to U+007A LATIN SMALL
3127 * LETTER Z, then, for historical reasons,
3128 * all the characters that were matched
3129 * after the U+0026 AMPERSAND (&) must be
3130 * unconsumed, and nothing is returned.
3131 */
3132 errNoNamedCharacterMatch();
3133 appendStrBufToLongStrBuf();
3134 state = transition(state, returnState, reconsume, pos);
3135 reconsume = true;
3136 continue stateloop;
3137 }
3138 }
3139 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3140 errUnescapedAmpersandInterpretedAsCharacterReference();
3141 } else {
3142 errNotSemicolonTerminated();
3143 }
3144 }
3145
3146 /*
3147 * Otherwise, return a character token for the character
3148 * corresponding to the entity name (as given by the
3149 * second column of the named character references
3150 * table).
3151 */
3152 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
3153 if (
3154 // [NOCPP[
3155 val.length == 1
3156 // ]NOCPP]
3157 // CPPONLY: val[1] == 0
3158 ) {
3159 emitOrAppendOne(val, returnState);
3160 } else {
3161 emitOrAppendTwo(val, returnState);
3162 }
3163 // this is so complicated!
3164 if (strBufMark < strBufLen) {
3165 // if (strBufOffset != -1) {
3166 // if ((returnState & (~1)) != 0) {
3167 // for (int i = strBufMark; i < strBufLen; i++) {
3168 // appendLongStrBuf(buf[strBufOffset + i]);
3169 // }
3170 // } else {
3171 // tokenHandler.characters(buf, strBufOffset
3172 // + strBufMark, strBufLen
3173 // - strBufMark);
3174 // }
3175 // } else {
3176 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3177 for (int i = strBufMark; i < strBufLen; i++) {
3178 appendLongStrBuf(strBuf[i]);
3179 }
3180 } else {
3181 tokenHandler.characters(strBuf, strBufMark,
3182 strBufLen - strBufMark);
3183 }
3184 // }
3185 }
3186 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3187 cstart = pos;
3188 }
3189 state = transition(state, returnState, reconsume, pos);
3190 reconsume = true;
3191 continue stateloop;
3192 /*
3193 * If the markup contains I'm &notit; I tell you, the
3194 * entity is parsed as "not", as in, I'm ¬it; I tell
3195 * you. But if the markup was I'm &notin; I tell you,
3196 * the entity would be parsed as "notin;", resulting in
3197 * I'm ∉ I tell you.
3198 */
3199 }
3200 // XXX reorder point
3201 case CONSUME_NCR:
3202 if (++pos == endPos) {
3203 break stateloop;
3204 }
3205 c = checkChar(buf, pos);
3206 prevValue = -1;
3207 value = 0;
3208 seenDigits = false;
3209 /*
3210 * The behavior further depends on the character after the
3211 * U+0023 NUMBER SIGN:
3212 */
3213 switch (c) {
3214 case 'x':
3215 case 'X':
3216
3217 /*
3218 * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
3219 * LETTER X Consume the X.
3220 *
3221 * Follow the steps below, but using the range of
3222 * characters U+0030 DIGIT ZERO through to U+0039
3223 * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
3224 * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
3225 * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
3226 * LETTER F (in other words, 0-9, A-F, a-f).
3227 *
3228 * When it comes to interpreting the number,
3229 * interpret it as a hexadecimal number.
3230 */
3231 appendStrBuf(c);
3232 state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
3233 continue stateloop;
3234 default:
3235 /*
3236 * Anything else Follow the steps below, but using
3237 * the range of characters U+0030 DIGIT ZERO through
3238 * to U+0039 DIGIT NINE (i.e. just 0-9).
3239 *
3240 * When it comes to interpreting the number,
3241 * interpret it as a decimal number.
3242 */
3243 state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
3244 reconsume = true;
3245 // FALL THROUGH continue stateloop;
3246 }
3247 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3248 case DECIMAL_NRC_LOOP:
3249 decimalloop: for (;;) {
3250 if (reconsume) {
3251 reconsume = false;
3252 } else {
3253 if (++pos == endPos) {
3254 break stateloop;
3255 }
3256 c = checkChar(buf, pos);
3257 }
3258 // Deal with overflow gracefully
3259 if (value < prevValue) {
3260 value = 0x110000; // Value above Unicode range but
3261 // within int
3262 // range
3263 }
3264 prevValue = value;
3265 /*
3266 * Consume as many characters as match the range of
3267 * characters given above.
3268 */
3269 if (c >= '0' && c <= '9') {
3270 seenDigits = true;
3271 value *= 10;
3272 value += c - '0';
3273 continue;
3274 } else if (c == ';') {
3275 if (seenDigits) {
3276 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3277 cstart = pos + 1;
3278 }
3279 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3280 // FALL THROUGH continue stateloop;
3281 break decimalloop;
3282 } else {
3283 errNoDigitsInNCR();
3284 appendStrBuf(';');
3285 emitOrAppendStrBuf(returnState);
3286 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3287 cstart = pos + 1;
3288 }
3289 state = transition(state, returnState, reconsume, pos);
3290 continue stateloop;
3291 }
3292 } else {
3293 /*
3294 * If no characters match the range, then don't
3295 * consume any characters (and unconsume the U+0023
3296 * NUMBER SIGN character and, if appropriate, the X
3297 * character). This is a parse error; nothing is
3298 * returned.
3299 *
3300 * Otherwise, if the next character is a U+003B
3301 * SEMICOLON, consume that too. If it isn't, there
3302 * is a parse error.
3303 */
3304 if (!seenDigits) {
3305 errNoDigitsInNCR();
3306 emitOrAppendStrBuf(returnState);
3307 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3308 cstart = pos;
3309 }
3310 state = transition(state, returnState, reconsume, pos);
3311 reconsume = true;
3312 continue stateloop;
3313 } else {
3314 errCharRefLacksSemicolon();
3315 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3316 cstart = pos;
3317 }
3318 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3319 reconsume = true;
3320 // FALL THROUGH continue stateloop;
3321 break decimalloop;
3322 }
3323 }
3324 }
3325 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3326 case HANDLE_NCR_VALUE:
3327 // WARNING previous state sets reconsume
3328 // XXX inline this case if the method size can take it
3329 handleNcrValue(returnState);
3330 state = transition(state, returnState, reconsume, pos);
3331 continue stateloop;
3332 // XXX reorder point
3333 case HEX_NCR_LOOP:
3334 for (;;) {
3335 if (++pos == endPos) {
3336 break stateloop;
3337 }
3338 c = checkChar(buf, pos);
3339 // Deal with overflow gracefully
3340 if (value < prevValue) {
3341 value = 0x110000; // Value above Unicode range but
3342 // within int
3343 // range
3344 }
3345 prevValue = value;
3346 /*
3347 * Consume as many characters as match the range of
3348 * characters given above.
3349 */
3350 if (c >= '0' && c <= '9') {
3351 seenDigits = true;
3352 value *= 16;
3353 value += c - '0';
3354 continue;
3355 } else if (c >= 'A' && c <= 'F') {
3356 seenDigits = true;
3357 value *= 16;
3358 value += c - 'A' + 10;
3359 continue;
3360 } else if (c >= 'a' && c <= 'f') {
3361 seenDigits = true;
3362 value *= 16;
3363 value += c - 'a' + 10;
3364 continue;
3365 } else if (c == ';') {
3366 if (seenDigits) {
3367 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3368 cstart = pos + 1;
3369 }
3370 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3371 continue stateloop;
3372 } else {
3373 errNoDigitsInNCR();
3374 appendStrBuf(';');
3375 emitOrAppendStrBuf(returnState);
3376 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3377 cstart = pos + 1;
3378 }
3379 state = transition(state, returnState, reconsume, pos);
3380 continue stateloop;
3381 }
3382 } else {
3383 /*
3384 * If no characters match the range, then don't
3385 * consume any characters (and unconsume the U+0023
3386 * NUMBER SIGN character and, if appropriate, the X
3387 * character). This is a parse error; nothing is
3388 * returned.
3389 *
3390 * Otherwise, if the next character is a U+003B
3391 * SEMICOLON, consume that too. If it isn't, there
3392 * is a parse error.
3393 */
3394 if (!seenDigits) {
3395 errNoDigitsInNCR();
3396 emitOrAppendStrBuf(returnState);
3397 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3398 cstart = pos;
3399 }
3400 state = transition(state, returnState, reconsume, pos);
3401 reconsume = true;
3402 continue stateloop;
3403 } else {
3404 errCharRefLacksSemicolon();
3405 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3406 cstart = pos;
3407 }
3408 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3409 reconsume = true;
3410 continue stateloop;
3411 }
3412 }
3413 }
3414 // XXX reorder point
3415 case PLAINTEXT:
3416 plaintextloop: for (;;) {
3417 if (reconsume) {
3418 reconsume = false;
3419 } else {
3420 if (++pos == endPos) {
3421 break stateloop;
3422 }
3423 c = checkChar(buf, pos);
3424 }
3425 switch (c) {
3426 case '\u0000':
3427 emitPlaintextReplacementCharacter(buf, pos);
3428 continue;
3429 case '\r':
3430 emitCarriageReturn(buf, pos);
3431 break stateloop;
3432 case '\n':
3433 silentLineFeed();
3434 default:
3435 /*
3436 * Anything else Emit the current input
3437 * character as a character token. Stay in the
3438 * PLAINTEXT state.
3439 */
3440 continue;
3441 }
3442 }
3443 // XXX reorder point
3444 case CLOSE_TAG_OPEN:
3445 if (++pos == endPos) {
3446 break stateloop;
3447 }
3448 c = checkChar(buf, pos);
3449 /*
3450 * Otherwise, if the content model flag is set to the PCDATA
3451 * state, or if the next few characters do match that tag
3452 * name, consume the next input character:
3453 */
3454 switch (c) {
3455 case '>':
3456 /* U+003E GREATER-THAN SIGN (>) Parse error. */
3457 errLtSlashGt();
3458 /*
3459 * Switch to the data state.
3460 */
3461 cstart = pos + 1;
3462 state = transition(state, Tokenizer.DATA, reconsume, pos);
3463 continue stateloop;
3464 case '\r':
3465 silentCarriageReturn();
3466 /* Anything else Parse error. */
3467 errGarbageAfterLtSlash();
3468 /*
3469 * Switch to the bogus comment state.
3470 */
3471 clearLongStrBufAndAppend('\n');
3472 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3473 break stateloop;
3474 case '\n':
3475 silentLineFeed();
3476 /* Anything else Parse error. */
3477 errGarbageAfterLtSlash();
3478 /*
3479 * Switch to the bogus comment state.
3480 */
3481 clearLongStrBufAndAppend('\n');
3482 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3483 continue stateloop;
3484 case '\u0000':
3485 c = '\uFFFD';
3486 // fall thru
3487 default:
3488 if (c >= 'A' && c <= 'Z') {
3489 c += 0x20;
3490 }
3491 if (c >= 'a' && c <= 'z') {
3492 /*
3493 * U+0061 LATIN SMALL LETTER A through to U+007A
3494 * LATIN SMALL LETTER Z Create a new end tag
3495 * token,
3496 */
3497 endTag = true;
3498 /*
3499 * set its tag name to the input character,
3500 */
3501 clearStrBufAndAppend(c);
3502 /*
3503 * then switch to the tag name state. (Don't
3504 * emit the token yet; further details will be
3505 * filled in before it is emitted.)
3506 */
3507 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
3508 continue stateloop;
3509 } else {
3510 /* Anything else Parse error. */
3511 errGarbageAfterLtSlash();
3512 /*
3513 * Switch to the bogus comment state.
3514 */
3515 clearLongStrBufAndAppend(c);
3516 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3517 continue stateloop;
3518 }
3519 }
3520 // XXX reorder point
3521 case RCDATA:
3522 rcdataloop: for (;;) {
3523 if (reconsume) {
3524 reconsume = false;
3525 } else {
3526 if (++pos == endPos) {
3527 break stateloop;
3528 }
3529 c = checkChar(buf, pos);
3530 }
3531 switch (c) {
3532 case '&':
3533 /*
3534 * U+0026 AMPERSAND (&) Switch to the character
3535 * reference in RCDATA state.
3536 */
3537 flushChars(buf, pos);
3538 clearStrBufAndAppend(c);
3539 additional = '\u0000';
3540 returnState = state;
3541 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
3542 continue stateloop;
3543 case '<':
3544 /*
3545 * U+003C LESS-THAN SIGN (<) Switch to the
3546 * RCDATA less-than sign state.
3547 */
3548 flushChars(buf, pos);
3549
3550 returnState = state;
3551 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
3552 continue stateloop;
3553 case '\u0000':
3554 emitReplacementCharacter(buf, pos);
3555 continue;
3556 case '\r':
3557 emitCarriageReturn(buf, pos);
3558 break stateloop;
3559 case '\n':
3560 silentLineFeed();
3561 default:
3562 /*
3563 * Emit the current input character as a
3564 * character token. Stay in the RCDATA state.
3565 */
3566 continue;
3567 }
3568 }
3569 // XXX reorder point
3570 case RAWTEXT:
3571 rawtextloop: for (;;) {
3572 if (reconsume) {
3573 reconsume = false;
3574 } else {
3575 if (++pos == endPos) {
3576 break stateloop;
3577 }
3578 c = checkChar(buf, pos);
3579 }
3580 switch (c) {
3581 case '<':
3582 /*
3583 * U+003C LESS-THAN SIGN (<) Switch to the
3584 * RAWTEXT less-than sign state.
3585 */
3586 flushChars(buf, pos);
3587
3588 returnState = state;
3589 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
3590 break rawtextloop;
3591 // FALL THRU continue stateloop;
3592 case '\u0000':
3593 emitReplacementCharacter(buf, pos);
3594 continue;
3595 case '\r':
3596 emitCarriageReturn(buf, pos);
3597 break stateloop;
3598 case '\n':
3599 silentLineFeed();
3600 default:
3601 /*
3602 * Emit the current input character as a
3603 * character token. Stay in the RAWTEXT state.
3604 */
3605 continue;
3606 }
3607 }
3608 // XXX fallthru don't reorder
3609 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
3610 rawtextrcdatalessthansignloop: for (;;) {
3611 if (++pos == endPos) {
3612 break stateloop;
3613 }
3614 c = checkChar(buf, pos);
3615 switch (c) {
3616 case '/':
3617 /*
3618 * U+002F SOLIDUS (/) Set the temporary buffer
3619 * to the empty string. Switch to the RCDATA or
3620 * RAWTEXT end tag open state.
3621 */
3622 index = 0;
3623 clearStrBuf();
3624 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
3625 break rawtextrcdatalessthansignloop;
3626 // FALL THRU continue stateloop;
3627 default:
3628 /*
3629 * Otherwise, emit a U+003C LESS-THAN SIGN
3630 * character token
3631 */
3632 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
3633 /*
3634 * and reconsume the current input character in
3635 * the state saved in returnState.
3636 */
3637 cstart = pos;
3638 state = transition(state, returnState, reconsume, pos);
3639 reconsume = true;
3640 continue stateloop;
3641 }
3642 }
3643 // XXX fall thru. don't reorder.
3644 case NON_DATA_END_TAG_NAME:
3645 for (;;) {
3646 if (++pos == endPos) {
3647 break stateloop;
3648 }
3649 c = checkChar(buf, pos);
3650 /*
3651 * ASSERT! when entering this state, set index to 0 and
3652 * call clearStrBuf() assert (contentModelElement !=
3653 * null); Let's implement the above without lookahead.
3654 * strBuf is the 'temporary buffer'.
3655 */
3656 if (index < endTagExpectationAsArray.length) {
3657 char e = endTagExpectationAsArray[index];
3658 char folded = c;
3659 if (c >= 'A' && c <= 'Z') {
3660 folded += 0x20;
3661 }
3662 if (folded != e) {
3663 // [NOCPP[
3664 errHtml4LtSlashInRcdata(folded);
3665 // ]NOCPP]
3666 tokenHandler.characters(Tokenizer.LT_SOLIDUS,
3667 0, 2);
3668 emitStrBuf();
3669 cstart = pos;
3670 state = transition(state, returnState, reconsume, pos);
3671 reconsume = true;
3672 continue stateloop;
3673 }
3674 appendStrBuf(c);
3675 index++;
3676 continue;
3677 } else {
3678 endTag = true;
3679 // XXX replace contentModelElement with different
3680 // type
3681 tagName = endTagExpectation;
3682 switch (c) {
3683 case '\r':
3684 silentCarriageReturn();
3685 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
3686 break stateloop;
3687 case '\n':
3688 silentLineFeed();
3689 // fall thru
3690 case ' ':
3691 case '\t':
3692 case '\u000C':
3693 /*
3694 * U+0009 CHARACTER TABULATION U+000A LINE
3695 * FEED (LF) U+000C FORM FEED (FF) U+0020
3696 * SPACE If the current end tag token is an
3697 * appropriate end tag token, then switch to
3698 * the before attribute name state.
3699 */
3700 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
3701 continue stateloop;
3702 case '/':
3703 /*
3704 * U+002F SOLIDUS (/) If the current end tag
3705 * token is an appropriate end tag token,
3706 * then switch to the self-closing start tag
3707 * state.
3708 */
3709 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
3710 continue stateloop;
3711 case '>':
3712 /*
3713 * U+003E GREATER-THAN SIGN (>) If the
3714 * current end tag token is an appropriate
3715 * end tag token, then emit the current tag
3716 * token and switch to the data state.
3717 */
3718 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
3719 if (shouldSuspend) {
3720 break stateloop;
3721 }
3722 continue stateloop;
3723 default:
3724 /*
3725 * Emit a U+003C LESS-THAN SIGN character
3726 * token, a U+002F SOLIDUS character token,
3727 * a character token for each of the
3728 * characters in the temporary buffer (in
3729 * the order they were added to the buffer),
3730 * and reconsume the current input character
3731 * in the RAWTEXT state.
3732 */
3733 // [NOCPP[
3734 errWarnLtSlashInRcdata();
3735 // ]NOCPP]
3736 tokenHandler.characters(
3737 Tokenizer.LT_SOLIDUS, 0, 2);
3738 emitStrBuf();
3739 if (c == '\u0000') {
3740 emitReplacementCharacter(buf, pos);
3741 } else {
3742 cstart = pos; // don't drop the
3743 // character
3744 }
3745 state = transition(state, returnState, reconsume, pos);
3746 continue stateloop;
3747 }
3748 }
3749 }
3750 // XXX reorder point
3751 // BEGIN HOTSPOT WORKAROUND
3752 case BOGUS_COMMENT:
3753 boguscommentloop: for (;;) {
3754 if (reconsume) {
3755 reconsume = false;
3756 } else {
3757 if (++pos == endPos) {
3758 break stateloop;
3759 }
3760 c = checkChar(buf, pos);
3761 }
3762 /*
3763 * Consume every character up to and including the first
3764 * U+003E GREATER-THAN SIGN character (>) or the end of
3765 * the file (EOF), whichever comes first. Emit a comment
3766 * token whose data is the concatenation of all the
3767 * characters starting from and including the character
3768 * that caused the state machine to switch into the
3769 * bogus comment state, up to and including the
3770 * character immediately before the last consumed
3771 * character (i.e. up to the character just before the
3772 * U+003E or EOF character). (If the comment was started
3773 * by the end of the file (EOF), the token is empty.)
3774 *
3775 * Switch to the data state.
3776 *
3777 * If the end of the file was reached, reconsume the EOF
3778 * character.
3779 */
3780 switch (c) {
3781 case '>':
3782 emitComment(0, pos);
3783 state = transition(state, Tokenizer.DATA, reconsume, pos);
3784 continue stateloop;
3785 case '-':
3786 appendLongStrBuf(c);
3787 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
3788 break boguscommentloop;
3789 case '\r':
3790 appendLongStrBufCarriageReturn();
3791 break stateloop;
3792 case '\n':
3793 appendLongStrBufLineFeed();
3794 continue;
3795 case '\u0000':
3796 c = '\uFFFD';
3797 // fall thru
3798 default:
3799 appendLongStrBuf(c);
3800 continue;
3801 }
3802 }
3803 // FALLTHRU DON'T REORDER
3804 case BOGUS_COMMENT_HYPHEN:
3805 boguscommenthyphenloop: for (;;) {
3806 if (++pos == endPos) {
3807 break stateloop;
3808 }
3809 c = checkChar(buf, pos);
3810 switch (c) {
3811 case '>':
3812 // [NOCPP[
3813 maybeAppendSpaceToBogusComment();
3814 // ]NOCPP]
3815 emitComment(0, pos);
3816 state = transition(state, Tokenizer.DATA, reconsume, pos);
3817 continue stateloop;
3818 case '-':
3819 appendSecondHyphenToBogusComment();
3820 continue boguscommenthyphenloop;
3821 case '\r':
3822 appendLongStrBufCarriageReturn();
3823 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3824 break stateloop;
3825 case '\n':
3826 appendLongStrBufLineFeed();
3827 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3828 continue stateloop;
3829 case '\u0000':
3830 c = '\uFFFD';
3831 // fall thru
3832 default:
3833 appendLongStrBuf(c);
3834 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3835 continue stateloop;
3836 }
3837 }
3838 // XXX reorder point
3839 case SCRIPT_DATA:
3840 scriptdataloop: for (;;) {
3841 if (reconsume) {
3842 reconsume = false;
3843 } else {
3844 if (++pos == endPos) {
3845 break stateloop;
3846 }
3847 c = checkChar(buf, pos);
3848 }
3849 switch (c) {
3850 case '<':
3851 /*
3852 * U+003C LESS-THAN SIGN (<) Switch to the
3853 * script data less-than sign state.
3854 */
3855 flushChars(buf, pos);
3856 returnState = state;
3857 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
3858 break scriptdataloop; // FALL THRU continue
3859 // stateloop;
3860 case '\u0000':
3861 emitReplacementCharacter(buf, pos);
3862 continue;
3863 case '\r':
3864 emitCarriageReturn(buf, pos);
3865 break stateloop;
3866 case '\n':
3867 silentLineFeed();
3868 default:
3869 /*
3870 * Anything else Emit the current input
3871 * character as a character token. Stay in the
3872 * script data state.
3873 */
3874 continue;
3875 }
3876 }
3877 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3878 case SCRIPT_DATA_LESS_THAN_SIGN:
3879 scriptdatalessthansignloop: for (;;) {
3880 if (++pos == endPos) {
3881 break stateloop;
3882 }
3883 c = checkChar(buf, pos);
3884 switch (c) {
3885 case '/':
3886 /*
3887 * U+002F SOLIDUS (/) Set the temporary buffer
3888 * to the empty string. Switch to the script
3889 * data end tag open state.
3890 */
3891 index = 0;
3892 clearStrBuf();
3893 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
3894 continue stateloop;
3895 case '!':
3896 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
3897 cstart = pos;
3898 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
3899 break scriptdatalessthansignloop; // FALL THRU
3900 // continue
3901 // stateloop;
3902 default:
3903 /*
3904 * Otherwise, emit a U+003C LESS-THAN SIGN
3905 * character token
3906 */
3907 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
3908 /*
3909 * and reconsume the current input character in
3910 * the script data state.
3911 */
3912 cstart = pos;
3913 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
3914 reconsume = true;
3915 continue stateloop;
3916 }
3917 }
3918 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3919 case SCRIPT_DATA_ESCAPE_START:
3920 scriptdataescapestartloop: for (;;) {
3921 if (++pos == endPos) {
3922 break stateloop;
3923 }
3924 c = checkChar(buf, pos);
3925 /*
3926 * Consume the next input character:
3927 */
3928 switch (c) {
3929 case '-':
3930 /*
3931 * U+002D HYPHEN-MINUS (-) Emit a U+002D
3932 * HYPHEN-MINUS character token. Switch to the
3933 * script data escape start dash state.
3934 */
3935 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);
3936 break scriptdataescapestartloop; // FALL THRU
3937 // continue
3938 // stateloop;
3939 default:
3940 /*
3941 * Anything else Reconsume the current input
3942 * character in the script data state.
3943 */
3944 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
3945 reconsume = true;
3946 continue stateloop;
3947 }
3948 }
3949 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3950 case SCRIPT_DATA_ESCAPE_START_DASH:
3951 scriptdataescapestartdashloop: for (;;) {
3952 if (++pos == endPos) {
3953 break stateloop;
3954 }
3955 c = checkChar(buf, pos);
3956 /*
3957 * Consume the next input character:
3958 */
3959 switch (c) {
3960 case '-':
3961 /*
3962 * U+002D HYPHEN-MINUS (-) Emit a U+002D
3963 * HYPHEN-MINUS character token. Switch to the
3964 * script data escaped dash dash state.
3965 */
3966 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
3967 break scriptdataescapestartdashloop;
3968 // continue stateloop;
3969 default:
3970 /*
3971 * Anything else Reconsume the current input
3972 * character in the script data state.
3973 */
3974 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
3975 reconsume = true;
3976 continue stateloop;
3977 }
3978 }
3979 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3980 case SCRIPT_DATA_ESCAPED_DASH_DASH:
3981 scriptdataescapeddashdashloop: for (;;) {
3982 if (++pos == endPos) {
3983 break stateloop;
3984 }
3985 c = checkChar(buf, pos);
3986 /*
3987 * Consume the next input character:
3988 */
3989 switch (c) {
3990 case '-':
3991 /*
3992 * U+002D HYPHEN-MINUS (-) Emit a U+002D
3993 * HYPHEN-MINUS character token. Stay in the
3994 * script data escaped dash dash state.
3995 */
3996 continue;
3997 case '<':
3998 /*
3999 * U+003C LESS-THAN SIGN (<) Switch to the
4000 * script data escaped less-than sign state.
4001 */
4002 flushChars(buf, pos);
4003 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4004 continue stateloop;
4005 case '>':
4006 /*
4007 * U+003E GREATER-THAN SIGN (>) Emit a U+003E
4008 * GREATER-THAN SIGN character token. Switch to
4009 * the script data state.
4010 */
4011 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4012 continue stateloop;
4013 case '\u0000':
4014 emitReplacementCharacter(buf, pos);
4015 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4016 break scriptdataescapeddashdashloop;
4017 case '\r':
4018 emitCarriageReturn(buf, pos);
4019 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4020 break stateloop;
4021 case '\n':
4022 silentLineFeed();
4023 default:
4024 /*
4025 * Anything else Emit the current input
4026 * character as a character token. Switch to the
4027 * script data escaped state.
4028 */
4029 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4030 break scriptdataescapeddashdashloop;
4031 // continue stateloop;
4032 }
4033 }
4034 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4035 case SCRIPT_DATA_ESCAPED:
4036 scriptdataescapedloop: for (;;) {
4037 if (reconsume) {
4038 reconsume = false;
4039 } else {
4040 if (++pos == endPos) {
4041 break stateloop;
4042 }
4043 c = checkChar(buf, pos);
4044 }
4045 /*
4046 * Consume the next input character:
4047 */
4048 switch (c) {
4049 case '-':
4050 /*
4051 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4052 * HYPHEN-MINUS character token. Switch to the
4053 * script data escaped dash state.
4054 */
4055 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
4056 break scriptdataescapedloop; // FALL THRU
4057 // continue
4058 // stateloop;
4059 case '<':
4060 /*
4061 * U+003C LESS-THAN SIGN (<) Switch to the
4062 * script data escaped less-than sign state.
4063 */
4064 flushChars(buf, pos);
4065 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4066 continue stateloop;
4067 case '\u0000':
4068 emitReplacementCharacter(buf, pos);
4069 continue;
4070 case '\r':
4071 emitCarriageReturn(buf, pos);
4072 break stateloop;
4073 case '\n':
4074 silentLineFeed();
4075 default:
4076 /*
4077 * Anything else Emit the current input
4078 * character as a character token. Stay in the
4079 * script data escaped state.
4080 */
4081 continue;
4082 }
4083 }
4084 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4085 case SCRIPT_DATA_ESCAPED_DASH:
4086 scriptdataescapeddashloop: for (;;) {
4087 if (++pos == endPos) {
4088 break stateloop;
4089 }
4090 c = checkChar(buf, pos);
4091 /*
4092 * Consume the next input character:
4093 */
4094 switch (c) {
4095 case '-':
4096 /*
4097 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4098 * HYPHEN-MINUS character token. Switch to the
4099 * script data escaped dash dash state.
4100 */
4101 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
4102 continue stateloop;
4103 case '<':
4104 /*
4105 * U+003C LESS-THAN SIGN (<) Switch to the
4106 * script data escaped less-than sign state.
4107 */
4108 flushChars(buf, pos);
4109 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4110 break scriptdataescapeddashloop;
4111 // continue stateloop;
4112 case '\u0000':
4113 emitReplacementCharacter(buf, pos);
4114 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4115 continue stateloop;
4116 case '\r':
4117 emitCarriageReturn(buf, pos);
4118 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4119 break stateloop;
4120 case '\n':
4121 silentLineFeed();
4122 default:
4123 /*
4124 * Anything else Emit the current input
4125 * character as a character token. Switch to the
4126 * script data escaped state.
4127 */
4128 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4129 continue stateloop;
4130 }
4131 }
4132 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4133 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
4134 scriptdataescapedlessthanloop: for (;;) {
4135 if (++pos == endPos) {
4136 break stateloop;
4137 }
4138 c = checkChar(buf, pos);
4139 /*
4140 * Consume the next input character:
4141 */
4142 switch (c) {
4143 case '/':
4144 /*
4145 * U+002F SOLIDUS (/) Set the temporary buffer
4146 * to the empty string. Switch to the script
4147 * data escaped end tag open state.
4148 */
4149 index = 0;
4150 clearStrBuf();
4151 returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
4152 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
4153 continue stateloop;
4154 case 'S':
4155 case 's':
4156 /*
4157 * U+0053 LATIN CAPITAL LETTER S or U+0073 LATIN
4158 * SMALL LETTER S Emit a U+003C LESS-THAN SIGN
4159 * character token and the current input
4160 * character as a character token.
4161 */
4162 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4163 cstart = pos;
4164 index = 1;
4165 /*
4166 * Set the temporary buffer to the empty string.
4167 * Append the lowercase version of the current
4168 * input character (add 0x0020 to the
4169 * character's code point) to the temporary
4170 * buffer. Switch to the script data double
4171 * escape start state.
4172 */
4173 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
4174 break scriptdataescapedlessthanloop;
4175 // continue stateloop;
4176 default:
4177 /*
4178 * Anything else Emit a U+003C LESS-THAN SIGN
4179 * character token and reconsume the current
4180 * input character in the script data escaped
4181 * state.
4182 */
4183 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4184 cstart = pos;
4185 reconsume = true;
4186 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4187 continue stateloop;
4188 }
4189 }
4190 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4191 case SCRIPT_DATA_DOUBLE_ESCAPE_START:
4192 scriptdatadoubleescapestartloop: for (;;) {
4193 if (++pos == endPos) {
4194 break stateloop;
4195 }
4196 c = checkChar(buf, pos);
4197 assert (index > 0);
4198 if (index < 6) { // SCRIPT_ARR.length
4199 char folded = c;
4200 if (c >= 'A' && c <= 'Z') {
4201 folded += 0x20;
4202 }
4203 if (folded != Tokenizer.SCRIPT_ARR[index]) {
4204 reconsume = true;
4205 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4206 continue stateloop;
4207 }
4208 index++;
4209 continue;
4210 }
4211 switch (c) {
4212 case '\r':
4213 emitCarriageReturn(buf, pos);
4214 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4215 break stateloop;
4216 case '\n':
4217 silentLineFeed();
4218 case ' ':
4219 case '\t':
4220 case '\u000C':
4221 case '/':
4222 case '>':
4223 /*
4224 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4225 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4226 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
4227 * (>) Emit the current input character as a
4228 * character token. If the temporary buffer is
4229 * the string "script", then switch to the
4230 * script data double escaped state.
4231 */
4232 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4233 break scriptdatadoubleescapestartloop;
4234 // continue stateloop;
4235 default:
4236 /*
4237 * Anything else Reconsume the current input
4238 * character in the script data escaped state.
4239 */
4240 reconsume = true;
4241 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4242 continue stateloop;
4243 }
4244 }
4245 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4246 case SCRIPT_DATA_DOUBLE_ESCAPED:
4247 scriptdatadoubleescapedloop: for (;;) {
4248 if (reconsume) {
4249 reconsume = false;
4250 } else {
4251 if (++pos == endPos) {
4252 break stateloop;
4253 }
4254 c = checkChar(buf, pos);
4255 }
4256 /*
4257 * Consume the next input character:
4258 */
4259 switch (c) {
4260 case '-':
4261 /*
4262 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4263 * HYPHEN-MINUS character token. Switch to the
4264 * script data double escaped dash state.
4265 */
4266 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
4267 break scriptdatadoubleescapedloop; // FALL THRU
4268 // continue
4269 // stateloop;
4270 case '<':
4271 /*
4272 * U+003C LESS-THAN SIGN (<) Emit a U+003C
4273 * LESS-THAN SIGN character token. Switch to the
4274 * script data double escaped less-than sign
4275 * state.
4276 */
4277 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4278 continue stateloop;
4279 case '\u0000':
4280 emitReplacementCharacter(buf, pos);
4281 continue;
4282 case '\r':
4283 emitCarriageReturn(buf, pos);
4284 break stateloop;
4285 case '\n':
4286 silentLineFeed();
4287 default:
4288 /*
4289 * Anything else Emit the current input
4290 * character as a character token. Stay in the
4291 * script data double escaped state.
4292 */
4293 continue;
4294 }
4295 }
4296 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4297 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
4298 scriptdatadoubleescapeddashloop: for (;;) {
4299 if (++pos == endPos) {
4300 break stateloop;
4301 }
4302 c = checkChar(buf, pos);
4303 /*
4304 * Consume the next input character:
4305 */
4306 switch (c) {
4307 case '-':
4308 /*
4309 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4310 * HYPHEN-MINUS character token. Switch to the
4311 * script data double escaped dash dash state.
4312 */
4313 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
4314 break scriptdatadoubleescapeddashloop;
4315 // continue stateloop;
4316 case '<':
4317 /*
4318 * U+003C LESS-THAN SIGN (<) Emit a U+003C
4319 * LESS-THAN SIGN character token. Switch to the
4320 * script data double escaped less-than sign
4321 * state.
4322 */
4323 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4324 continue stateloop;
4325 case '\u0000':
4326 emitReplacementCharacter(buf, pos);
4327 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4328 continue stateloop;
4329 case '\r':
4330 emitCarriageReturn(buf, pos);
4331 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4332 break stateloop;
4333 case '\n':
4334 silentLineFeed();
4335 default:
4336 /*
4337 * Anything else Emit the current input
4338 * character as a character token. Switch to the
4339 * script data double escaped state.
4340 */
4341 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4342 continue stateloop;
4343 }
4344 }
4345 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4346 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
4347 scriptdatadoubleescapeddashdashloop: for (;;) {
4348 if (++pos == endPos) {
4349 break stateloop;
4350 }
4351 c = checkChar(buf, pos);
4352 /*
4353 * Consume the next input character:
4354 */
4355 switch (c) {
4356 case '-':
4357 /*
4358 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4359 * HYPHEN-MINUS character token. Stay in the
4360 * script data double escaped dash dash state.
4361 */
4362 continue;
4363 case '<':
4364 /*
4365 * U+003C LESS-THAN SIGN (<) Emit a U+003C
4366 * LESS-THAN SIGN character token. Switch to the
4367 * script data double escaped less-than sign
4368 * state.
4369 */
4370 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4371 break scriptdatadoubleescapeddashdashloop;
4372 case '>':
4373 /*
4374 * U+003E GREATER-THAN SIGN (>) Emit a U+003E
4375 * GREATER-THAN SIGN character token. Switch to
4376 * the script data state.
4377 */
4378 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4379 continue stateloop;
4380 case '\u0000':
4381 emitReplacementCharacter(buf, pos);
4382 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4383 continue stateloop;
4384 case '\r':
4385 emitCarriageReturn(buf, pos);
4386 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4387 break stateloop;
4388 case '\n':
4389 silentLineFeed();
4390 default:
4391 /*
4392 * Anything else Emit the current input
4393 * character as a character token. Switch to the
4394 * script data double escaped state.
4395 */
4396 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4397 continue stateloop;
4398 }
4399 }
4400 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4401 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
4402 scriptdatadoubleescapedlessthanloop: for (;;) {
4403 if (++pos == endPos) {
4404 break stateloop;
4405 }
4406 c = checkChar(buf, pos);
4407 /*
4408 * Consume the next input character:
4409 */
4410 switch (c) {
4411 case '/':
4412 /*
4413 * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
4414 * character token. Set the temporary buffer to
4415 * the empty string. Switch to the script data
4416 * double escape end state.
4417 */
4418 index = 0;
4419 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
4420 break scriptdatadoubleescapedlessthanloop;
4421 default:
4422 /*
4423 * Anything else Reconsume the current input
4424 * character in the script data double escaped
4425 * state.
4426 */
4427 reconsume = true;
4428 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4429 continue stateloop;
4430 }
4431 }
4432 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4433 case SCRIPT_DATA_DOUBLE_ESCAPE_END:
4434 scriptdatadoubleescapeendloop: for (;;) {
4435 if (++pos == endPos) {
4436 break stateloop;
4437 }
4438 c = checkChar(buf, pos);
4439 if (index < 6) { // SCRIPT_ARR.length
4440 char folded = c;
4441 if (c >= 'A' && c <= 'Z') {
4442 folded += 0x20;
4443 }
4444 if (folded != Tokenizer.SCRIPT_ARR[index]) {
4445 reconsume = true;
4446 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4447 continue stateloop;
4448 }
4449 index++;
4450 continue;
4451 }
4452 switch (c) {
4453 case '\r':
4454 emitCarriageReturn(buf, pos);
4455 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4456 break stateloop;
4457 case '\n':
4458 silentLineFeed();
4459 case ' ':
4460 case '\t':
4461 case '\u000C':
4462 case '/':
4463 case '>':
4464 /*
4465 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4466 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4467 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
4468 * (>) Emit the current input character as a
4469 * character token. If the temporary buffer is
4470 * the string "script", then switch to the
4471 * script data escaped state.
4472 */
4473 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4474 continue stateloop;
4475 default:
4476 /*
4477 * Reconsume the current input character in the
4478 * script data double escaped state.
4479 */
4480 reconsume = true;
4481 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4482 continue stateloop;
4483 }
4484 }
4485 // XXX reorder point
4486 case MARKUP_DECLARATION_OCTYPE:
4487 markupdeclarationdoctypeloop: for (;;) {
4488 if (++pos == endPos) {
4489 break stateloop;
4490 }
4491 c = checkChar(buf, pos);
4492 if (index < 6) { // OCTYPE.length
4493 char folded = c;
4494 if (c >= 'A' && c <= 'Z') {
4495 folded += 0x20;
4496 }
4497 if (folded == Tokenizer.OCTYPE[index]) {
4498 appendLongStrBuf(c);
4499 } else {
4500 errBogusComment();
4501 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
4502 reconsume = true;
4503 continue stateloop;
4504 }
4505 index++;
4506 continue;
4507 } else {
4508 state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
4509 reconsume = true;
4510 break markupdeclarationdoctypeloop;
4511 // continue stateloop;
4512 }
4513 }
4514 // FALLTHRU DON'T REORDER
4515 case DOCTYPE:
4516 doctypeloop: for (;;) {
4517 if (reconsume) {
4518 reconsume = false;
4519 } else {
4520 if (++pos == endPos) {
4521 break stateloop;
4522 }
4523 c = checkChar(buf, pos);
4524 }
4525 initDoctypeFields();
4526 /*
4527 * Consume the next input character:
4528 */
4529 switch (c) {
4530 case '\r':
4531 silentCarriageReturn();
4532 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4533 break stateloop;
4534 case '\n':
4535 silentLineFeed();
4536 // fall thru
4537 case ' ':
4538 case '\t':
4539 case '\u000C':
4540 /*
4541 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4542 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4543 * Switch to the before DOCTYPE name state.
4544 */
4545 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4546 break doctypeloop;
4547 // continue stateloop;
4548 default:
4549 /*
4550 * Anything else Parse error.
4551 */
4552 errMissingSpaceBeforeDoctypeName();
4553 /*
4554 * Reconsume the current character in the before
4555 * DOCTYPE name state.
4556 */
4557 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4558 reconsume = true;
4559 break doctypeloop;
4560 // continue stateloop;
4561 }
4562 }
4563 // FALLTHRU DON'T REORDER
4564 case BEFORE_DOCTYPE_NAME:
4565 beforedoctypenameloop: for (;;) {
4566 if (reconsume) {
4567 reconsume = false;
4568 } else {
4569 if (++pos == endPos) {
4570 break stateloop;
4571 }
4572 c = checkChar(buf, pos);
4573 }
4574 /*
4575 * Consume the next input character:
4576 */
4577 switch (c) {
4578 case '\r':
4579 silentCarriageReturn();
4580 break stateloop;
4581 case '\n':
4582 silentLineFeed();
4583 // fall thru
4584 case ' ':
4585 case '\t':
4586 case '\u000C':
4587 /*
4588 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4589 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4590 * in the before DOCTYPE name state.
4591 */
4592 continue;
4593 case '>':
4594 /*
4595 * U+003E GREATER-THAN SIGN (>) Parse error.
4596 */
4597 errNamelessDoctype();
4598 /*
4599 * Create a new DOCTYPE token. Set its
4600 * force-quirks flag to on.
4601 */
4602 forceQuirks = true;
4603 /*
4604 * Emit the token.
4605 */
4606 emitDoctypeToken(pos);
4607 /*
4608 * Switch to the data state.
4609 */
4610 state = transition(state, Tokenizer.DATA, reconsume, pos);
4611 continue stateloop;
4612 case '\u0000':
4613 c = '\uFFFD';
4614 // fall thru
4615 default:
4616 if (c >= 'A' && c <= 'Z') {
4617 /*
4618 * U+0041 LATIN CAPITAL LETTER A through to
4619 * U+005A LATIN CAPITAL LETTER Z Create a
4620 * new DOCTYPE token. Set the token's name
4621 * to the lowercase version of the input
4622 * character (add 0x0020 to the character's
4623 * code point).
4624 */
4625 c += 0x20;
4626 }
4627 /* Anything else Create a new DOCTYPE token. */
4628 /*
4629 * Set the token's name to the current
4630 * input character.
4631 */
4632 clearStrBufAndAppend(c);
4633 /*
4634 * Switch to the DOCTYPE name state.
4635 */
4636 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
4637 break beforedoctypenameloop;
4638 // continue stateloop;
4639 }
4640 }
4641 // FALLTHRU DON'T REORDER
4642 case DOCTYPE_NAME:
4643 doctypenameloop: for (;;) {
4644 if (++pos == endPos) {
4645 break stateloop;
4646 }
4647 c = checkChar(buf, pos);
4648 /*
4649 * Consume the next input character:
4650 */
4651 switch (c) {
4652 case '\r':
4653 silentCarriageReturn();
4654 strBufToDoctypeName();
4655 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
4656 break stateloop;
4657 case '\n':
4658 silentLineFeed();
4659 // fall thru
4660 case ' ':
4661 case '\t':
4662 case '\u000C':
4663 /*
4664 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4665 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4666 * Switch to the after DOCTYPE name state.
4667 */
4668 strBufToDoctypeName();
4669 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
4670 break doctypenameloop;
4671 // continue stateloop;
4672 case '>':
4673 /*
4674 * U+003E GREATER-THAN SIGN (>) Emit the current
4675 * DOCTYPE token.
4676 */
4677 strBufToDoctypeName();
4678 emitDoctypeToken(pos);
4679 /*
4680 * Switch to the data state.
4681 */
4682 state = transition(state, Tokenizer.DATA, reconsume, pos);
4683 continue stateloop;
4684 case '\u0000':
4685 c = '\uFFFD';
4686 // fall thru
4687 default:
4688 /*
4689 * U+0041 LATIN CAPITAL LETTER A through to
4690 * U+005A LATIN CAPITAL LETTER Z Append the
4691 * lowercase version of the input character (add
4692 * 0x0020 to the character's code point) to the
4693 * current DOCTYPE token's name.
4694 */
4695 if (c >= 'A' && c <= 'Z') {
4696 c += 0x0020;
4697 }
4698 /*
4699 * Anything else Append the current input
4700 * character to the current DOCTYPE token's
4701 * name.
4702 */
4703 appendStrBuf(c);
4704 /*
4705 * Stay in the DOCTYPE name state.
4706 */
4707 continue;
4708 }
4709 }
4710 // FALLTHRU DON'T REORDER
4711 case AFTER_DOCTYPE_NAME:
4712 afterdoctypenameloop: for (;;) {
4713 if (++pos == endPos) {
4714 break stateloop;
4715 }
4716 c = checkChar(buf, pos);
4717 /*
4718 * Consume the next input character:
4719 */
4720 switch (c) {
4721 case '\r':
4722 silentCarriageReturn();
4723 break stateloop;
4724 case '\n':
4725 silentLineFeed();
4726 // fall thru
4727 case ' ':
4728 case '\t':
4729 case '\u000C':
4730 /*
4731 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4732 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4733 * in the after DOCTYPE name state.
4734 */
4735 continue;
4736 case '>':
4737 /*
4738 * U+003E GREATER-THAN SIGN (>) Emit the current
4739 * DOCTYPE token.
4740 */
4741 emitDoctypeToken(pos);
4742 /*
4743 * Switch to the data state.
4744 */
4745 state = transition(state, Tokenizer.DATA, reconsume, pos);
4746 continue stateloop;
4747 case 'p':
4748 case 'P':
4749 index = 0;
4750 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
4751 break afterdoctypenameloop;
4752 // continue stateloop;
4753 case 's':
4754 case 'S':
4755 index = 0;
4756 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
4757 continue stateloop;
4758 default:
4759 /*
4760 * Otherwise, this is a parse error.
4761 */
4762 bogusDoctype();
4763
4764 /*
4765 * Set the DOCTYPE token's force-quirks flag to
4766 * on.
4767 */
4768 // done by bogusDoctype();
4769 /*
4770 * Switch to the bogus DOCTYPE state.
4771 */
4772 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4773 continue stateloop;
4774 }
4775 }
4776 // FALLTHRU DON'T REORDER
4777 case DOCTYPE_UBLIC:
4778 doctypeublicloop: for (;;) {
4779 if (++pos == endPos) {
4780 break stateloop;
4781 }
4782 c = checkChar(buf, pos);
4783 /*
4784 * If the six characters starting from the current input
4785 * character are an ASCII case-insensitive match for the
4786 * word "PUBLIC", then consume those characters and
4787 * switch to the before DOCTYPE public identifier state.
4788 */
4789 if (index < 5) { // UBLIC.length
4790 char folded = c;
4791 if (c >= 'A' && c <= 'Z') {
4792 folded += 0x20;
4793 }
4794 if (folded != Tokenizer.UBLIC[index]) {
4795 bogusDoctype();
4796 // forceQuirks = true;
4797 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4798 reconsume = true;
4799 continue stateloop;
4800 }
4801 index++;
4802 continue;
4803 } else {
4804 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
4805 reconsume = true;
4806 break doctypeublicloop;
4807 // continue stateloop;
4808 }
4809 }
4810 // FALLTHRU DON'T REORDER
4811 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
4812 afterdoctypepublickeywordloop: for (;;) {
4813 if (reconsume) {
4814 reconsume = false;
4815 } else {
4816 if (++pos == endPos) {
4817 break stateloop;
4818 }
4819 c = checkChar(buf, pos);
4820 }
4821 /*
4822 * Consume the next input character:
4823 */
4824 switch (c) {
4825 case '\r':
4826 silentCarriageReturn();
4827 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
4828 break stateloop;
4829 case '\n':
4830 silentLineFeed();
4831 // fall thru
4832 case ' ':
4833 case '\t':
4834 case '\u000C':
4835 /*
4836 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4837 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4838 * Switch to the before DOCTYPE public
4839 * identifier state.
4840 */
4841 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
4842 break afterdoctypepublickeywordloop;
4843 // FALL THROUGH continue stateloop
4844 case '"':
4845 /*
4846 * U+0022 QUOTATION MARK (") Parse Error.
4847 */
4848 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
4849 /*
4850 * Set the DOCTYPE token's public identifier to
4851 * the empty string (not missing),
4852 */
4853 clearLongStrBuf();
4854 /*
4855 * then switch to the DOCTYPE public identifier
4856 * (double-quoted) state.
4857 */
4858 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
4859 continue stateloop;
4860 case '\'':
4861 /*
4862 * U+0027 APOSTROPHE (') Parse Error.
4863 */
4864 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
4865 /*
4866 * Set the DOCTYPE token's public identifier to
4867 * the empty string (not missing),
4868 */
4869 clearLongStrBuf();
4870 /*
4871 * then switch to the DOCTYPE public identifier
4872 * (single-quoted) state.
4873 */
4874 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
4875 continue stateloop;
4876 case '>':
4877 /* U+003E GREATER-THAN SIGN (>) Parse error. */
4878 errExpectedPublicId();
4879 /*
4880 * Set the DOCTYPE token's force-quirks flag to
4881 * on.
4882 */
4883 forceQuirks = true;
4884 /*
4885 * Emit that DOCTYPE token.
4886 */
4887 emitDoctypeToken(pos);
4888 /*
4889 * Switch to the data state.
4890 */
4891 state = transition(state, Tokenizer.DATA, reconsume, pos);
4892 continue stateloop;
4893 default:
4894 bogusDoctype();
4895 /*
4896 * Set the DOCTYPE token's force-quirks flag to
4897 * on.
4898 */
4899 // done by bogusDoctype();
4900 /*
4901 * Switch to the bogus DOCTYPE state.
4902 */
4903 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4904 continue stateloop;
4905 }
4906 }
4907 // FALLTHRU DON'T REORDER
4908 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
4909 beforedoctypepublicidentifierloop: for (;;) {
4910 if (++pos == endPos) {
4911 break stateloop;
4912 }
4913 c = checkChar(buf, pos);
4914 /*
4915 * Consume the next input character:
4916 */
4917 switch (c) {
4918 case '\r':
4919 silentCarriageReturn();
4920 break stateloop;
4921 case '\n':
4922 silentLineFeed();
4923 // fall thru
4924 case ' ':
4925 case '\t':
4926 case '\u000C':
4927 /*
4928 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4929 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4930 * in the before DOCTYPE public identifier
4931 * state.
4932 */
4933 continue;
4934 case '"':
4935 /*
4936 * U+0022 QUOTATION MARK (") Set the DOCTYPE
4937 * token's public identifier to the empty string
4938 * (not missing),
4939 */
4940 clearLongStrBuf();
4941 /*
4942 * then switch to the DOCTYPE public identifier
4943 * (double-quoted) state.
4944 */
4945 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
4946 break beforedoctypepublicidentifierloop;
4947 // continue stateloop;
4948 case '\'':
4949 /*
4950 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
4951 * public identifier to the empty string (not
4952 * missing),
4953 */
4954 clearLongStrBuf();
4955 /*
4956 * then switch to the DOCTYPE public identifier
4957 * (single-quoted) state.
4958 */
4959 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
4960 continue stateloop;
4961 case '>':
4962 /* U+003E GREATER-THAN SIGN (>) Parse error. */
4963 errExpectedPublicId();
4964 /*
4965 * Set the DOCTYPE token's force-quirks flag to
4966 * on.
4967 */
4968 forceQuirks = true;
4969 /*
4970 * Emit that DOCTYPE token.
4971 */
4972 emitDoctypeToken(pos);
4973 /*
4974 * Switch to the data state.
4975 */
4976 state = transition(state, Tokenizer.DATA, reconsume, pos);
4977 continue stateloop;
4978 default:
4979 bogusDoctype();
4980 /*
4981 * Set the DOCTYPE token's force-quirks flag to
4982 * on.
4983 */
4984 // done by bogusDoctype();
4985 /*
4986 * Switch to the bogus DOCTYPE state.
4987 */
4988 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4989 continue stateloop;
4990 }
4991 }
4992 // FALLTHRU DON'T REORDER
4993 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
4994 doctypepublicidentifierdoublequotedloop: for (;;) {
4995 if (++pos == endPos) {
4996 break stateloop;
4997 }
4998 c = checkChar(buf, pos);
4999 /*
5000 * Consume the next input character:
5001 */
5002 switch (c) {
5003 case '"':
5004 /*
5005 * U+0022 QUOTATION MARK (") Switch to the after
5006 * DOCTYPE public identifier state.
5007 */
5008 publicIdentifier = longStrBufToString();
5009 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5010 break doctypepublicidentifierdoublequotedloop;
5011 // continue stateloop;
5012 case '>':
5013 /*
5014 * U+003E GREATER-THAN SIGN (>) Parse error.
5015 */
5016 errGtInPublicId();
5017 /*
5018 * Set the DOCTYPE token's force-quirks flag to
5019 * on.
5020 */
5021 forceQuirks = true;
5022 /*
5023 * Emit that DOCTYPE token.
5024 */
5025 publicIdentifier = longStrBufToString();
5026 emitDoctypeToken(pos);
5027 /*
5028 * Switch to the data state.
5029 */
5030 state = transition(state, Tokenizer.DATA, reconsume, pos);
5031 continue stateloop;
5032 case '\r':
5033 appendLongStrBufCarriageReturn();
5034 break stateloop;
5035 case '\n':
5036 appendLongStrBufLineFeed();
5037 continue;
5038 case '\u0000':
5039 c = '\uFFFD';
5040 // fall thru
5041 default:
5042 /*
5043 * Anything else Append the current input
5044 * character to the current DOCTYPE token's
5045 * public identifier.
5046 */
5047 appendLongStrBuf(c);
5048 /*
5049 * Stay in the DOCTYPE public identifier
5050 * (double-quoted) state.
5051 */
5052 continue;
5053 }
5054 }
5055 // FALLTHRU DON'T REORDER
5056 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
5057 afterdoctypepublicidentifierloop: for (;;) {
5058 if (++pos == endPos) {
5059 break stateloop;
5060 }
5061 c = checkChar(buf, pos);
5062 /*
5063 * Consume the next input character:
5064 */
5065 switch (c) {
5066 case '\r':
5067 silentCarriageReturn();
5068 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
5069 break stateloop;
5070 case '\n':
5071 silentLineFeed();
5072 // fall thru
5073 case ' ':
5074 case '\t':
5075 case '\u000C':
5076 /*
5077 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5078 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
5079 * Switch to the between DOCTYPE public and
5080 * system identifiers state.
5081 */
5082 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
5083 break afterdoctypepublicidentifierloop;
5084 // continue stateloop;
5085 case '>':
5086 /*
5087 * U+003E GREATER-THAN SIGN (>) Emit the current
5088 * DOCTYPE token.
5089 */
5090 emitDoctypeToken(pos);
5091 /*
5092 * Switch to the data state.
5093 */
5094 state = transition(state, Tokenizer.DATA, reconsume, pos);
5095 continue stateloop;
5096 case '"':
5097 /*
5098 * U+0022 QUOTATION MARK (") Parse error.
5099 */
5100 errNoSpaceBetweenPublicAndSystemIds();
5101 /*
5102 * Set the DOCTYPE token's system identifier to
5103 * the empty string (not missing),
5104 */
5105 clearLongStrBuf();
5106 /*
5107 * then switch to the DOCTYPE system identifier
5108 * (double-quoted) state.
5109 */
5110 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5111 continue stateloop;
5112 case '\'':
5113 /*
5114 * U+0027 APOSTROPHE (') Parse error.
5115 */
5116 errNoSpaceBetweenPublicAndSystemIds();
5117 /*
5118 * Set the DOCTYPE token's system identifier to
5119 * the empty string (not missing),
5120 */
5121 clearLongStrBuf();
5122 /*
5123 * then switch to the DOCTYPE system identifier
5124 * (single-quoted) state.
5125 */
5126 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5127 continue stateloop;
5128 default:
5129 bogusDoctype();
5130 /*
5131 * Set the DOCTYPE token's force-quirks flag to
5132 * on.
5133 */
5134 // done by bogusDoctype();
5135 /*
5136 * Switch to the bogus DOCTYPE state.
5137 */
5138 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5139 continue stateloop;
5140 }
5141 }
5142 // FALLTHRU DON'T REORDER
5143 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
5144 betweendoctypepublicandsystemidentifiersloop: for (;;) {
5145 if (++pos == endPos) {
5146 break stateloop;
5147 }
5148 c = checkChar(buf, pos);
5149 /*
5150 * Consume the next input character:
5151 */
5152 switch (c) {
5153 case '\r':
5154 silentCarriageReturn();
5155 break stateloop;
5156 case '\n':
5157 silentLineFeed();
5158 // fall thru
5159 case ' ':
5160 case '\t':
5161 case '\u000C':
5162 /*
5163 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5164 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5165 * in the between DOCTYPE public and system
5166 * identifiers state.
5167 */
5168 continue;
5169 case '>':
5170 /*
5171 * U+003E GREATER-THAN SIGN (>) Emit the current
5172 * DOCTYPE token.
5173 */
5174 emitDoctypeToken(pos);
5175 /*
5176 * Switch to the data state.
5177 */
5178 state = transition(state, Tokenizer.DATA, reconsume, pos);
5179 continue stateloop;
5180 case '"':
5181 /*
5182 * U+0022 QUOTATION MARK (") Set the DOCTYPE
5183 * token's system identifier to the empty string
5184 * (not missing),
5185 */
5186 clearLongStrBuf();
5187 /*
5188 * then switch to the DOCTYPE system identifier
5189 * (double-quoted) state.
5190 */
5191 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5192 break betweendoctypepublicandsystemidentifiersloop;
5193 // continue stateloop;
5194 case '\'':
5195 /*
5196 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5197 * system identifier to the empty string (not
5198 * missing),
5199 */
5200 clearLongStrBuf();
5201 /*
5202 * then switch to the DOCTYPE system identifier
5203 * (single-quoted) state.
5204 */
5205 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5206 continue stateloop;
5207 default:
5208 bogusDoctype();
5209 /*
5210 * Set the DOCTYPE token's force-quirks flag to
5211 * on.
5212 */
5213 // done by bogusDoctype();
5214 /*
5215 * Switch to the bogus DOCTYPE state.
5216 */
5217 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5218 continue stateloop;
5219 }
5220 }
5221 // FALLTHRU DON'T REORDER
5222 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
5223 doctypesystemidentifierdoublequotedloop: for (;;) {
5224 if (++pos == endPos) {
5225 break stateloop;
5226 }
5227 c = checkChar(buf, pos);
5228 /*
5229 * Consume the next input character:
5230 */
5231 switch (c) {
5232 case '"':
5233 /*
5234 * U+0022 QUOTATION MARK (") Switch to the after
5235 * DOCTYPE system identifier state.
5236 */
5237 systemIdentifier = longStrBufToString();
5238 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5239 continue stateloop;
5240 case '>':
5241 /*
5242 * U+003E GREATER-THAN SIGN (>) Parse error.
5243 */
5244 errGtInSystemId();
5245 /*
5246 * Set the DOCTYPE token's force-quirks flag to
5247 * on.
5248 */
5249 forceQuirks = true;
5250 /*
5251 * Emit that DOCTYPE token.
5252 */
5253 systemIdentifier = longStrBufToString();
5254 emitDoctypeToken(pos);
5255 /*
5256 * Switch to the data state.
5257 */
5258 state = transition(state, Tokenizer.DATA, reconsume, pos);
5259 continue stateloop;
5260 case '\r':
5261 appendLongStrBufCarriageReturn();
5262 break stateloop;
5263 case '\n':
5264 appendLongStrBufLineFeed();
5265 continue;
5266 case '\u0000':
5267 c = '\uFFFD';
5268 // fall thru
5269 default:
5270 /*
5271 * Anything else Append the current input
5272 * character to the current DOCTYPE token's
5273 * system identifier.
5274 */
5275 appendLongStrBuf(c);
5276 /*
5277 * Stay in the DOCTYPE system identifier
5278 * (double-quoted) state.
5279 */
5280 continue;
5281 }
5282 }
5283 // FALLTHRU DON'T REORDER
5284 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
5285 afterdoctypesystemidentifierloop: for (;;) {
5286 if (++pos == endPos) {
5287 break stateloop;
5288 }
5289 c = checkChar(buf, pos);
5290 /*
5291 * Consume the next input character:
5292 */
5293 switch (c) {
5294 case '\r':
5295 silentCarriageReturn();
5296 break stateloop;
5297 case '\n':
5298 silentLineFeed();
5299 // fall thru
5300 case ' ':
5301 case '\t':
5302 case '\u000C':
5303 /*
5304 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5305 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5306 * in the after DOCTYPE system identifier state.
5307 */
5308 continue;
5309 case '>':
5310 /*
5311 * U+003E GREATER-THAN SIGN (>) Emit the current
5312 * DOCTYPE token.
5313 */
5314 emitDoctypeToken(pos);
5315 /*
5316 * Switch to the data state.
5317 */
5318 state = transition(state, Tokenizer.DATA, reconsume, pos);
5319 continue stateloop;
5320 default:
5321 /*
5322 * Switch to the bogus DOCTYPE state. (This does
5323 * not set the DOCTYPE token's force-quirks flag
5324 * to on.)
5325 */
5326 bogusDoctypeWithoutQuirks();
5327 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5328 break afterdoctypesystemidentifierloop;
5329 // continue stateloop;
5330 }
5331 }
5332 // FALLTHRU DON'T REORDER
5333 case BOGUS_DOCTYPE:
5334 for (;;) {
5335 if (reconsume) {
5336 reconsume = false;
5337 } else {
5338 if (++pos == endPos) {
5339 break stateloop;
5340 }
5341 c = checkChar(buf, pos);
5342 }
5343 /*
5344 * Consume the next input character:
5345 */
5346 switch (c) {
5347 case '>':
5348 /*
5349 * U+003E GREATER-THAN SIGN (>) Emit that
5350 * DOCTYPE token.
5351 */
5352 emitDoctypeToken(pos);
5353 /*
5354 * Switch to the data state.
5355 */
5356 state = transition(state, Tokenizer.DATA, reconsume, pos);
5357 continue stateloop;
5358 case '\r':
5359 silentCarriageReturn();
5360 break stateloop;
5361 case '\n':
5362 silentLineFeed();
5363 // fall thru
5364 default:
5365 /*
5366 * Anything else Stay in the bogus DOCTYPE
5367 * state.
5368 */
5369 continue;
5370 }
5371 }
5372 // XXX reorder point
5373 case DOCTYPE_YSTEM:
5374 doctypeystemloop: for (;;) {
5375 if (++pos == endPos) {
5376 break stateloop;
5377 }
5378 c = checkChar(buf, pos);
5379 /*
5380 * Otherwise, if the six characters starting from the
5381 * current input character are an ASCII case-insensitive
5382 * match for the word "SYSTEM", then consume those
5383 * characters and switch to the before DOCTYPE system
5384 * identifier state.
5385 */
5386 if (index < 5) { // YSTEM.length
5387 char folded = c;
5388 if (c >= 'A' && c <= 'Z') {
5389 folded += 0x20;
5390 }
5391 if (folded != Tokenizer.YSTEM[index]) {
5392 bogusDoctype();
5393 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5394 reconsume = true;
5395 continue stateloop;
5396 }
5397 index++;
5398 continue stateloop;
5399 } else {
5400 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
5401 reconsume = true;
5402 break doctypeystemloop;
5403 // continue stateloop;
5404 }
5405 }
5406 // FALLTHRU DON'T REORDER
5407 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
5408 afterdoctypesystemkeywordloop: for (;;) {
5409 if (reconsume) {
5410 reconsume = false;
5411 } else {
5412 if (++pos == endPos) {
5413 break stateloop;
5414 }
5415 c = checkChar(buf, pos);
5416 }
5417 /*
5418 * Consume the next input character:
5419 */
5420 switch (c) {
5421 case '\r':
5422 silentCarriageReturn();
5423 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5424 break stateloop;
5425 case '\n':
5426 silentLineFeed();
5427 // fall thru
5428 case ' ':
5429 case '\t':
5430 case '\u000C':
5431 /*
5432 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5433 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
5434 * Switch to the before DOCTYPE system
5435 * identifier state.
5436 */
5437 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5438 break afterdoctypesystemkeywordloop;
5439 // FALL THROUGH continue stateloop
5440 case '"':
5441 /*
5442 * U+0022 QUOTATION MARK (") Parse Error.
5443 */
5444 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
5445 /*
5446 * Set the DOCTYPE token's system identifier to
5447 * the empty string (not missing),
5448 */
5449 clearLongStrBuf();
5450 /*
5451 * then switch to the DOCTYPE system identifier
5452 * (double-quoted) state.
5453 */
5454 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5455 continue stateloop;
5456 case '\'':
5457 /*
5458 * U+0027 APOSTROPHE (') Parse Error.
5459 */
5460 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
5461 /*
5462 * Set the DOCTYPE token's system identifier to
5463 * the empty string (not missing),
5464 */
5465 clearLongStrBuf();
5466 /*
5467 * then switch to the DOCTYPE system identifier
5468 * (single-quoted) state.
5469 */
5470 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5471 continue stateloop;
5472 case '>':
5473 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5474 errExpectedPublicId();
5475 /*
5476 * Set the DOCTYPE token's force-quirks flag to
5477 * on.
5478 */
5479 forceQuirks = true;
5480 /*
5481 * Emit that DOCTYPE token.
5482 */
5483 emitDoctypeToken(pos);
5484 /*
5485 * Switch to the data state.
5486 */
5487 state = transition(state, Tokenizer.DATA, reconsume, pos);
5488 continue stateloop;
5489 default:
5490 bogusDoctype();
5491 /*
5492 * Set the DOCTYPE token's force-quirks flag to
5493 * on.
5494 */
5495 // done by bogusDoctype();
5496 /*
5497 * Switch to the bogus DOCTYPE state.
5498 */
5499 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5500 continue stateloop;
5501 }
5502 }
5503 // FALLTHRU DON'T REORDER
5504 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
5505 beforedoctypesystemidentifierloop: for (;;) {
5506 if (++pos == endPos) {
5507 break stateloop;
5508 }
5509 c = checkChar(buf, pos);
5510 /*
5511 * Consume the next input character:
5512 */
5513 switch (c) {
5514 case '\r':
5515 silentCarriageReturn();
5516 break stateloop;
5517 case '\n':
5518 silentLineFeed();
5519 // fall thru
5520 case ' ':
5521 case '\t':
5522 case '\u000C':
5523 /*
5524 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5525 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5526 * in the before DOCTYPE system identifier
5527 * state.
5528 */
5529 continue;
5530 case '"':
5531 /*
5532 * U+0022 QUOTATION MARK (") Set the DOCTYPE
5533 * token's system identifier to the empty string
5534 * (not missing),
5535 */
5536 clearLongStrBuf();
5537 /*
5538 * then switch to the DOCTYPE system identifier
5539 * (double-quoted) state.
5540 */
5541 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5542 continue stateloop;
5543 case '\'':
5544 /*
5545 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5546 * system identifier to the empty string (not
5547 * missing),
5548 */
5549 clearLongStrBuf();
5550 /*
5551 * then switch to the DOCTYPE system identifier
5552 * (single-quoted) state.
5553 */
5554 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5555 break beforedoctypesystemidentifierloop;
5556 // continue stateloop;
5557 case '>':
5558 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5559 errExpectedSystemId();
5560 /*
5561 * Set the DOCTYPE token's force-quirks flag to
5562 * on.
5563 */
5564 forceQuirks = true;
5565 /*
5566 * Emit that DOCTYPE token.
5567 */
5568 emitDoctypeToken(pos);
5569 /*
5570 * Switch to the data state.
5571 */
5572 state = transition(state, Tokenizer.DATA, reconsume, pos);
5573 continue stateloop;
5574 default:
5575 bogusDoctype();
5576 /*
5577 * Set the DOCTYPE token's force-quirks flag to
5578 * on.
5579 */
5580 // done by bogusDoctype();
5581 /*
5582 * Switch to the bogus DOCTYPE state.
5583 */
5584 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5585 continue stateloop;
5586 }
5587 }
5588 // FALLTHRU DON'T REORDER
5589 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
5590 for (;;) {
5591 if (++pos == endPos) {
5592 break stateloop;
5593 }
5594 c = checkChar(buf, pos);
5595 /*
5596 * Consume the next input character:
5597 */
5598 switch (c) {
5599 case '\'':
5600 /*
5601 * U+0027 APOSTROPHE (') Switch to the after
5602 * DOCTYPE system identifier state.
5603 */
5604 systemIdentifier = longStrBufToString();
5605 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5606 continue stateloop;
5607 case '>':
5608 errGtInSystemId();
5609 /*
5610 * Set the DOCTYPE token's force-quirks flag to
5611 * on.
5612 */
5613 forceQuirks = true;
5614 /*
5615 * Emit that DOCTYPE token.
5616 */
5617 systemIdentifier = longStrBufToString();
5618 emitDoctypeToken(pos);
5619 /*
5620 * Switch to the data state.
5621 */
5622 state = transition(state, Tokenizer.DATA, reconsume, pos);
5623 continue stateloop;
5624 case '\r':
5625 appendLongStrBufCarriageReturn();
5626 break stateloop;
5627 case '\n':
5628 appendLongStrBufLineFeed();
5629 continue;
5630 case '\u0000':
5631 c = '\uFFFD';
5632 // fall thru
5633 default:
5634 /*
5635 * Anything else Append the current input
5636 * character to the current DOCTYPE token's
5637 * system identifier.
5638 */
5639 appendLongStrBuf(c);
5640 /*
5641 * Stay in the DOCTYPE system identifier
5642 * (single-quoted) state.
5643 */
5644 continue;
5645 }
5646 }
5647 // XXX reorder point
5648 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
5649 for (;;) {
5650 if (++pos == endPos) {
5651 break stateloop;
5652 }
5653 c = checkChar(buf, pos);
5654 /*
5655 * Consume the next input character:
5656 */
5657 switch (c) {
5658 case '\'':
5659 /*
5660 * U+0027 APOSTROPHE (') Switch to the after
5661 * DOCTYPE public identifier state.
5662 */
5663 publicIdentifier = longStrBufToString();
5664 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5665 continue stateloop;
5666 case '>':
5667 errGtInPublicId();
5668 /*
5669 * Set the DOCTYPE token's force-quirks flag to
5670 * on.
5671 */
5672 forceQuirks = true;
5673 /*
5674 * Emit that DOCTYPE token.
5675 */
5676 publicIdentifier = longStrBufToString();
5677 emitDoctypeToken(pos);
5678 /*
5679 * Switch to the data state.
5680 */
5681 state = transition(state, Tokenizer.DATA, reconsume, pos);
5682 continue stateloop;
5683 case '\r':
5684 appendLongStrBufCarriageReturn();
5685 break stateloop;
5686 case '\n':
5687 appendLongStrBufLineFeed();
5688 continue;
5689 case '\u0000':
5690 c = '\uFFFD';
5691 // fall thru
5692 default:
5693 /*
5694 * Anything else Append the current input
5695 * character to the current DOCTYPE token's
5696 * public identifier.
5697 */
5698 appendLongStrBuf(c);
5699 /*
5700 * Stay in the DOCTYPE public identifier
5701 * (single-quoted) state.
5702 */
5703 continue;
5704 }
5705 }
5706 // END HOTSPOT WORKAROUND
5707 }
5708 }
5709 flushChars(buf, pos);
5710 /*
5711 * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
5712 */
5713 // Save locals
5714 stateSave = state;
5715 returnStateSave = returnState;
5716 return pos;
5717 }
5718
5719 // HOTSPOT WORKAROUND INSERTION POINT
5720
5721 // [NOCPP[
5722
5723 protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
5724 return to;
5725 }
5726
5727 // ]NOCPP]
5728
5729 private void initDoctypeFields() {
5730 doctypeName = "";
5731 if (systemIdentifier != null) {
5732 Portability.releaseString(systemIdentifier);
5733 systemIdentifier = null;
5734 }
5735 if (publicIdentifier != null) {
5736 Portability.releaseString(publicIdentifier);
5737 publicIdentifier = null;
5738 }
5739 forceQuirks = false;
5740 }
5741
5742 @Inline private void adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn()
5743 throws SAXException {
5744 silentCarriageReturn();
5745 adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
5746 }
5747
5748 @Inline private void adjustDoubleHyphenAndAppendToLongStrBufLineFeed()
5749 throws SAXException {
5750 silentLineFeed();
5751 adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
5752 }
5753
5754 @Inline private void appendLongStrBufLineFeed() {
5755 silentLineFeed();
5756 appendLongStrBuf('\n');
5757 }
5758
5759 @Inline private void appendLongStrBufCarriageReturn() {
5760 silentCarriageReturn();
5761 appendLongStrBuf('\n');
5762 }
5763
5764 @Inline protected void silentCarriageReturn() {
5765 ++line;
5766 lastCR = true;
5767 }
5768
5769 @Inline protected void silentLineFeed() {
5770 ++line;
5771 }
5772
5773 private void emitCarriageReturn(@NoLength char[] buf, int pos)
5774 throws SAXException {
5775 silentCarriageReturn();
5776 flushChars(buf, pos);
5777 tokenHandler.characters(Tokenizer.LF, 0, 1);
5778 cstart = Integer.MAX_VALUE;
5779 }
5780
5781 private void emitReplacementCharacter(@NoLength char[] buf, int pos)
5782 throws SAXException {
5783 flushChars(buf, pos);
5784 tokenHandler.zeroOriginatingReplacementCharacter();
5785 cstart = pos + 1;
5786 }
5787
5788 private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
5789 throws SAXException {
5790 flushChars(buf, pos);
5791 tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
5792 cstart = pos + 1;
5793 }
5794
5795 private void setAdditionalAndRememberAmpersandLocation(char add) {
5796 additional = add;
5797 // [NOCPP[
5798 ampersandLocation = new LocatorImpl(this);
5799 // ]NOCPP]
5800 }
5801
5802 private void bogusDoctype() throws SAXException {
5803 errBogusDoctype();
5804 forceQuirks = true;
5805 }
5806
5807 private void bogusDoctypeWithoutQuirks() throws SAXException {
5808 errBogusDoctype();
5809 forceQuirks = false;
5810 }
5811
5812 private void emitOrAppendStrBuf(int returnState) throws SAXException {
5813 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
5814 appendStrBufToLongStrBuf();
5815 } else {
5816 emitStrBuf();
5817 }
5818 }
5819
5820 private void handleNcrValue(int returnState) throws SAXException {
5821 /*
5822 * If one or more characters match the range, then take them all and
5823 * interpret the string of characters as a number (either hexadecimal or
5824 * decimal as appropriate).
5825 */
5826 if (value <= 0xFFFF) {
5827 if (value >= 0x80 && value <= 0x9f) {
5828 /*
5829 * If that number is one of the numbers in the first column of
5830 * the following table, then this is a parse error.
5831 */
5832 errNcrInC1Range();
5833 /*
5834 * Find the row with that number in the first column, and return
5835 * a character token for the Unicode character given in the
5836 * second column of that row.
5837 */
5838 @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
5839 emitOrAppendOne(val, returnState);
5840 // [NOCPP[
5841 } else if (value == 0xC
5842 && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
5843 if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
5844 emitOrAppendOne(Tokenizer.SPACE, returnState);
5845 } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
5846 fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
5847 }
5848 // ]NOCPP]
5849 } else if (value == 0x0) {
5850 errNcrZero();
5851 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
5852 } else if ((value & 0xF800) == 0xD800) {
5853 errNcrSurrogate();
5854 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
5855 } else {
5856 /*
5857 * Otherwise, return a character token for the Unicode character
5858 * whose code point is that number.
5859 */
5860 char ch = (char) value;
5861 // [NOCPP[
5862 if (value == 0x0D) {
5863 errNcrCr();
5864 } else if ((value <= 0x0008) || (value == 0x000B)
5865 || (value >= 0x000E && value <= 0x001F)) {
5866 ch = errNcrControlChar(ch);
5867 } else if (value >= 0xFDD0 && value <= 0xFDEF) {
5868 errNcrUnassigned();
5869 } else if ((value & 0xFFFE) == 0xFFFE) {
5870 ch = errNcrNonCharacter(ch);
5871 } else if (value >= 0x007F && value <= 0x009F) {
5872 errNcrControlChar();
5873 } else {
5874 maybeWarnPrivateUse(ch);
5875 }
5876 // ]NOCPP]
5877 bmpChar[0] = ch;
5878 emitOrAppendOne(bmpChar, returnState);
5879 }
5880 } else if (value <= 0x10FFFF) {
5881 // [NOCPP[
5882 maybeWarnPrivateUseAstral();
5883 if ((value & 0xFFFE) == 0xFFFE) {
5884 errAstralNonCharacter(value);
5885 }
5886 // ]NOCPP]
5887 astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
5888 astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
5889 emitOrAppendTwo(astralChar, returnState);
5890 } else {
5891 errNcrOutOfRange();
5892 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
5893 }
5894 }
5895
5896 public void eof() throws SAXException {
5897 int state = stateSave;
5898 int returnState = returnStateSave;
5899
5900 eofloop: for (;;) {
5901 switch (state) {
5902 case SCRIPT_DATA_LESS_THAN_SIGN:
5903 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
5904 /*
5905 * Otherwise, emit a U+003C LESS-THAN SIGN character token
5906 */
5907 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
5908 /*
5909 * and reconsume the current input character in the data
5910 * state.
5911 */
5912 break eofloop;
5913 case TAG_OPEN:
5914 /*
5915 * The behavior of this state depends on the content model
5916 * flag.
5917 */
5918 /*
5919 * Anything else Parse error.
5920 */
5921 errEofAfterLt();
5922 /*
5923 * Emit a U+003C LESS-THAN SIGN character token
5924 */
5925 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
5926 /*
5927 * and reconsume the current input character in the data
5928 * state.
5929 */
5930 break eofloop;
5931 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
5932 /*
5933 * Emit a U+003C LESS-THAN SIGN character token
5934 */
5935 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
5936 /*
5937 * and reconsume the current input character in the RCDATA
5938 * state.
5939 */
5940 break eofloop;
5941 case NON_DATA_END_TAG_NAME:
5942 /*
5943 * Emit a U+003C LESS-THAN SIGN character token, a U+002F
5944 * SOLIDUS character token,
5945 */
5946 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
5947 /*
5948 * a character token for each of the characters in the
5949 * temporary buffer (in the order they were added to the
5950 * buffer),
5951 */
5952 emitStrBuf();
5953 /*
5954 * and reconsume the current input character in the RCDATA
5955 * state.
5956 */
5957 break eofloop;
5958 case CLOSE_TAG_OPEN:
5959 /* EOF Parse error. */
5960 errEofAfterLt();
5961 /*
5962 * Emit a U+003C LESS-THAN SIGN character token and a U+002F
5963 * SOLIDUS character token.
5964 */
5965 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
5966 /*
5967 * Reconsume the EOF character in the data state.
5968 */
5969 break eofloop;
5970 case TAG_NAME:
5971 /*
5972 * EOF Parse error.
5973 */
5974 errEofInTagName();
5975 /*
5976 * Reconsume the EOF character in the data state.
5977 */
5978 break eofloop;
5979 case BEFORE_ATTRIBUTE_NAME:
5980 case AFTER_ATTRIBUTE_VALUE_QUOTED:
5981 case SELF_CLOSING_START_TAG:
5982 /* EOF Parse error. */
5983 errEofWithoutGt();
5984 /*
5985 * Reconsume the EOF character in the data state.
5986 */
5987 break eofloop;
5988 case ATTRIBUTE_NAME:
5989 /*
5990 * EOF Parse error.
5991 */
5992 errEofInAttributeName();
5993 /*
5994 * Reconsume the EOF character in the data state.
5995 */
5996 break eofloop;
5997 case AFTER_ATTRIBUTE_NAME:
5998 case BEFORE_ATTRIBUTE_VALUE:
5999 /* EOF Parse error. */
6000 errEofWithoutGt();
6001 /*
6002 * Reconsume the EOF character in the data state.
6003 */
6004 break eofloop;
6005 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
6006 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
6007 case ATTRIBUTE_VALUE_UNQUOTED:
6008 /* EOF Parse error. */
6009 errEofInAttributeValue();
6010 /*
6011 * Reconsume the EOF character in the data state.
6012 */
6013 break eofloop;
6014 case BOGUS_COMMENT:
6015 emitComment(0, 0);
6016 break eofloop;
6017 case BOGUS_COMMENT_HYPHEN:
6018 // [NOCPP[
6019 maybeAppendSpaceToBogusComment();
6020 // ]NOCPP]
6021 emitComment(0, 0);
6022 break eofloop;
6023 case MARKUP_DECLARATION_OPEN:
6024 errBogusComment();
6025 clearLongStrBuf();
6026 emitComment(0, 0);
6027 break eofloop;
6028 case MARKUP_DECLARATION_HYPHEN:
6029 errBogusComment();
6030 emitComment(0, 0);
6031 break eofloop;
6032 case MARKUP_DECLARATION_OCTYPE:
6033 if (index < 6) {
6034 errBogusComment();
6035 emitComment(0, 0);
6036 } else {
6037 /* EOF Parse error. */
6038 errEofInDoctype();
6039 /*
6040 * Create a new DOCTYPE token. Set its force-quirks flag
6041 * to on.
6042 */
6043 doctypeName = "";
6044 if (systemIdentifier != null) {
6045 Portability.releaseString(systemIdentifier);
6046 systemIdentifier = null;
6047 }
6048 if (publicIdentifier != null) {
6049 Portability.releaseString(publicIdentifier);
6050 publicIdentifier = null;
6051 }
6052 forceQuirks = true;
6053 /*
6054 * Emit the token.
6055 */
6056 emitDoctypeToken(0);
6057 /*
6058 * Reconsume the EOF character in the data state.
6059 */
6060 break eofloop;
6061 }
6062 break eofloop;
6063 case COMMENT_START:
6064 case COMMENT:
6065 /*
6066 * EOF Parse error.
6067 */
6068 errEofInComment();
6069 /* Emit the comment token. */
6070 emitComment(0, 0);
6071 /*
6072 * Reconsume the EOF character in the data state.
6073 */
6074 break eofloop;
6075 case COMMENT_END:
6076 errEofInComment();
6077 /* Emit the comment token. */
6078 emitComment(2, 0);
6079 /*
6080 * Reconsume the EOF character in the data state.
6081 */
6082 break eofloop;
6083 case COMMENT_END_DASH:
6084 case COMMENT_START_DASH:
6085 errEofInComment();
6086 /* Emit the comment token. */
6087 emitComment(1, 0);
6088 /*
6089 * Reconsume the EOF character in the data state.
6090 */
6091 break eofloop;
6092 case COMMENT_END_BANG:
6093 errEofInComment();
6094 /* Emit the comment token. */
6095 emitComment(3, 0);
6096 /*
6097 * Reconsume the EOF character in the data state.
6098 */
6099 break eofloop;
6100 case DOCTYPE:
6101 case BEFORE_DOCTYPE_NAME:
6102 errEofInDoctype();
6103 /*
6104 * Create a new DOCTYPE token. Set its force-quirks flag to
6105 * on.
6106 */
6107 forceQuirks = true;
6108 /*
6109 * Emit the token.
6110 */
6111 emitDoctypeToken(0);
6112 /*
6113 * Reconsume the EOF character in the data state.
6114 */
6115 break eofloop;
6116 case DOCTYPE_NAME:
6117 errEofInDoctype();
6118 strBufToDoctypeName();
6119 /*
6120 * Set the DOCTYPE token's force-quirks flag to on.
6121 */
6122 forceQuirks = true;
6123 /*
6124 * Emit that DOCTYPE token.
6125 */
6126 emitDoctypeToken(0);
6127 /*
6128 * Reconsume the EOF character in the data state.
6129 */
6130 break eofloop;
6131 case DOCTYPE_UBLIC:
6132 case DOCTYPE_YSTEM:
6133 case AFTER_DOCTYPE_NAME:
6134 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
6135 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
6136 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
6137 errEofInDoctype();
6138 /*
6139 * Set the DOCTYPE token's force-quirks flag to on.
6140 */
6141 forceQuirks = true;
6142 /*
6143 * Emit that DOCTYPE token.
6144 */
6145 emitDoctypeToken(0);
6146 /*
6147 * Reconsume the EOF character in the data state.
6148 */
6149 break eofloop;
6150 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
6151 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
6152 /* EOF Parse error. */
6153 errEofInPublicId();
6154 /*
6155 * Set the DOCTYPE token's force-quirks flag to on.
6156 */
6157 forceQuirks = true;
6158 /*
6159 * Emit that DOCTYPE token.
6160 */
6161 publicIdentifier = longStrBufToString();
6162 emitDoctypeToken(0);
6163 /*
6164 * Reconsume the EOF character in the data state.
6165 */
6166 break eofloop;
6167 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
6168 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
6169 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
6170 errEofInDoctype();
6171 /*
6172 * Set the DOCTYPE token's force-quirks flag to on.
6173 */
6174 forceQuirks = true;
6175 /*
6176 * Emit that DOCTYPE token.
6177 */
6178 emitDoctypeToken(0);
6179 /*
6180 * Reconsume the EOF character in the data state.
6181 */
6182 break eofloop;
6183 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
6184 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
6185 /* EOF Parse error. */
6186 errEofInSystemId();
6187 /*
6188 * Set the DOCTYPE token's force-quirks flag to on.
6189 */
6190 forceQuirks = true;
6191 /*
6192 * Emit that DOCTYPE token.
6193 */
6194 systemIdentifier = longStrBufToString();
6195 emitDoctypeToken(0);
6196 /*
6197 * Reconsume the EOF character in the data state.
6198 */
6199 break eofloop;
6200 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
6201 errEofInDoctype();
6202 /*
6203 * Set the DOCTYPE token's force-quirks flag to on.
6204 */
6205 forceQuirks = true;
6206 /*
6207 * Emit that DOCTYPE token.
6208 */
6209 emitDoctypeToken(0);
6210 /*
6211 * Reconsume the EOF character in the data state.
6212 */
6213 break eofloop;
6214 case BOGUS_DOCTYPE:
6215 /*
6216 * Emit that DOCTYPE token.
6217 */
6218 emitDoctypeToken(0);
6219 /*
6220 * Reconsume the EOF character in the data state.
6221 */
6222 break eofloop;
6223 case CONSUME_CHARACTER_REFERENCE:
6224 /*
6225 * Unlike the definition is the spec, this state does not
6226 * return a value and never requires the caller to
6227 * backtrack. This state takes care of emitting characters
6228 * or appending to the current attribute value. It also
6229 * takes care of that in the case when consuming the entity
6230 * fails.
6231 */
6232 /*
6233 * This section defines how to consume an entity. This
6234 * definition is used when parsing entities in text and in
6235 * attributes.
6236 *
6237 * The behavior depends on the identity of the next
6238 * character (the one immediately after the U+0026 AMPERSAND
6239 * character):
6240 */
6241
6242 emitOrAppendStrBuf(returnState);
6243 state = returnState;
6244 continue;
6245 case CHARACTER_REFERENCE_HILO_LOOKUP:
6246 errNoNamedCharacterMatch();
6247 emitOrAppendStrBuf(returnState);
6248 state = returnState;
6249 continue;
6250 case CHARACTER_REFERENCE_TAIL:
6251 outer: for (;;) {
6252 char c = '\u0000';
6253 entCol++;
6254 /*
6255 * Consume the maximum number of characters possible,
6256 * with the consumed characters matching one of the
6257 * identifiers in the first column of the named
6258 * character references table (in a case-sensitive
6259 * manner).
6260 */
6261 hiloop: for (;;) {
6262 if (hi == -1) {
6263 break hiloop;
6264 }
6265 if (entCol == NamedCharacters.NAMES[hi].length()) {
6266 break hiloop;
6267 }
6268 if (entCol > NamedCharacters.NAMES[hi].length()) {
6269 break outer;
6270 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
6271 hi--;
6272 } else {
6273 break hiloop;
6274 }
6275 }
6276
6277 loloop: for (;;) {
6278 if (hi < lo) {
6279 break outer;
6280 }
6281 if (entCol == NamedCharacters.NAMES[lo].length()) {
6282 candidate = lo;
6283 strBufMark = strBufLen;
6284 lo++;
6285 } else if (entCol > NamedCharacters.NAMES[lo].length()) {
6286 break outer;
6287 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
6288 lo++;
6289 } else {
6290 break loloop;
6291 }
6292 }
6293 if (hi < lo) {
6294 break outer;
6295 }
6296 continue;
6297 }
6298
6299 if (candidate == -1) {
6300 /*
6301 * If no match can be made, then this is a parse error.
6302 */
6303 errNoNamedCharacterMatch();
6304 emitOrAppendStrBuf(returnState);
6305 state = returnState;
6306 continue eofloop;
6307 } else {
6308 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
6309 if (candidateName.length() == 0
6310 || candidateName.charAt(candidateName.length() - 1) != ';') {
6311 /*
6312 * If the last character matched is not a U+003B
6313 * SEMICOLON (;), there is a parse error.
6314 */
6315 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6316 /*
6317 * If the entity is being consumed as part of an
6318 * attribute, and the last character matched is
6319 * not a U+003B SEMICOLON (;),
6320 */
6321 char ch;
6322 if (strBufMark == strBufLen) {
6323 ch = '\u0000';
6324 } else {
6325 ch = strBuf[strBufMark];
6326 }
6327 if ((ch >= '0' && ch <= '9')
6328 || (ch >= 'A' && ch <= 'Z')
6329 || (ch >= 'a' && ch <= 'z')) {
6330 /*
6331 * and the next character is in the range
6332 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
6333 * U+0041 LATIN CAPITAL LETTER A to U+005A
6334 * LATIN CAPITAL LETTER Z, or U+0061 LATIN
6335 * SMALL LETTER A to U+007A LATIN SMALL
6336 * LETTER Z, then, for historical reasons,
6337 * all the characters that were matched
6338 * after the U+0026 AMPERSAND (&) must be
6339 * unconsumed, and nothing is returned.
6340 */
6341 errNoNamedCharacterMatch();
6342 appendStrBufToLongStrBuf();
6343 state = returnState;
6344 continue eofloop;
6345 }
6346 }
6347 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6348 errUnescapedAmpersandInterpretedAsCharacterReference();
6349 } else {
6350 errNotSemicolonTerminated();
6351 }
6352 }
6353
6354 /*
6355 * Otherwise, return a character token for the character
6356 * corresponding to the entity name (as given by the
6357 * second column of the named character references
6358 * table).
6359 */
6360 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
6361 if (
6362 // [NOCPP[
6363 val.length == 1
6364 // ]NOCPP]
6365 // CPPONLY: val[1] == 0
6366 ) {
6367 emitOrAppendOne(val, returnState);
6368 } else {
6369 emitOrAppendTwo(val, returnState);
6370 }
6371 // this is so complicated!
6372 if (strBufMark < strBufLen) {
6373 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6374 for (int i = strBufMark; i < strBufLen; i++) {
6375 appendLongStrBuf(strBuf[i]);
6376 }
6377 } else {
6378 tokenHandler.characters(strBuf, strBufMark,
6379 strBufLen - strBufMark);
6380 }
6381 }
6382 state = returnState;
6383 continue eofloop;
6384 /*
6385 * If the markup contains I'm ¬it; I tell you, the
6386 * entity is parsed as "not", as in, I'm ¬it; I tell
6387 * you. But if the markup was I'm ∉ I tell you,
6388 * the entity would be parsed as "notin;", resulting in
6389 * I'm ∉ I tell you.
6390 */
6391 }
6392 case CONSUME_NCR:
6393 case DECIMAL_NRC_LOOP:
6394 case HEX_NCR_LOOP:
6395 /*
6396 * If no characters match the range, then don't consume any
6397 * characters (and unconsume the U+0023 NUMBER SIGN
6398 * character and, if appropriate, the X character). This is
6399 * a parse error; nothing is returned.
6400 *
6401 * Otherwise, if the next character is a U+003B SEMICOLON,
6402 * consume that too. If it isn't, there is a parse error.
6403 */
6404 if (!seenDigits) {
6405 errNoDigitsInNCR();
6406 emitOrAppendStrBuf(returnState);
6407 state = returnState;
6408 continue;
6409 } else {
6410 errCharRefLacksSemicolon();
6411 }
6412 // WARNING previous state sets reconsume
6413 handleNcrValue(returnState);
6414 state = returnState;
6415 continue;
6416 case CDATA_RSQB:
6417 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
6418 break eofloop;
6419 case CDATA_RSQB_RSQB:
6420 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
6421 break eofloop;
6422 case DATA:
6423 default:
6424 break eofloop;
6425 }
6426 }
6427 // case DATA:
6428 /*
6429 * EOF Emit an end-of-file token.
6430 */
6431 tokenHandler.eof();
6432 return;
6433 }
6434
6435 private void emitDoctypeToken(int pos) throws SAXException {
6436 cstart = pos + 1;
6437 tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
6438 forceQuirks);
6439 // It is OK and sufficient to release these here, since
6440 // there's no way out of the doctype states than through paths
6441 // that call this method.
6442 doctypeName = null;
6443 Portability.releaseString(publicIdentifier);
6444 publicIdentifier = null;
6445 Portability.releaseString(systemIdentifier);
6446 systemIdentifier = null;
6447 }
6448
6449 @Inline protected char checkChar(@NoLength char[] buf, int pos)
6450 throws SAXException {
6451 return buf[pos];
6452 }
6453
6454 // [NOCPP[
6455
6456 /**
6457 * Returns the alreadyComplainedAboutNonAscii.
6458 *
6459 * @return the alreadyComplainedAboutNonAscii
6460 */
6461 public boolean isAlreadyComplainedAboutNonAscii() {
6462 return true;
6463 }
6464
6465 // ]NOCPP]
6466
6467 public boolean internalEncodingDeclaration(String internalCharset)
6468 throws SAXException {
6469 if (encodingDeclarationHandler != null) {
6470 return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
6471 }
6472 return false;
6473 }
6474
6475 /**
6476 * @param val
6477 * @throws SAXException
6478 */
6479 private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
6480 throws SAXException {
6481 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6482 appendLongStrBuf(val[0]);
6483 appendLongStrBuf(val[1]);
6484 } else {
6485 tokenHandler.characters(val, 0, 2);
6486 }
6487 }
6488
6489 private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
6490 throws SAXException {
6491 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6492 appendLongStrBuf(val[0]);
6493 } else {
6494 tokenHandler.characters(val, 0, 1);
6495 }
6496 }
6497
6498 public void end() throws SAXException {
6499 strBuf = null;
6500 longStrBuf = null;
6501 doctypeName = null;
6502 if (systemIdentifier != null) {
6503 Portability.releaseString(systemIdentifier);
6504 systemIdentifier = null;
6505 }
6506 if (publicIdentifier != null) {
6507 Portability.releaseString(publicIdentifier);
6508 publicIdentifier = null;
6509 }
6510 if (tagName != null) {
6511 tagName.release();
6512 tagName = null;
6513 }
6514 if (attributeName != null) {
6515 attributeName.release();
6516 attributeName = null;
6517 }
6518 tokenHandler.endTokenization();
6519 if (attributes != null) {
6520 attributes.clear(mappingLangToXmlLang);
6521 Portability.delete(attributes);
6522 attributes = null;
6523 }
6524 }
6525
6526 public void requestSuspension() {
6527 shouldSuspend = true;
6528 }
6529
6530 // [NOCPP[
6531
6532 public void becomeConfident() {
6533 confident = true;
6534 }
6535
6536 /**
6537 * Returns the nextCharOnNewLine.
6538 *
6539 * @return the nextCharOnNewLine
6540 */
6541 public boolean isNextCharOnNewLine() {
6542 return false;
6543 }
6544
6545 public boolean isPrevCR() {
6546 return lastCR;
6547 }
6548
6549 /**
6550 * Returns the line.
6551 *
6552 * @return the line
6553 */
6554 public int getLine() {
6555 return -1;
6556 }
6557
6558 /**
6559 * Returns the col.
6560 *
6561 * @return the col
6562 */
6563 public int getCol() {
6564 return -1;
6565 }
6566
6567 // ]NOCPP]
6568
6569 public boolean isInDataState() {
6570 return (stateSave == DATA);
6571 }
6572
6573 public void resetToDataState() {
6574 strBufLen = 0;
6575 longStrBufLen = 0;
6576 stateSave = Tokenizer.DATA;
6577 // line = 1; XXX line numbers
6578 lastCR = false;
6579 index = 0;
6580 forceQuirks = false;
6581 additional = '\u0000';
6582 entCol = -1;
6583 firstCharKey = -1;
6584 lo = 0;
6585 hi = 0; // will always be overwritten before use anyway
6586 candidate = -1;
6587 strBufMark = 0;
6588 prevValue = -1;
6589 value = 0;
6590 seenDigits = false;
6591 endTag = false;
6592 shouldSuspend = false;
6593 initDoctypeFields();
6594 if (tagName != null) {
6595 tagName.release();
6596 tagName = null;
6597 }
6598 if (attributeName != null) {
6599 attributeName.release();
6600 attributeName = null;
6601 }
6602 // [NOCPP[
6603 if (newAttributesEachTime) {
6604 // ]NOCPP]
6605 if (attributes != null) {
6606 Portability.delete(attributes);
6607 attributes = null;
6608 }
6609 // [NOCPP[
6610 }
6611 // ]NOCPP]
6612 }
6613
6614 public void loadState(Tokenizer other) throws SAXException {
6615 strBufLen = other.strBufLen;
6616 if (strBufLen > strBuf.length) {
6617 strBuf = new char[strBufLen];
6618 }
6619 System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);
6620
6621 longStrBufLen = other.longStrBufLen;
6622 if (longStrBufLen > longStrBuf.length) {
6623 longStrBuf = new char[longStrBufLen];
6624 }
6625 System.arraycopy(other.longStrBuf, 0, longStrBuf, 0, longStrBufLen);
6626
6627 stateSave = other.stateSave;
6628 returnStateSave = other.returnStateSave;
6629 endTagExpectation = other.endTagExpectation;
6630 endTagExpectationAsArray = other.endTagExpectationAsArray;
6631 // line = 1; XXX line numbers
6632 lastCR = other.lastCR;
6633 index = other.index;
6634 forceQuirks = other.forceQuirks;
6635 additional = other.additional;
6636 entCol = other.entCol;
6637 firstCharKey = other.firstCharKey;
6638 lo = other.lo;
6639 hi = other.hi;
6640 candidate = other.candidate;
6641 strBufMark = other.strBufMark;
6642 prevValue = other.prevValue;
6643 value = other.value;
6644 seenDigits = other.seenDigits;
6645 endTag = other.endTag;
6646 shouldSuspend = false;
6647
6648 if (other.doctypeName == null) {
6649 doctypeName = null;
6650 } else {
6651 doctypeName = Portability.newLocalFromLocal(other.doctypeName,
6652 interner);
6653 }
6654
6655 Portability.releaseString(systemIdentifier);
6656 if (other.systemIdentifier == null) {
6657 systemIdentifier = null;
6658 } else {
6659 systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
6660 }
6661
6662 Portability.releaseString(publicIdentifier);
6663 if (other.publicIdentifier == null) {
6664 publicIdentifier = null;
6665 } else {
6666 publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
6667 }
6668
6669 if (tagName != null) {
6670 tagName.release();
6671 }
6672 if (other.tagName == null) {
6673 tagName = null;
6674 } else {
6675 tagName = other.tagName.cloneElementName(interner);
6676 }
6677
6678 if (attributeName != null) {
6679 attributeName.release();
6680 }
6681 if (other.attributeName == null) {
6682 attributeName = null;
6683 } else {
6684 attributeName = other.attributeName.cloneAttributeName(interner);
6685 }
6686
6687 if (attributes != null) {
6688 Portability.delete(attributes);
6689 }
6690 if (other.attributes == null) {
6691 attributes = null;
6692 } else {
6693 attributes = other.attributes.cloneAttributes(interner);
6694 }
6695 }
6696
6697 public void initializeWithoutStarting() throws SAXException {
6698 confident = false;
6699 strBuf = new char[64];
6700 longStrBuf = new char[1024];
6701 line = 1;
6702 // [NOCPP[
6703 html4 = false;
6704 metaBoundaryPassed = false;
6705 wantsComments = tokenHandler.wantsComments();
6706 if (!newAttributesEachTime) {
6707 attributes = new HtmlAttributes(mappingLangToXmlLang);
6708 }
6709 // ]NOCPP]
6710 resetToDataState();
6711 }
6712
6713 protected void errGarbageAfterLtSlash() throws SAXException {
6714 }
6715
6716 protected void errLtSlashGt() throws SAXException {
6717 }
6718
6719 protected void errWarnLtSlashInRcdata() throws SAXException {
6720 }
6721
6722 protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
6723 }
6724
6725 protected void errCharRefLacksSemicolon() throws SAXException {
6726 }
6727
6728 protected void errNoDigitsInNCR() throws SAXException {
6729 }
6730
6731 protected void errGtInSystemId() throws SAXException {
6732 }
6733
6734 protected void errGtInPublicId() throws SAXException {
6735 }
6736
6737 protected void errNamelessDoctype() throws SAXException {
6738 }
6739
6740 protected void errConsecutiveHyphens() throws SAXException {
6741 }
6742
6743 protected void errPrematureEndOfComment() throws SAXException {
6744 }
6745
6746 protected void errBogusComment() throws SAXException {
6747 }
6748
6749 protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
6750 }
6751
6752 protected void errSlashNotFollowedByGt() throws SAXException {
6753 }
6754
6755 protected void errHtml4XmlVoidSyntax() throws SAXException {
6756 }
6757
6758 protected void errNoSpaceBetweenAttributes() throws SAXException {
6759 }
6760
6761 protected void errHtml4NonNameInUnquotedAttribute(char c)
6762 throws SAXException {
6763 }
6764
6765 protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
6766 throws SAXException {
6767 }
6768
6769 protected void errAttributeValueMissing() throws SAXException {
6770 }
6771
6772 protected void errBadCharBeforeAttributeNameOrNull(char c)
6773 throws SAXException {
6774 }
6775
6776 protected void errEqualsSignBeforeAttributeName() throws SAXException {
6777 }
6778
6779 protected void errBadCharAfterLt(char c) throws SAXException {
6780 }
6781
6782 protected void errLtGt() throws SAXException {
6783 }
6784
6785 protected void errProcessingInstruction() throws SAXException {
6786 }
6787
6788 protected void errUnescapedAmpersandInterpretedAsCharacterReference()
6789 throws SAXException {
6790 }
6791
6792 protected void errNotSemicolonTerminated() throws SAXException {
6793 }
6794
6795 protected void errNoNamedCharacterMatch() throws SAXException {
6796 }
6797
6798 protected void errQuoteBeforeAttributeName(char c) throws SAXException {
6799 }
6800
6801 protected void errQuoteOrLtInAttributeNameOrNull(char c)
6802 throws SAXException {
6803 }
6804
6805 protected void errExpectedPublicId() throws SAXException {
6806 }
6807
6808 protected void errBogusDoctype() throws SAXException {
6809 }
6810
6811 protected void maybeWarnPrivateUseAstral() throws SAXException {
6812 }
6813
6814 protected void maybeWarnPrivateUse(char ch) throws SAXException {
6815 }
6816
6817 protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
6818 throws SAXException {
6819 }
6820
6821 protected void maybeErrSlashInEndTag(boolean selfClosing)
6822 throws SAXException {
6823 }
6824
6825 protected char errNcrNonCharacter(char ch) throws SAXException {
6826 return ch;
6827 }
6828
6829 protected void errAstralNonCharacter(int ch) throws SAXException {
6830 }
6831
6832 protected void errNcrSurrogate() throws SAXException {
6833 }
6834
6835 protected char errNcrControlChar(char ch) throws SAXException {
6836 return ch;
6837 }
6838
6839 protected void errNcrCr() throws SAXException {
6840 }
6841
6842 protected void errNcrInC1Range() throws SAXException {
6843 }
6844
6845 protected void errEofInPublicId() throws SAXException {
6846 }
6847
6848 protected void errEofInComment() throws SAXException {
6849 }
6850
6851 protected void errEofInDoctype() throws SAXException {
6852 }
6853
6854 protected void errEofInAttributeValue() throws SAXException {
6855 }
6856
6857 protected void errEofInAttributeName() throws SAXException {
6858 }
6859
6860 protected void errEofWithoutGt() throws SAXException {
6861 }
6862
6863 protected void errEofInTagName() throws SAXException {
6864 }
6865
6866 protected void errEofInEndTag() throws SAXException {
6867 }
6868
6869 protected void errEofAfterLt() throws SAXException {
6870 }
6871
6872 protected void errNcrOutOfRange() throws SAXException {
6873 }
6874
6875 protected void errNcrUnassigned() throws SAXException {
6876 }
6877
6878 protected void errDuplicateAttribute() throws SAXException {
6879 }
6880
6881 protected void errEofInSystemId() throws SAXException {
6882 }
6883
6884 protected void errExpectedSystemId() throws SAXException {
6885 }
6886
6887 protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
6888 }
6889
6890 protected void errHyphenHyphenBang() throws SAXException {
6891 }
6892
6893 protected void errNcrControlChar() throws SAXException {
6894 }
6895
6896 protected void errNcrZero() throws SAXException {
6897 }
6898
6899 protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
6900 throws SAXException {
6901 }
6902
6903 protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
6904 }
6905
6906 protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
6907 throws SAXException {
6908 }
6909
6910 protected void noteAttributeWithoutValue() throws SAXException {
6911 }
6912
6913 protected void noteUnquotedAttributeValue() throws SAXException {
6914 }
6915
6916 /**
6917 * Sets the encodingDeclarationHandler.
6918 *
6919 * @param encodingDeclarationHandler
6920 * the encodingDeclarationHandler to set
6921 */
6922 public void setEncodingDeclarationHandler(
6923 EncodingDeclarationHandler encodingDeclarationHandler) {
6924 this.encodingDeclarationHandler = encodingDeclarationHandler;
6925 }
6926
6927 void destructor() {
6928 // The translator will write refcount tracing stuff here
6929 }
6930
6931 // [NOCPP[
6932
6933 /**
6934 * Sets an offset to be added to the position reported to
6935 * <code>TransitionHandler</code>.
6936 *
6937 * @param offset the offset
6938 */
6939 public void setTransitionBaseOffset(int offset) {
6940
6941 }
6942
6943 // ]NOCPP]
6944
6945 }