001 /*
002 * Copyright (c) 2007 Henri Sivonen
003 * Copyright (c) 2008-2010 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.htmlparser.impl;
025
026 import java.io.IOException;
027
028 import nu.validator.htmlparser.annotation.Auto;
029 import nu.validator.htmlparser.annotation.Inline;
030 import nu.validator.htmlparser.common.ByteReadable;
031
032 import org.xml.sax.SAXException;
033
034 public abstract class MetaScanner {
035
036 /**
037 * Constant for "charset".
038 */
039 private static final char[] CHARSET = "harset".toCharArray();
040
041 /**
042 * Constant for "content".
043 */
044 private static final char[] CONTENT = "ontent".toCharArray();
045
046 /**
047 * Constant for "http-equiv".
048 */
049 private static final char[] HTTP_EQUIV = "ttp-equiv".toCharArray();
050
051 /**
052 * Constant for "content-type".
053 */
054 private static final char[] CONTENT_TYPE = "content-type".toCharArray();
055
056 private static final int NO = 0;
057
058 private static final int M = 1;
059
060 private static final int E = 2;
061
062 private static final int T = 3;
063
064 private static final int A = 4;
065
066 private static final int DATA = 0;
067
068 private static final int TAG_OPEN = 1;
069
070 private static final int SCAN_UNTIL_GT = 2;
071
072 private static final int TAG_NAME = 3;
073
074 private static final int BEFORE_ATTRIBUTE_NAME = 4;
075
076 private static final int ATTRIBUTE_NAME = 5;
077
078 private static final int AFTER_ATTRIBUTE_NAME = 6;
079
080 private static final int BEFORE_ATTRIBUTE_VALUE = 7;
081
082 private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8;
083
084 private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9;
085
086 private static final int ATTRIBUTE_VALUE_UNQUOTED = 10;
087
088 private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11;
089
090 private static final int MARKUP_DECLARATION_OPEN = 13;
091
092 private static final int MARKUP_DECLARATION_HYPHEN = 14;
093
094 private static final int COMMENT_START = 15;
095
096 private static final int COMMENT_START_DASH = 16;
097
098 private static final int COMMENT = 17;
099
100 private static final int COMMENT_END_DASH = 18;
101
102 private static final int COMMENT_END = 19;
103
104 private static final int SELF_CLOSING_START_TAG = 20;
105
106 private static final int HTTP_EQUIV_NOT_SEEN = 0;
107
108 private static final int HTTP_EQUIV_CONTENT_TYPE = 1;
109
110 private static final int HTTP_EQUIV_OTHER = 2;
111
112 /**
113 * The data source.
114 */
115 protected ByteReadable readable;
116
117 /**
118 * The state of the state machine that recognizes the tag name "meta".
119 */
120 private int metaState = NO;
121
122 /**
123 * The current position in recognizing the attribute name "content".
124 */
125 private int contentIndex = Integer.MAX_VALUE;
126
127 /**
128 * The current position in recognizing the attribute name "charset".
129 */
130 private int charsetIndex = Integer.MAX_VALUE;
131
132 /**
133 * The current position in recognizing the attribute name "http-equive".
134 */
135 private int httpEquivIndex = Integer.MAX_VALUE;
136
137 /**
138 * The current position in recognizing the attribute value "content-type".
139 */
140 private int contentTypeIndex = Integer.MAX_VALUE;
141
142 /**
143 * The tokenizer state.
144 */
145 protected int stateSave = DATA;
146
147 /**
148 * The currently filled length of strBuf.
149 */
150 private int strBufLen;
151
152 /**
153 * Accumulation buffer for attribute values.
154 */
155 private @Auto char[] strBuf;
156
157 private String content;
158
159 private String charset;
160
161 private int httpEquivState;
162
163 public MetaScanner() {
164 this.readable = null;
165 this.metaState = NO;
166 this.contentIndex = Integer.MAX_VALUE;
167 this.charsetIndex = Integer.MAX_VALUE;
168 this.httpEquivIndex = Integer.MAX_VALUE;
169 this.contentTypeIndex = Integer.MAX_VALUE;
170 this.stateSave = DATA;
171 this.strBufLen = 0;
172 this.strBuf = new char[36];
173 this.content = null;
174 this.charset = null;
175 this.httpEquivState = HTTP_EQUIV_NOT_SEEN;
176 }
177
178 @SuppressWarnings("unused") private void destructor() {
179 Portability.releaseString(content);
180 Portability.releaseString(charset);
181 }
182
183 // [NOCPP[
184
185 /**
186 * Reads a byte from the data source.
187 *
188 * -1 means end.
189 * @return
190 * @throws IOException
191 */
192 protected int read() throws IOException {
193 return readable.readByte();
194 }
195
196 // ]NOCPP]
197
198 // WARNING When editing this, makes sure the bytecode length shown by javap
199 // stays under 8000 bytes!
200 /**
201 * The runs the meta scanning algorithm.
202 */
203 protected final void stateLoop(int state)
204 throws SAXException, IOException {
205 int c = -1;
206 boolean reconsume = false;
207 stateloop: for (;;) {
208 switch (state) {
209 case DATA:
210 dataloop: for (;;) {
211 if (reconsume) {
212 reconsume = false;
213 } else {
214 c = read();
215 }
216 switch (c) {
217 case -1:
218 break stateloop;
219 case '<':
220 state = MetaScanner.TAG_OPEN;
221 break dataloop; // FALL THROUGH continue
222 // stateloop;
223 default:
224 continue;
225 }
226 }
227 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
228 case TAG_OPEN:
229 tagopenloop: for (;;) {
230 c = read();
231 switch (c) {
232 case -1:
233 break stateloop;
234 case 'm':
235 case 'M':
236 metaState = M;
237 state = MetaScanner.TAG_NAME;
238 break tagopenloop;
239 // continue stateloop;
240 case '!':
241 state = MetaScanner.MARKUP_DECLARATION_OPEN;
242 continue stateloop;
243 case '?':
244 case '/':
245 state = MetaScanner.SCAN_UNTIL_GT;
246 continue stateloop;
247 case '>':
248 state = MetaScanner.DATA;
249 continue stateloop;
250 default:
251 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
252 metaState = NO;
253 state = MetaScanner.TAG_NAME;
254 break tagopenloop;
255 // continue stateloop;
256 }
257 state = MetaScanner.DATA;
258 reconsume = true;
259 continue stateloop;
260 }
261 }
262 // FALL THROUGH DON'T REORDER
263 case TAG_NAME:
264 tagnameloop: for (;;) {
265 c = read();
266 switch (c) {
267 case -1:
268 break stateloop;
269 case ' ':
270 case '\t':
271 case '\n':
272 case '\u000C':
273 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
274 break tagnameloop;
275 // continue stateloop;
276 case '/':
277 state = MetaScanner.SELF_CLOSING_START_TAG;
278 continue stateloop;
279 case '>':
280 state = MetaScanner.DATA;
281 continue stateloop;
282 case 'e':
283 case 'E':
284 if (metaState == M) {
285 metaState = E;
286 } else {
287 metaState = NO;
288 }
289 continue;
290 case 't':
291 case 'T':
292 if (metaState == E) {
293 metaState = T;
294 } else {
295 metaState = NO;
296 }
297 continue;
298 case 'a':
299 case 'A':
300 if (metaState == T) {
301 metaState = A;
302 } else {
303 metaState = NO;
304 }
305 continue;
306 default:
307 metaState = NO;
308 continue;
309 }
310 }
311 // FALLTHRU DON'T REORDER
312 case BEFORE_ATTRIBUTE_NAME:
313 beforeattributenameloop: for (;;) {
314 if (reconsume) {
315 reconsume = false;
316 } else {
317 c = read();
318 }
319 /*
320 * Consume the next input character:
321 */
322 switch (c) {
323 case -1:
324 break stateloop;
325 case ' ':
326 case '\t':
327 case '\n':
328 case '\u000C':
329 continue;
330 case '/':
331 state = MetaScanner.SELF_CLOSING_START_TAG;
332 continue stateloop;
333 case '>':
334 if (handleTag()) {
335 break stateloop;
336 }
337 state = DATA;
338 continue stateloop;
339 case 'c':
340 case 'C':
341 contentIndex = 0;
342 charsetIndex = 0;
343 httpEquivIndex = Integer.MAX_VALUE;
344 contentTypeIndex = Integer.MAX_VALUE;
345 state = MetaScanner.ATTRIBUTE_NAME;
346 break beforeattributenameloop;
347 case 'h':
348 case 'H':
349 contentIndex = Integer.MAX_VALUE;
350 charsetIndex = Integer.MAX_VALUE;
351 httpEquivIndex = 0;
352 contentTypeIndex = Integer.MAX_VALUE;
353 state = MetaScanner.ATTRIBUTE_NAME;
354 break beforeattributenameloop;
355 default:
356 contentIndex = Integer.MAX_VALUE;
357 charsetIndex = Integer.MAX_VALUE;
358 httpEquivIndex = Integer.MAX_VALUE;
359 contentTypeIndex = Integer.MAX_VALUE;
360 state = MetaScanner.ATTRIBUTE_NAME;
361 break beforeattributenameloop;
362 // continue stateloop;
363 }
364 }
365 // FALLTHRU DON'T REORDER
366 case ATTRIBUTE_NAME:
367 attributenameloop: for (;;) {
368 c = read();
369 switch (c) {
370 case -1:
371 break stateloop;
372 case ' ':
373 case '\t':
374 case '\n':
375 case '\u000C':
376 state = MetaScanner.AFTER_ATTRIBUTE_NAME;
377 continue stateloop;
378 case '/':
379 state = MetaScanner.SELF_CLOSING_START_TAG;
380 continue stateloop;
381 case '=':
382 strBufLen = 0;
383 contentTypeIndex = 0;
384 state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
385 break attributenameloop;
386 // continue stateloop;
387 case '>':
388 if (handleTag()) {
389 break stateloop;
390 }
391 state = MetaScanner.DATA;
392 continue stateloop;
393 default:
394 if (metaState == A) {
395 if (c >= 'A' && c <= 'Z') {
396 c += 0x20;
397 }
398 if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) {
399 ++contentIndex;
400 } else {
401 contentIndex = Integer.MAX_VALUE;
402 }
403 if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) {
404 ++charsetIndex;
405 } else {
406 charsetIndex = Integer.MAX_VALUE;
407 }
408 if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) {
409 ++httpEquivIndex;
410 } else {
411 httpEquivIndex = Integer.MAX_VALUE;
412 }
413 }
414 continue;
415 }
416 }
417 // FALLTHRU DON'T REORDER
418 case BEFORE_ATTRIBUTE_VALUE:
419 beforeattributevalueloop: for (;;) {
420 c = read();
421 switch (c) {
422 case -1:
423 break stateloop;
424 case ' ':
425 case '\t':
426 case '\n':
427 case '\u000C':
428 continue;
429 case '"':
430 state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED;
431 break beforeattributevalueloop;
432 // continue stateloop;
433 case '\'':
434 state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED;
435 continue stateloop;
436 case '>':
437 if (handleTag()) {
438 break stateloop;
439 }
440 state = MetaScanner.DATA;
441 continue stateloop;
442 default:
443 handleCharInAttributeValue(c);
444 state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED;
445 continue stateloop;
446 }
447 }
448 // FALLTHRU DON'T REORDER
449 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
450 attributevaluedoublequotedloop: for (;;) {
451 if (reconsume) {
452 reconsume = false;
453 } else {
454 c = read();
455 }
456 switch (c) {
457 case -1:
458 break stateloop;
459 case '"':
460 handleAttributeValue();
461 state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
462 break attributevaluedoublequotedloop;
463 // continue stateloop;
464 default:
465 handleCharInAttributeValue(c);
466 continue;
467 }
468 }
469 // FALLTHRU DON'T REORDER
470 case AFTER_ATTRIBUTE_VALUE_QUOTED:
471 afterattributevaluequotedloop: for (;;) {
472 c = read();
473 switch (c) {
474 case -1:
475 break stateloop;
476 case ' ':
477 case '\t':
478 case '\n':
479 case '\u000C':
480 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
481 continue stateloop;
482 case '/':
483 state = MetaScanner.SELF_CLOSING_START_TAG;
484 break afterattributevaluequotedloop;
485 // continue stateloop;
486 case '>':
487 if (handleTag()) {
488 break stateloop;
489 }
490 state = MetaScanner.DATA;
491 continue stateloop;
492 default:
493 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
494 reconsume = true;
495 continue stateloop;
496 }
497 }
498 // FALLTHRU DON'T REORDER
499 case SELF_CLOSING_START_TAG:
500 c = read();
501 switch (c) {
502 case -1:
503 break stateloop;
504 case '>':
505 if (handleTag()) {
506 break stateloop;
507 }
508 state = MetaScanner.DATA;
509 continue stateloop;
510 default:
511 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
512 reconsume = true;
513 continue stateloop;
514 }
515 // XXX reorder point
516 case ATTRIBUTE_VALUE_UNQUOTED:
517 for (;;) {
518 if (reconsume) {
519 reconsume = false;
520 } else {
521 c = read();
522 }
523 switch (c) {
524 case -1:
525 break stateloop;
526 case ' ':
527 case '\t':
528 case '\n':
529
530 case '\u000C':
531 handleAttributeValue();
532 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
533 continue stateloop;
534 case '>':
535 handleAttributeValue();
536 if (handleTag()) {
537 break stateloop;
538 }
539 state = MetaScanner.DATA;
540 continue stateloop;
541 default:
542 handleCharInAttributeValue(c);
543 continue;
544 }
545 }
546 // XXX reorder point
547 case AFTER_ATTRIBUTE_NAME:
548 for (;;) {
549 c = read();
550 switch (c) {
551 case -1:
552 break stateloop;
553 case ' ':
554 case '\t':
555 case '\n':
556 case '\u000C':
557 continue;
558 case '/':
559 handleAttributeValue();
560 state = MetaScanner.SELF_CLOSING_START_TAG;
561 continue stateloop;
562 case '=':
563 strBufLen = 0;
564 contentTypeIndex = 0;
565 state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
566 continue stateloop;
567 case '>':
568 handleAttributeValue();
569 if (handleTag()) {
570 break stateloop;
571 }
572 state = MetaScanner.DATA;
573 continue stateloop;
574 case 'c':
575 case 'C':
576 contentIndex = 0;
577 charsetIndex = 0;
578 state = MetaScanner.ATTRIBUTE_NAME;
579 continue stateloop;
580 default:
581 contentIndex = -1;
582 charsetIndex = -1;
583 state = MetaScanner.ATTRIBUTE_NAME;
584 continue stateloop;
585 }
586 }
587 // XXX reorder point
588 case MARKUP_DECLARATION_OPEN:
589 markupdeclarationopenloop: for (;;) {
590 c = read();
591 switch (c) {
592 case -1:
593 break stateloop;
594 case '-':
595 state = MetaScanner.MARKUP_DECLARATION_HYPHEN;
596 break markupdeclarationopenloop;
597 // continue stateloop;
598 default:
599 state = MetaScanner.SCAN_UNTIL_GT;
600 reconsume = true;
601 continue stateloop;
602 }
603 }
604 // FALLTHRU DON'T REORDER
605 case MARKUP_DECLARATION_HYPHEN:
606 markupdeclarationhyphenloop: for (;;) {
607 c = read();
608 switch (c) {
609 case -1:
610 break stateloop;
611 case '-':
612 state = MetaScanner.COMMENT_START;
613 break markupdeclarationhyphenloop;
614 // continue stateloop;
615 default:
616 state = MetaScanner.SCAN_UNTIL_GT;
617 reconsume = true;
618 continue stateloop;
619 }
620 }
621 // FALLTHRU DON'T REORDER
622 case COMMENT_START:
623 commentstartloop: for (;;) {
624 c = read();
625 switch (c) {
626 case -1:
627 break stateloop;
628 case '-':
629 state = MetaScanner.COMMENT_START_DASH;
630 continue stateloop;
631 case '>':
632 state = MetaScanner.DATA;
633 continue stateloop;
634 default:
635 state = MetaScanner.COMMENT;
636 break commentstartloop;
637 // continue stateloop;
638 }
639 }
640 // FALLTHRU DON'T REORDER
641 case COMMENT:
642 commentloop: for (;;) {
643 c = read();
644 switch (c) {
645 case -1:
646 break stateloop;
647 case '-':
648 state = MetaScanner.COMMENT_END_DASH;
649 break commentloop;
650 // continue stateloop;
651 default:
652 continue;
653 }
654 }
655 // FALLTHRU DON'T REORDER
656 case COMMENT_END_DASH:
657 commentenddashloop: for (;;) {
658 c = read();
659 switch (c) {
660 case -1:
661 break stateloop;
662 case '-':
663 state = MetaScanner.COMMENT_END;
664 break commentenddashloop;
665 // continue stateloop;
666 default:
667 state = MetaScanner.COMMENT;
668 continue stateloop;
669 }
670 }
671 // FALLTHRU DON'T REORDER
672 case COMMENT_END:
673 for (;;) {
674 c = read();
675 switch (c) {
676 case -1:
677 break stateloop;
678 case '>':
679 state = MetaScanner.DATA;
680 continue stateloop;
681 case '-':
682 continue;
683 default:
684 state = MetaScanner.COMMENT;
685 continue stateloop;
686 }
687 }
688 // XXX reorder point
689 case COMMENT_START_DASH:
690 c = read();
691 switch (c) {
692 case -1:
693 break stateloop;
694 case '-':
695 state = MetaScanner.COMMENT_END;
696 continue stateloop;
697 case '>':
698 state = MetaScanner.DATA;
699 continue stateloop;
700 default:
701 state = MetaScanner.COMMENT;
702 continue stateloop;
703 }
704 // XXX reorder point
705 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
706 for (;;) {
707 if (reconsume) {
708 reconsume = false;
709 } else {
710 c = read();
711 }
712 switch (c) {
713 case -1:
714 break stateloop;
715 case '\'':
716 handleAttributeValue();
717 state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
718 continue stateloop;
719 default:
720 handleCharInAttributeValue(c);
721 continue;
722 }
723 }
724 // XXX reorder point
725 case SCAN_UNTIL_GT:
726 for (;;) {
727 if (reconsume) {
728 reconsume = false;
729 } else {
730 c = read();
731 }
732 switch (c) {
733 case -1:
734 break stateloop;
735 case '>':
736 state = MetaScanner.DATA;
737 continue stateloop;
738 default:
739 continue;
740 }
741 }
742 }
743 }
744 stateSave = state;
745 }
746
747 private void handleCharInAttributeValue(int c) {
748 if (metaState == A) {
749 if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) {
750 addToBuffer(c);
751 } else if (httpEquivIndex == HTTP_EQUIV.length) {
752 if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) {
753 ++contentTypeIndex;
754 } else {
755 contentTypeIndex = Integer.MAX_VALUE;
756 }
757 }
758 }
759 }
760
761 @Inline private int toAsciiLowerCase(int c) {
762 if (c >= 'A' && c <= 'Z') {
763 return c + 0x20;
764 }
765 return c;
766 }
767
768 /**
769 * Adds a character to the accumulation buffer.
770 * @param c the character to add
771 */
772 private void addToBuffer(int c) {
773 if (strBufLen == strBuf.length) {
774 char[] newBuf = new char[strBuf.length + (strBuf.length << 1)];
775 System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
776 strBuf = newBuf;
777 }
778 strBuf[strBufLen++] = (char)c;
779 }
780
781 /**
782 * Attempts to extract a charset name from the accumulation buffer.
783 * @return <code>true</code> if successful
784 * @throws SAXException
785 */
786 private void handleAttributeValue() throws SAXException {
787 if (metaState != A) {
788 return;
789 }
790 if (contentIndex == CONTENT.length && content == null) {
791 content = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
792 return;
793 }
794 if (charsetIndex == CHARSET.length && charset == null) {
795 charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
796 return;
797 }
798 if (httpEquivIndex == HTTP_EQUIV.length
799 && httpEquivState == HTTP_EQUIV_NOT_SEEN) {
800 httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE
801 : HTTP_EQUIV_OTHER;
802 return;
803 }
804 }
805
806 private boolean handleTag() throws SAXException {
807 boolean stop = handleTagInner();
808 Portability.releaseString(content);
809 content = null;
810 Portability.releaseString(charset);
811 charset = null;
812 httpEquivState = HTTP_EQUIV_NOT_SEEN;
813 return stop;
814 }
815
816 private boolean handleTagInner() throws SAXException {
817 if (charset != null && tryCharset(charset)) {
818 return true;
819 }
820 if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) {
821 String extract = TreeBuilder.extractCharsetFromContent(content);
822 if (extract == null) {
823 return false;
824 }
825 boolean success = tryCharset(extract);
826 Portability.releaseString(extract);
827 return success;
828 }
829 return false;
830 }
831
832 /**
833 * Tries to switch to an encoding.
834 *
835 * @param encoding
836 * @return <code>true</code> if successful
837 * @throws SAXException
838 */
839 protected abstract boolean tryCharset(String encoding) throws SAXException;
840
841 }