001 /*
002 * Copyright (c) 2005, 2006, 2007 Henri Sivonen
003 * Copyright (c) 2007-2008 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.htmlparser.io;
025
026 import java.io.IOException;
027 import java.io.InputStream;
028 import java.io.Reader;
029 import java.nio.charset.UnsupportedCharsetException;
030
031 import nu.validator.htmlparser.common.CharacterHandler;
032 import nu.validator.htmlparser.common.EncodingDeclarationHandler;
033 import nu.validator.htmlparser.common.Heuristics;
034 import nu.validator.htmlparser.common.TokenHandler;
035 import nu.validator.htmlparser.common.TransitionHandler;
036 import nu.validator.htmlparser.common.XmlViolationPolicy;
037 import nu.validator.htmlparser.extra.NormalizationChecker;
038 import nu.validator.htmlparser.impl.ErrorReportingTokenizer;
039 import nu.validator.htmlparser.impl.Tokenizer;
040 import nu.validator.htmlparser.impl.TreeBuilder;
041 import nu.validator.htmlparser.impl.UTF16Buffer;
042 import nu.validator.htmlparser.rewindable.RewindableInputStream;
043
044 import org.xml.sax.ErrorHandler;
045 import org.xml.sax.InputSource;
046 import org.xml.sax.Locator;
047 import org.xml.sax.SAXException;
048 import org.xml.sax.SAXParseException;
049
050 public class Driver implements EncodingDeclarationHandler {
051
052 /**
053 * The input UTF-16 code unit stream. If a byte stream was given, this
054 * object is an instance of <code>HtmlInputStreamReader</code>.
055 */
056 private Reader reader;
057
/**
 * The reference to the rewindable byte stream. <code>null</code> if
 * prohibited or no longer needed.
 */
062 private RewindableInputStream rewindableInputStream;
063
064 private boolean swallowBom;
065
066 private Encoding characterEncoding;
067
068 private boolean allowRewinding = true;
069
070 private Heuristics heuristics = Heuristics.NONE;
071
072 private final Tokenizer tokenizer;
073
074 private Confidence confidence;
075
076 /**
077 * Used for NFC checking if non-<code>null</code>, source code capture,
078 * etc.
079 */
080 private CharacterHandler[] characterHandlers = new CharacterHandler[0];
081
/**
 * Creates a driver for the given tokenizer and registers this driver as
 * the tokenizer's encoding declaration handler.
 *
 * @param tokenizer
 *            the tokenizer that will be fed by this driver
 */
public Driver(Tokenizer tokenizer) {
    this.tokenizer = tokenizer;
    this.tokenizer.setEncodingDeclarationHandler(this);
}
086
087 /**
088 * Returns the allowRewinding.
089 *
090 * @return the allowRewinding
091 */
092 public boolean isAllowRewinding() {
093 return allowRewinding;
094 }
095
096 /**
097 * Sets the allowRewinding.
098 *
099 * @param allowRewinding
100 * the allowRewinding to set
101 */
102 public void setAllowRewinding(boolean allowRewinding) {
103 this.allowRewinding = allowRewinding;
104 }
105
106 /**
107 * Turns NFC checking on or off.
108 *
109 * @param enable
110 * <code>true</code> if checking on
111 */
112 public void setCheckingNormalization(boolean enable) {
113 if (enable) {
114 if (isCheckingNormalization()) {
115 return;
116 } else {
117 NormalizationChecker normalizationChecker = new NormalizationChecker(tokenizer);
118 normalizationChecker.setErrorHandler(tokenizer.getErrorHandler());
119
120 }
121 } else {
122 if (isCheckingNormalization()) {
123 CharacterHandler[] newHandlers = new CharacterHandler[characterHandlers.length - 1];
124 boolean skipped = false;
125 int j = 0;
126 for (int i = 0; i < characterHandlers.length; i++) {
127 CharacterHandler ch = characterHandlers[i];
128 if (!(!skipped && (ch instanceof NormalizationChecker))) {
129 newHandlers[j] = ch;
130 j++;
131 }
132 }
133 characterHandlers = newHandlers;
134 } else {
135 return;
136 }
137 }
138 }
139
140 public void addCharacterHandler(CharacterHandler characterHandler) {
141 if (characterHandler == null) {
142 throw new IllegalArgumentException("Null argument.");
143 }
144 CharacterHandler[] newHandlers = new CharacterHandler[characterHandlers.length + 1];
145 System.arraycopy(characterHandlers, 0, newHandlers, 0,
146 characterHandlers.length);
147 newHandlers[characterHandlers.length] = characterHandler;
148 characterHandlers = newHandlers;
149 }
150
151 /**
152 * Query if checking normalization.
153 *
154 * @return <code>true</code> if checking on
155 */
156 public boolean isCheckingNormalization() {
157 for (int i = 0; i < characterHandlers.length; i++) {
158 CharacterHandler ch = characterHandlers[i];
159 if (ch instanceof NormalizationChecker) {
160 return true;
161 }
162 }
163 return false;
164 }
165
166 /**
167 * Runs the tokenization. This is the main entry point.
168 *
169 * @param is
170 * the input source
171 * @throws SAXException
172 * on fatal error (if configured to treat XML violations as
173 * fatal) or if the token handler threw
174 * @throws IOException
175 * if the stream threw
176 */
177 public void tokenize(InputSource is) throws SAXException, IOException {
178 if (is == null) {
179 throw new IllegalArgumentException("InputSource was null.");
180 }
181 tokenizer.start();
182 confidence = Confidence.TENTATIVE;
183 swallowBom = true;
184 rewindableInputStream = null;
185 tokenizer.initLocation(is.getPublicId(), is.getSystemId());
186 this.reader = is.getCharacterStream();
187 this.characterEncoding = encodingFromExternalDeclaration(is.getEncoding());
188 if (this.reader == null) {
189 InputStream inputStream = is.getByteStream();
190 if (inputStream == null) {
191 throw new SAXException("Both streams in InputSource were null.");
192 }
193 if (this.characterEncoding == null) {
194 if (allowRewinding) {
195 inputStream = rewindableInputStream = new RewindableInputStream(
196 inputStream);
197 }
198 this.reader = new HtmlInputStreamReader(inputStream,
199 tokenizer.getErrorHandler(), tokenizer, this, heuristics);
200 } else {
201 becomeConfident();
202 this.reader = new HtmlInputStreamReader(inputStream,
203 tokenizer.getErrorHandler(), tokenizer, this, this.characterEncoding);
204 }
205 } else {
206 becomeConfident();
207 }
208 Throwable t = null;
209 try {
210 for (;;) {
211 try {
212 for (int i = 0; i < characterHandlers.length; i++) {
213 CharacterHandler ch = characterHandlers[i];
214 ch.start();
215 }
216 runStates();
217 if (confidence == Confidence.TENTATIVE
218 && !tokenizer.isAlreadyComplainedAboutNonAscii()) {
219 warnWithoutLocation("The character encoding of the document was not declared.");
220 }
221 break;
222 } catch (ReparseException e) {
223 if (rewindableInputStream == null) {
224 tokenizer.fatal("Changing encoding at this point would need non-streamable behavior.");
225 } else {
226 rewindableInputStream.rewind();
227 becomeConfident();
228 this.reader = new HtmlInputStreamReader(
229 rewindableInputStream, tokenizer.getErrorHandler(), tokenizer,
230 this, this.characterEncoding);
231 }
232 continue;
233 }
234 }
235 } catch (Throwable tr) {
236 t = tr;
237 } finally {
238 try {
239 tokenizer.end();
240 characterEncoding = null;
241 for (int i = 0; i < characterHandlers.length; i++) {
242 CharacterHandler ch = characterHandlers[i];
243 ch.end();
244 }
245 reader.close();
246 reader = null;
247 rewindableInputStream = null;
248 } catch (Throwable tr) {
249 if (t == null) {
250 t = tr;
251 } // else drop the later throwable
252 }
253 if (t != null) {
254 if (t instanceof IOException) {
255 throw (IOException) t;
256 } else if (t instanceof SAXException) {
257 throw (SAXException) t;
258 } else if (t instanceof RuntimeException) {
259 throw (RuntimeException) t;
260 } else if (t instanceof Error) {
261 throw (Error) t;
262 } else {
263 // impossible
264 throw new RuntimeException(t);
265 }
266 }
267 }
268 }
269
270 void dontSwallowBom() {
271 swallowBom = false;
272 }
273
274 private void runStates() throws SAXException, IOException {
275 char[] buffer = new char[2048];
276 UTF16Buffer bufr = new UTF16Buffer(buffer, 0, 0);
277 boolean lastWasCR = false;
278 int len = -1;
279 if ((len = reader.read(buffer)) != -1) {
280 assert len > 0;
281 int streamOffset = 0;
282 int offset = 0;
283 int length = len;
284 if (swallowBom) {
285 if (buffer[0] == '\uFEFF') {
286 streamOffset = -1;
287 offset = 1;
288 length--;
289 }
290 }
291 if (length > 0) {
292 for (int i = 0; i < characterHandlers.length; i++) {
293 CharacterHandler ch = characterHandlers[i];
294 ch.characters(buffer, offset, length);
295 }
296 tokenizer.setTransitionBaseOffset(streamOffset);
297 bufr.setStart(offset);
298 bufr.setEnd(offset + length);
299 while (bufr.hasMore()) {
300 bufr.adjust(lastWasCR);
301 lastWasCR = false;
302 if (bufr.hasMore()) {
303 lastWasCR = tokenizer.tokenizeBuffer(bufr);
304 }
305 }
306 }
307 streamOffset = length;
308 while ((len = reader.read(buffer)) != -1) {
309 assert len > 0;
310 for (int i = 0; i < characterHandlers.length; i++) {
311 CharacterHandler ch = characterHandlers[i];
312 ch.characters(buffer, 0, len);
313 }
314 tokenizer.setTransitionBaseOffset(streamOffset);
315 bufr.setStart(0);
316 bufr.setEnd(len);
317 while (bufr.hasMore()) {
318 bufr.adjust(lastWasCR);
319 lastWasCR = false;
320 if (bufr.hasMore()) {
321 lastWasCR = tokenizer.tokenizeBuffer(bufr);
322 }
323 }
324 streamOffset += len;
325 }
326 }
327 tokenizer.eof();
328 }
329
330 public void setEncoding(Encoding encoding, Confidence confidence) {
331 this.characterEncoding = encoding;
332 if (confidence == Confidence.CERTAIN) {
333 becomeConfident();
334 }
335 }
336
337 public boolean internalEncodingDeclaration(String internalCharset)
338 throws SAXException {
339 try {
340 internalCharset = Encoding.toAsciiLowerCase(internalCharset);
341 Encoding cs;
342 if ("utf-16".equals(internalCharset)
343 || "utf-16be".equals(internalCharset)
344 || "utf-16le".equals(internalCharset)) {
345 tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
346 + internalCharset
347 + "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201Cutf-8\u201D.");
348 cs = Encoding.UTF8;
349 internalCharset = "utf-8";
350 } else {
351 cs = Encoding.forName(internalCharset);
352 }
353 Encoding actual = cs.getActualHtmlEncoding();
354 if (actual == null) {
355 actual = cs;
356 }
357 if (!actual.isAsciiSuperset()) {
358 tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
359 + internalCharset
360 + "\u201D which is not an ASCII superset. Not changing the encoding.");
361 return false;
362 }
363 if (characterEncoding == null) {
364 // Reader case
365 return true;
366 }
367 if (characterEncoding == actual) {
368 becomeConfident();
369 return true;
370 }
371 if (confidence == Confidence.CERTAIN && actual != characterEncoding) {
372 tokenizer.errTreeBuilder("Internal encoding declaration \u201C"
373 + internalCharset
374 + "\u201D disagrees with the actual encoding of the document (\u201C"
375 + characterEncoding.getCanonName() + "\u201D).");
376 } else {
377 Encoding newEnc = whineAboutEncodingAndReturnActual(
378 internalCharset, cs);
379 tokenizer.errTreeBuilder("Changing character encoding \u201C"
380 + internalCharset + "\u201D and reparsing.");
381 characterEncoding = newEnc;
382 throw new ReparseException();
383 }
384 return true;
385 } catch (UnsupportedCharsetException e) {
386 tokenizer.errTreeBuilder("Internal encoding declaration named an unsupported chararacter encoding \u201C"
387 + internalCharset + "\u201D.");
388 return false;
389 }
390 }
391
392 /**
393 *
394 */
395 private void becomeConfident() {
396 if (rewindableInputStream != null) {
397 rewindableInputStream.willNotRewind();
398 }
399 confidence = Confidence.CERTAIN;
400 tokenizer.becomeConfident();
401 }
402
403 /**
404 * Sets the encoding sniffing heuristics.
405 *
406 * @param heuristics
407 * the heuristics to set
408 */
409 public void setHeuristics(Heuristics heuristics) {
410 this.heuristics = heuristics;
411 }
412
413 /**
414 * Reports a warning without line/col
415 *
416 * @param message
417 * the message
418 * @throws SAXException
419 */
420 protected void warnWithoutLocation(String message) throws SAXException {
421 ErrorHandler errorHandler = tokenizer.getErrorHandler();
422 if (errorHandler == null) {
423 return;
424 }
425 SAXParseException spe = new SAXParseException(message, null,
426 tokenizer.getSystemId(), -1, -1);
427 errorHandler.warning(spe);
428 }
429
430 /**
431 * Initializes a decoder from external decl.
432 */
433 protected Encoding encodingFromExternalDeclaration(String encoding)
434 throws SAXException {
435 if (encoding == null) {
436 return null;
437 }
438 encoding = Encoding.toAsciiLowerCase(encoding);
439 try {
440 Encoding cs = Encoding.forName(encoding);
441 if ("utf-16".equals(cs.getCanonName())
442 || "utf-32".equals(cs.getCanonName())) {
443 swallowBom = false;
444 }
445 return whineAboutEncodingAndReturnActual(encoding, cs);
446 } catch (UnsupportedCharsetException e) {
447 tokenizer.err("Unsupported character encoding name: \u201C" + encoding
448 + "\u201D. Will sniff.");
449 swallowBom = true;
450 }
451 return null; // keep the compiler happy
452 }
453
454 /**
455 * @param encoding
456 * @param cs
457 * @return
458 * @throws SAXException
459 */
460 protected Encoding whineAboutEncodingAndReturnActual(String encoding,
461 Encoding cs) throws SAXException {
462 String canonName = cs.getCanonName();
463 if (!cs.isRegistered()) {
464 if (encoding.startsWith("x-")) {
465 tokenizer.err("The encoding \u201C"
466 + encoding
467 + "\u201D is not an IANA-registered encoding. (Charmod C022)");
468 } else {
469 tokenizer.err("The encoding \u201C"
470 + encoding
471 + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)");
472 }
473 } else if (!canonName.equals(encoding)) {
474 tokenizer.err("The encoding \u201C"
475 + encoding
476 + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
477 + canonName + "\u201D. (Charmod C024)");
478 }
479 if (cs.isShouldNot()) {
480 tokenizer.warn("Authors should not use the character encoding \u201C"
481 + encoding
482 + "\u201D. It is recommended to use \u201CUTF-8\u201D.");
483 } else if (cs.isLikelyEbcdic()) {
484 tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D.");
485 } else if (cs.isObscure()) {
486 tokenizer.warn("The character encoding \u201C"
487 + encoding
488 + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
489 }
490 Encoding actual = cs.getActualHtmlEncoding();
491 if (actual == null) {
492 return cs;
493 } else {
494 tokenizer.warn("Using \u201C" + actual.getCanonName()
495 + "\u201D instead of the declared encoding \u201C"
496 + encoding + "\u201D.");
497 return actual;
498 }
499 }
500
501 private class ReparseException extends SAXException {
502
503 }
504
505 void notifyAboutMetaBoundary() {
506 tokenizer.notifyAboutMetaBoundary();
507 }
508
509 /**
510 * @param commentPolicy
511 * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
512 */
513 public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
514 tokenizer.setCommentPolicy(commentPolicy);
515 }
516
517 /**
518 * @param contentNonXmlCharPolicy
519 * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
520 */
521 public void setContentNonXmlCharPolicy(
522 XmlViolationPolicy contentNonXmlCharPolicy) {
523 tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
524 }
525
526 /**
527 * @param contentSpacePolicy
528 * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
529 */
530 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
531 tokenizer.setContentSpacePolicy(contentSpacePolicy);
532 }
533
534 /**
535 * @param eh
536 * @see nu.validator.htmlparser.impl.Tokenizer#setErrorHandler(org.xml.sax.ErrorHandler)
537 */
538 public void setErrorHandler(ErrorHandler eh) {
539 tokenizer.setErrorHandler(eh);
540 for (int i = 0; i < characterHandlers.length; i++) {
541 CharacterHandler ch = characterHandlers[i];
542 if (ch instanceof NormalizationChecker) {
543 NormalizationChecker nc = (NormalizationChecker) ch;
544 nc.setErrorHandler(eh);
545 }
546 }
547 }
548
549 public void setTransitionHandler(TransitionHandler transitionHandler) {
550 if (tokenizer instanceof ErrorReportingTokenizer) {
551 ErrorReportingTokenizer ert = (ErrorReportingTokenizer) tokenizer;
552 ert.setTransitionHandler(transitionHandler);
553 } else if (transitionHandler != null) {
554 throw new IllegalStateException("Attempt to set a transition handler on a plain tokenizer.");
555 }
556 }
557
558 /**
559 * @param html4ModeCompatibleWithXhtml1Schemata
560 * @see nu.validator.htmlparser.impl.Tokenizer#setHtml4ModeCompatibleWithXhtml1Schemata(boolean)
561 */
562 public void setHtml4ModeCompatibleWithXhtml1Schemata(
563 boolean html4ModeCompatibleWithXhtml1Schemata) {
564 tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
565 }
566
567 /**
568 * @param mappingLangToXmlLang
569 * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
570 */
571 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
572 tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
573 }
574
575 /**
576 * @param namePolicy
577 * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
578 */
579 public void setNamePolicy(XmlViolationPolicy namePolicy) {
580 tokenizer.setNamePolicy(namePolicy);
581 }
582
583 /**
584 * @param xmlnsPolicy
585 * @see nu.validator.htmlparser.impl.Tokenizer#setXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
586 */
587 public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
588 tokenizer.setXmlnsPolicy(xmlnsPolicy);
589 }
590
591 public String getCharacterEncoding() throws SAXException {
592 return characterEncoding.getCanonName();
593 }
594
595 public Locator getDocumentLocator() {
596 return tokenizer;
597 }
598 }