001 /* 002 * Copyright (c) 2005, 2006, 2007 Henri Sivonen 003 * Copyright (c) 2007-2008 Mozilla Foundation 004 * 005 * Permission is hereby granted, free of charge, to any person obtaining a 006 * copy of this software and associated documentation files (the "Software"), 007 * to deal in the Software without restriction, including without limitation 008 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 009 * and/or sell copies of the Software, and to permit persons to whom the 010 * Software is furnished to do so, subject to the following conditions: 011 * 012 * The above copyright notice and this permission notice shall be included in 013 * all copies or substantial portions of the Software. 014 * 015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 021 * DEALINGS IN THE SOFTWARE. 022 */ 023 024 package nu.validator.htmlparser.io; 025 026 import java.io.IOException; 027 import java.io.InputStream; 028 import java.io.Reader; 029 import java.nio.charset.UnsupportedCharsetException; 030 031 import nu.validator.htmlparser.common.CharacterHandler; 032 import nu.validator.htmlparser.common.EncodingDeclarationHandler; 033 import nu.validator.htmlparser.common.Heuristics; 034 import nu.validator.htmlparser.common.TokenHandler; 035 import nu.validator.htmlparser.common.TransitionHandler; 036 import nu.validator.htmlparser.common.XmlViolationPolicy; 037 import nu.validator.htmlparser.extra.NormalizationChecker; 038 import nu.validator.htmlparser.impl.ErrorReportingTokenizer; 039 import nu.validator.htmlparser.impl.Tokenizer; 040 import nu.validator.htmlparser.impl.TreeBuilder; 041 import nu.validator.htmlparser.impl.UTF16Buffer; 042 import nu.validator.htmlparser.rewindable.RewindableInputStream; 043 044 import org.xml.sax.ErrorHandler; 045 import org.xml.sax.InputSource; 046 import org.xml.sax.Locator; 047 import org.xml.sax.SAXException; 048 import org.xml.sax.SAXParseException; 049 050 public class Driver implements EncodingDeclarationHandler { 051 052 /** 053 * The input UTF-16 code unit stream. If a byte stream was given, this 054 * object is an instance of <code>HtmlInputStreamReader</code>. 055 */ 056 private Reader reader; 057 058 /** 059 * The reference to the rewindable byte stream. <code>null</code> if p 060 * rohibited or no longer needed. 061 */ 062 private RewindableInputStream rewindableInputStream; 063 064 private boolean swallowBom; 065 066 private Encoding characterEncoding; 067 068 private boolean allowRewinding = true; 069 070 private Heuristics heuristics = Heuristics.NONE; 071 072 private final Tokenizer tokenizer; 073 074 private Confidence confidence; 075 076 /** 077 * Used for NFC checking if non-<code>null</code>, source code capture, 078 * etc. 079 */ 080 private CharacterHandler[] characterHandlers = new CharacterHandler[0]; 081 082 public Driver(Tokenizer tokenizer) { 083 this.tokenizer = tokenizer; 084 tokenizer.setEncodingDeclarationHandler(this); 085 } 086 087 /** 088 * Returns the allowRewinding. 089 * 090 * @return the allowRewinding 091 */ 092 public boolean isAllowRewinding() { 093 return allowRewinding; 094 } 095 096 /** 097 * Sets the allowRewinding. 098 * 099 * @param allowRewinding 100 * the allowRewinding to set 101 */ 102 public void setAllowRewinding(boolean allowRewinding) { 103 this.allowRewinding = allowRewinding; 104 } 105 106 /** 107 * Turns NFC checking on or off. 108 * 109 * @param enable 110 * <code>true</code> if checking on 111 */ 112 public void setCheckingNormalization(boolean enable) { 113 if (enable) { 114 if (isCheckingNormalization()) { 115 return; 116 } else { 117 NormalizationChecker normalizationChecker = new NormalizationChecker(tokenizer); 118 normalizationChecker.setErrorHandler(tokenizer.getErrorHandler()); 119 120 } 121 } else { 122 if (isCheckingNormalization()) { 123 CharacterHandler[] newHandlers = new CharacterHandler[characterHandlers.length - 1]; 124 boolean skipped = false; 125 int j = 0; 126 for (int i = 0; i < characterHandlers.length; i++) { 127 CharacterHandler ch = characterHandlers[i]; 128 if (!(!skipped && (ch instanceof NormalizationChecker))) { 129 newHandlers[j] = ch; 130 j++; 131 } 132 } 133 characterHandlers = newHandlers; 134 } else { 135 return; 136 } 137 } 138 } 139 140 public void addCharacterHandler(CharacterHandler characterHandler) { 141 if (characterHandler == null) { 142 throw new IllegalArgumentException("Null argument."); 143 } 144 CharacterHandler[] newHandlers = new CharacterHandler[characterHandlers.length + 1]; 145 System.arraycopy(characterHandlers, 0, newHandlers, 0, 146 characterHandlers.length); 147 newHandlers[characterHandlers.length] = characterHandler; 148 characterHandlers = newHandlers; 149 } 150 151 /** 152 * Query if checking normalization. 153 * 154 * @return <code>true</code> if checking on 155 */ 156 public boolean isCheckingNormalization() { 157 for (int i = 0; i < characterHandlers.length; i++) { 158 CharacterHandler ch = characterHandlers[i]; 159 if (ch instanceof NormalizationChecker) { 160 return true; 161 } 162 } 163 return false; 164 } 165 166 /** 167 * Runs the tokenization. This is the main entry point. 168 * 169 * @param is 170 * the input source 171 * @throws SAXException 172 * on fatal error (if configured to treat XML violations as 173 * fatal) or if the token handler threw 174 * @throws IOException 175 * if the stream threw 176 */ 177 public void tokenize(InputSource is) throws SAXException, IOException { 178 if (is == null) { 179 throw new IllegalArgumentException("InputSource was null."); 180 } 181 tokenizer.start(); 182 confidence = Confidence.TENTATIVE; 183 swallowBom = true; 184 rewindableInputStream = null; 185 tokenizer.initLocation(is.getPublicId(), is.getSystemId()); 186 this.reader = is.getCharacterStream(); 187 this.characterEncoding = encodingFromExternalDeclaration(is.getEncoding()); 188 if (this.reader == null) { 189 InputStream inputStream = is.getByteStream(); 190 if (inputStream == null) { 191 throw new SAXException("Both streams in InputSource were null."); 192 } 193 if (this.characterEncoding == null) { 194 if (allowRewinding) { 195 inputStream = rewindableInputStream = new RewindableInputStream( 196 inputStream); 197 } 198 this.reader = new HtmlInputStreamReader(inputStream, 199 tokenizer.getErrorHandler(), tokenizer, this, heuristics); 200 } else { 201 becomeConfident(); 202 this.reader = new HtmlInputStreamReader(inputStream, 203 tokenizer.getErrorHandler(), tokenizer, this, this.characterEncoding); 204 } 205 } else { 206 becomeConfident(); 207 } 208 Throwable t = null; 209 try { 210 for (;;) { 211 try { 212 for (int i = 0; i < characterHandlers.length; i++) { 213 CharacterHandler ch = characterHandlers[i]; 214 ch.start(); 215 } 216 runStates(); 217 if (confidence == Confidence.TENTATIVE 218 && !tokenizer.isAlreadyComplainedAboutNonAscii()) { 219 warnWithoutLocation("The character encoding of the document was not declared."); 220 } 221 break; 222 } catch (ReparseException e) { 223 if (rewindableInputStream == null) { 224 tokenizer.fatal("Changing encoding at this point would need non-streamable behavior."); 225 } else { 226 rewindableInputStream.rewind(); 227 becomeConfident(); 228 this.reader = new HtmlInputStreamReader( 229 rewindableInputStream, tokenizer.getErrorHandler(), tokenizer, 230 this, this.characterEncoding); 231 } 232 continue; 233 } 234 } 235 } catch (Throwable tr) { 236 t = tr; 237 } finally { 238 try { 239 tokenizer.end(); 240 characterEncoding = null; 241 for (int i = 0; i < characterHandlers.length; i++) { 242 CharacterHandler ch = characterHandlers[i]; 243 ch.end(); 244 } 245 reader.close(); 246 reader = null; 247 rewindableInputStream = null; 248 } catch (Throwable tr) { 249 if (t == null) { 250 t = tr; 251 } // else drop the later throwable 252 } 253 if (t != null) { 254 if (t instanceof IOException) { 255 throw (IOException) t; 256 } else if (t instanceof SAXException) { 257 throw (SAXException) t; 258 } else if (t instanceof RuntimeException) { 259 throw (RuntimeException) t; 260 } else if (t instanceof Error) { 261 throw (Error) t; 262 } else { 263 // impossible 264 throw new RuntimeException(t); 265 } 266 } 267 } 268 } 269 270 void dontSwallowBom() { 271 swallowBom = false; 272 } 273 274 private void runStates() throws SAXException, IOException { 275 char[] buffer = new char[2048]; 276 UTF16Buffer bufr = new UTF16Buffer(buffer, 0, 0); 277 boolean lastWasCR = false; 278 int len = -1; 279 if ((len = reader.read(buffer)) != -1) { 280 assert len > 0; 281 int streamOffset = 0; 282 int offset = 0; 283 int length = len; 284 if (swallowBom) { 285 if (buffer[0] == '\uFEFF') { 286 streamOffset = -1; 287 offset = 1; 288 length--; 289 } 290 } 291 if (length > 0) { 292 for (int i = 0; i < characterHandlers.length; i++) { 293 CharacterHandler ch = characterHandlers[i]; 294 ch.characters(buffer, offset, length); 295 } 296 tokenizer.setTransitionBaseOffset(streamOffset); 297 bufr.setStart(offset); 298 bufr.setEnd(offset + length); 299 while (bufr.hasMore()) { 300 bufr.adjust(lastWasCR); 301 lastWasCR = false; 302 if (bufr.hasMore()) { 303 lastWasCR = tokenizer.tokenizeBuffer(bufr); 304 } 305 } 306 } 307 streamOffset = length; 308 while ((len = reader.read(buffer)) != -1) { 309 assert len > 0; 310 for (int i = 0; i < characterHandlers.length; i++) { 311 CharacterHandler ch = characterHandlers[i]; 312 ch.characters(buffer, 0, len); 313 } 314 tokenizer.setTransitionBaseOffset(streamOffset); 315 bufr.setStart(0); 316 bufr.setEnd(len); 317 while (bufr.hasMore()) { 318 bufr.adjust(lastWasCR); 319 lastWasCR = false; 320 if (bufr.hasMore()) { 321 lastWasCR = tokenizer.tokenizeBuffer(bufr); 322 } 323 } 324 streamOffset += len; 325 } 326 } 327 tokenizer.eof(); 328 } 329 330 public void setEncoding(Encoding encoding, Confidence confidence) { 331 this.characterEncoding = encoding; 332 if (confidence == Confidence.CERTAIN) { 333 becomeConfident(); 334 } 335 } 336 337 public boolean internalEncodingDeclaration(String internalCharset) 338 throws SAXException { 339 try { 340 internalCharset = Encoding.toAsciiLowerCase(internalCharset); 341 Encoding cs; 342 if ("utf-16".equals(internalCharset) 343 || "utf-16be".equals(internalCharset) 344 || "utf-16le".equals(internalCharset)) { 345 tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C" 346 + internalCharset 347 + "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201Cutf-8\u201D."); 348 cs = Encoding.UTF8; 349 internalCharset = "utf-8"; 350 } else { 351 cs = Encoding.forName(internalCharset); 352 } 353 Encoding actual = cs.getActualHtmlEncoding(); 354 if (actual == null) { 355 actual = cs; 356 } 357 if (!actual.isAsciiSuperset()) { 358 tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C" 359 + internalCharset 360 + "\u201D which is not an ASCII superset. Not changing the encoding."); 361 return false; 362 } 363 if (characterEncoding == null) { 364 // Reader case 365 return true; 366 } 367 if (characterEncoding == actual) { 368 becomeConfident(); 369 return true; 370 } 371 if (confidence == Confidence.CERTAIN && actual != characterEncoding) { 372 tokenizer.errTreeBuilder("Internal encoding declaration \u201C" 373 + internalCharset 374 + "\u201D disagrees with the actual encoding of the document (\u201C" 375 + characterEncoding.getCanonName() + "\u201D)."); 376 } else { 377 Encoding newEnc = whineAboutEncodingAndReturnActual( 378 internalCharset, cs); 379 tokenizer.errTreeBuilder("Changing character encoding \u201C" 380 + internalCharset + "\u201D and reparsing."); 381 characterEncoding = newEnc; 382 throw new ReparseException(); 383 } 384 return true; 385 } catch (UnsupportedCharsetException e) { 386 tokenizer.errTreeBuilder("Internal encoding declaration named an unsupported chararacter encoding \u201C" 387 + internalCharset + "\u201D."); 388 return false; 389 } 390 } 391 392 /** 393 * 394 */ 395 private void becomeConfident() { 396 if (rewindableInputStream != null) { 397 rewindableInputStream.willNotRewind(); 398 } 399 confidence = Confidence.CERTAIN; 400 tokenizer.becomeConfident(); 401 } 402 403 /** 404 * Sets the encoding sniffing heuristics. 405 * 406 * @param heuristics 407 * the heuristics to set 408 */ 409 public void setHeuristics(Heuristics heuristics) { 410 this.heuristics = heuristics; 411 } 412 413 /** 414 * Reports a warning without line/col 415 * 416 * @param message 417 * the message 418 * @throws SAXException 419 */ 420 protected void warnWithoutLocation(String message) throws SAXException { 421 ErrorHandler errorHandler = tokenizer.getErrorHandler(); 422 if (errorHandler == null) { 423 return; 424 } 425 SAXParseException spe = new SAXParseException(message, null, 426 tokenizer.getSystemId(), -1, -1); 427 errorHandler.warning(spe); 428 } 429 430 /** 431 * Initializes a decoder from external decl. 432 */ 433 protected Encoding encodingFromExternalDeclaration(String encoding) 434 throws SAXException { 435 if (encoding == null) { 436 return null; 437 } 438 encoding = Encoding.toAsciiLowerCase(encoding); 439 try { 440 Encoding cs = Encoding.forName(encoding); 441 if ("utf-16".equals(cs.getCanonName()) 442 || "utf-32".equals(cs.getCanonName())) { 443 swallowBom = false; 444 } 445 return whineAboutEncodingAndReturnActual(encoding, cs); 446 } catch (UnsupportedCharsetException e) { 447 tokenizer.err("Unsupported character encoding name: \u201C" + encoding 448 + "\u201D. Will sniff."); 449 swallowBom = true; 450 } 451 return null; // keep the compiler happy 452 } 453 454 /** 455 * @param encoding 456 * @param cs 457 * @return 458 * @throws SAXException 459 */ 460 protected Encoding whineAboutEncodingAndReturnActual(String encoding, 461 Encoding cs) throws SAXException { 462 String canonName = cs.getCanonName(); 463 if (!cs.isRegistered()) { 464 if (encoding.startsWith("x-")) { 465 tokenizer.err("The encoding \u201C" 466 + encoding 467 + "\u201D is not an IANA-registered encoding. (Charmod C022)"); 468 } else { 469 tokenizer.err("The encoding \u201C" 470 + encoding 471 + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)"); 472 } 473 } else if (!canonName.equals(encoding)) { 474 tokenizer.err("The encoding \u201C" 475 + encoding 476 + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C" 477 + canonName + "\u201D. (Charmod C024)"); 478 } 479 if (cs.isShouldNot()) { 480 tokenizer.warn("Authors should not use the character encoding \u201C" 481 + encoding 482 + "\u201D. It is recommended to use \u201CUTF-8\u201D."); 483 } else if (cs.isLikelyEbcdic()) { 484 tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D."); 485 } else if (cs.isObscure()) { 486 tokenizer.warn("The character encoding \u201C" 487 + encoding 488 + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); 489 } 490 Encoding actual = cs.getActualHtmlEncoding(); 491 if (actual == null) { 492 return cs; 493 } else { 494 tokenizer.warn("Using \u201C" + actual.getCanonName() 495 + "\u201D instead of the declared encoding \u201C" 496 + encoding + "\u201D."); 497 return actual; 498 } 499 } 500 501 private class ReparseException extends SAXException { 502 503 } 504 505 void notifyAboutMetaBoundary() { 506 tokenizer.notifyAboutMetaBoundary(); 507 } 508 509 /** 510 * @param commentPolicy 511 * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy) 512 */ 513 public void setCommentPolicy(XmlViolationPolicy commentPolicy) { 514 tokenizer.setCommentPolicy(commentPolicy); 515 } 516 517 /** 518 * @param contentNonXmlCharPolicy 519 * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy) 520 */ 521 public void setContentNonXmlCharPolicy( 522 XmlViolationPolicy contentNonXmlCharPolicy) { 523 tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy); 524 } 525 526 /** 527 * @param contentSpacePolicy 528 * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy) 529 */ 530 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) { 531 tokenizer.setContentSpacePolicy(contentSpacePolicy); 532 } 533 534 /** 535 * @param eh 536 * @see nu.validator.htmlparser.impl.Tokenizer#setErrorHandler(org.xml.sax.ErrorHandler) 537 */ 538 public void setErrorHandler(ErrorHandler eh) { 539 tokenizer.setErrorHandler(eh); 540 for (int i = 0; i < characterHandlers.length; i++) { 541 CharacterHandler ch = characterHandlers[i]; 542 if (ch instanceof NormalizationChecker) { 543 NormalizationChecker nc = (NormalizationChecker) ch; 544 nc.setErrorHandler(eh); 545 } 546 } 547 } 548 549 public void setTransitionHandler(TransitionHandler transitionHandler) { 550 if (tokenizer instanceof ErrorReportingTokenizer) { 551 ErrorReportingTokenizer ert = (ErrorReportingTokenizer) tokenizer; 552 ert.setTransitionHandler(transitionHandler); 553 } else if (transitionHandler != null) { 554 throw new IllegalStateException("Attempt to set a transition handler on a plain tokenizer."); 555 } 556 } 557 558 /** 559 * @param html4ModeCompatibleWithXhtml1Schemata 560 * @see nu.validator.htmlparser.impl.Tokenizer#setHtml4ModeCompatibleWithXhtml1Schemata(boolean) 561 */ 562 public void setHtml4ModeCompatibleWithXhtml1Schemata( 563 boolean html4ModeCompatibleWithXhtml1Schemata) { 564 tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata); 565 } 566 567 /** 568 * @param mappingLangToXmlLang 569 * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean) 570 */ 571 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) { 572 tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang); 573 } 574 575 /** 576 * @param namePolicy 577 * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy) 578 */ 579 public void setNamePolicy(XmlViolationPolicy namePolicy) { 580 tokenizer.setNamePolicy(namePolicy); 581 } 582 583 /** 584 * @param xmlnsPolicy 585 * @see nu.validator.htmlparser.impl.Tokenizer#setXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy) 586 */ 587 public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) { 588 tokenizer.setXmlnsPolicy(xmlnsPolicy); 589 } 590 591 public String getCharacterEncoding() throws SAXException { 592 return characterEncoding.getCanonName(); 593 } 594 595 public Locator getDocumentLocator() { 596 return tokenizer; 597 } 598 }