001 /* 002 * Copyright (c) 2007 Henri Sivonen 003 * Copyright (c) 2007 Mozilla Foundation 004 * 005 * Permission is hereby granted, free of charge, to any person obtaining a 006 * copy of this software and associated documentation files (the "Software"), 007 * to deal in the Software without restriction, including without limitation 008 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 009 * and/or sell copies of the Software, and to permit persons to whom the 010 * Software is furnished to do so, subject to the following conditions: 011 * 012 * The above copyright notice and this permission notice shall be included in 013 * all copies or substantial portions of the Software. 014 * 015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 021 * DEALINGS IN THE SOFTWARE. 022 */ 023 024 package nu.validator.htmlparser.xom; 025 026 import java.io.File; 027 import java.io.FileInputStream; 028 import java.io.IOException; 029 import java.io.InputStream; 030 import java.io.Reader; 031 import java.io.StringReader; 032 import java.net.MalformedURLException; 033 import java.net.URL; 034 035 import nu.validator.htmlparser.common.DoctypeExpectation; 036 import nu.validator.htmlparser.common.DocumentModeHandler; 037 import nu.validator.htmlparser.common.XmlViolationPolicy; 038 import nu.validator.htmlparser.impl.Tokenizer; 039 import nu.xom.Builder; 040 import nu.xom.Document; 041 import nu.xom.Nodes; 042 import nu.xom.ParsingException; 043 import nu.xom.ValidityException; 044 045 import org.xml.sax.EntityResolver; 046 import org.xml.sax.ErrorHandler; 047 import org.xml.sax.InputSource; 048 import org.xml.sax.SAXException; 049 import org.xml.sax.SAXParseException; 050 051 /** 052 * This class implements an HTML5 parser that exposes data through the XOM 053 * interface. 054 * 055 * <p>By default, when using the constructor without arguments, the 056 * this parser treats XML 1.0-incompatible infosets as fatal errors. 057 * This corresponds to 058 * <code>FATAL</code> as the general XML violation policy. Handling 059 * all input without fatal errors and without 060 * violating the XOM API contract is possible by setting 061 * the general XML violation policy to <code>ALTER_INFOSET</code>. <em>This 062 * makes the parser non-conforming</em> but is probably the most useful 063 * setting for most applications. 064 * 065 * <p>The doctype is not represented in the tree. 066 * 067 * <p>The document mode is represented via the <code>Mode</code> 068 * interface on the <code>Document</code> node if the node implements 069 * that interface (depends on the used node factory). 070 * 071 * <p>The form pointer is stored if the node factory supports storing it. 072 * 073 * <p>This package has its own node factory class because the official 074 * XOM node factory may return multiple nodes instead of one confusing 075 * the assumptions of the DOM-oriented HTML5 parsing algorithm. 076 * 077 * @version $Id: HtmlBuilder.java 153 2007-09-11 07:41:33Z hsivonen $ 078 * @author hsivonen 079 */ 080 public class HtmlBuilder extends Builder { 081 082 private final Tokenizer tokenizer; 083 084 private final XOMTreeBuilder xomTreeBuilder; 085 086 private final SimpleNodeFactory simpleNodeFactory; 087 088 private EntityResolver entityResolver; 089 090 /** 091 * Constructor with default node factory and fatal XML violation policy. 092 */ 093 public HtmlBuilder() { 094 this(new SimpleNodeFactory(), XmlViolationPolicy.FATAL); 095 } 096 097 /** 098 * Constructor with given node factory and fatal XML violation policy. 099 * @param nodeFactory the factory 100 */ 101 public HtmlBuilder(SimpleNodeFactory nodeFactory) { 102 this(nodeFactory, XmlViolationPolicy.FATAL); 103 } 104 105 /** 106 * Constructor with default node factory and given XML violation policy. 107 * @param xmlPolicy the policy 108 */ 109 public HtmlBuilder(XmlViolationPolicy xmlPolicy) { 110 this(new SimpleNodeFactory(), xmlPolicy); 111 } 112 113 /** 114 * Constructor with given node factory and given XML violation policy. 115 * @param nodeFactory the factory 116 * @param xmlPolicy the policy 117 */ 118 public HtmlBuilder(SimpleNodeFactory nodeFactory, XmlViolationPolicy xmlPolicy) { 119 super(); 120 this.simpleNodeFactory = nodeFactory; 121 this.xomTreeBuilder = new XOMTreeBuilder(nodeFactory); 122 this.tokenizer = new Tokenizer(xomTreeBuilder); 123 this.tokenizer.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET); 124 setXmlPolicy(xmlPolicy); 125 } 126 127 private void tokenize(InputSource is) throws ParsingException, IOException, 128 MalformedURLException { 129 try { 130 if (is == null) { 131 throw new IllegalArgumentException("Null input."); 132 } 133 if (is.getByteStream() == null && is.getCharacterStream() == null) { 134 String systemId = is.getSystemId(); 135 if (systemId == null) { 136 throw new IllegalArgumentException( 137 "No byte stream, no character stream nor URI."); 138 } 139 if (entityResolver != null) { 140 is = entityResolver.resolveEntity(is.getPublicId(), 141 systemId); 142 } 143 if (is.getByteStream() == null 144 || is.getCharacterStream() == null) { 145 is = new InputSource(); 146 is.setSystemId(systemId); 147 is.setByteStream(new URL(systemId).openStream()); 148 } 149 } 150 tokenizer.tokenize(is); 151 } catch (SAXParseException e) { 152 throw new ParsingException(e.getMessage(), e.getSystemId(), e.getLineNumber(), 153 e.getColumnNumber(), e); 154 } catch (SAXException e) { 155 throw new ParsingException(e.getMessage(), e); 156 } 157 } 158 159 /** 160 * Parse from SAX <code>InputSource</code>. 161 * @param is the <code>InputSource</code> 162 * @return the document 163 * @throws ParsingException in case of an XML violation 164 * @throws IOException if IO goes wrang 165 */ 166 public Document build(InputSource is) throws ParsingException, IOException { 167 xomTreeBuilder.setFragmentContext(null); 168 tokenize(is); 169 return xomTreeBuilder.getDocument(); 170 } 171 172 /** 173 * Parse a fragment from SAX <code>InputSource</code>. 174 * @param is the <code>InputSource</code> 175 * @param context the name of the context element 176 * @return the fragment 177 * @throws ParsingException in case of an XML violation 178 * @throws IOException if IO goes wrang 179 */ 180 public Nodes buildFragment(InputSource is, String context) 181 throws IOException, ParsingException { 182 xomTreeBuilder.setFragmentContext(context); 183 tokenize(is); 184 return xomTreeBuilder.getDocumentFragment(); 185 } 186 187 188 /** 189 * Parse from <code>File</code>. 190 * @param file the file 191 * @return the document 192 * @throws ParsingException in case of an XML violation 193 * @throws IOException if IO goes wrang 194 * @see nu.xom.Builder#build(java.io.File) 195 */ 196 @Override 197 public Document build(File file) throws ParsingException, 198 ValidityException, IOException { 199 return build(new FileInputStream(file), file.toURI().toASCIIString()); 200 } 201 202 /** 203 * Parse from <code>InputStream</code>. 204 * @param stream the stream 205 * @param uri the base URI 206 * @return the document 207 * @throws ParsingException in case of an XML violation 208 * @throws IOException if IO goes wrang 209 * @see nu.xom.Builder#build(java.io.InputStream, java.lang.String) 210 */ 211 @Override 212 public Document build(InputStream stream, String uri) 213 throws ParsingException, ValidityException, IOException { 214 InputSource is = new InputSource(stream); 215 is.setSystemId(uri); 216 return build(is); 217 } 218 219 /** 220 * Parse from <code>InputStream</code>. 221 * @param stream the stream 222 * @return the document 223 * @throws ParsingException in case of an XML violation 224 * @throws IOException if IO goes wrang 225 * @see nu.xom.Builder#build(java.io.InputStream) 226 */ 227 @Override 228 public Document build(InputStream stream) throws ParsingException, 229 ValidityException, IOException { 230 return build(new InputSource(stream)); 231 } 232 233 /** 234 * Parse from <code>Reader</code>. 235 * @param stream the reader 236 * @param uri the base URI 237 * @return the document 238 * @throws ParsingException in case of an XML violation 239 * @throws IOException if IO goes wrang 240 * @see nu.xom.Builder#build(java.io.Reader, java.lang.String) 241 */ 242 @Override 243 public Document build(Reader stream, String uri) throws ParsingException, 244 ValidityException, IOException { 245 InputSource is = new InputSource(stream); 246 is.setSystemId(uri); 247 return build(is); 248 } 249 250 /** 251 * Parse from <code>Reader</code>. 252 * @param stream the reader 253 * @return the document 254 * @throws ParsingException in case of an XML violation 255 * @throws IOException if IO goes wrang 256 * @see nu.xom.Builder#build(java.io.Reader) 257 */ 258 @Override 259 public Document build(Reader stream) throws ParsingException, 260 ValidityException, IOException { 261 return build(new InputSource(stream)); 262 } 263 264 /** 265 * Parse from <code>String</code>. 266 * @param content the HTML source as string 267 * @param uri the base URI 268 * @return the document 269 * @throws ParsingException in case of an XML violation 270 * @throws IOException if IO goes wrang 271 * @see nu.xom.Builder#build(java.lang.String, java.lang.String) 272 */ 273 @Override 274 public Document build(String content, String uri) throws ParsingException, 275 ValidityException, IOException { 276 return build(new StringReader(content), uri); 277 } 278 279 /** 280 * Parse from URI. 281 * @param uri the URI of the document 282 * @return the document 283 * @throws ParsingException in case of an XML violation 284 * @throws IOException if IO goes wrang 285 * @see nu.xom.Builder#build(java.lang.String) 286 */ 287 @Override 288 public Document build(String uri) throws ParsingException, 289 ValidityException, IOException { 290 return build(new InputSource(uri)); 291 } 292 293 /** 294 * Gets the node factory 295 */ 296 public SimpleNodeFactory getSimpleNodeFactory() { 297 return simpleNodeFactory; 298 } 299 300 /** 301 * Sets the entity resolver for URI-only inputs. 302 * @param resolver the resolver 303 * @see javax.xml.parsers.DocumentBuilder#setEntityResolver(org.xml.sax.EntityResolver) 304 */ 305 public void setEntityResolver(EntityResolver resolver) { 306 this.entityResolver = resolver; 307 } 308 309 /** 310 * @see javax.xml.parsers.DocumentBuilder#setErrorHandler(org.xml.sax.ErrorHandler) 311 */ 312 public void setErrorHandler(ErrorHandler errorHandler) { 313 xomTreeBuilder.setErrorHandler(errorHandler); 314 tokenizer.setErrorHandler(errorHandler); 315 } 316 317 /** 318 * Sets whether comment nodes appear in the tree. 319 * @param ignoreComments <code>true</code> to ignore comments 320 * @see nu.validator.htmlparser.impl.TreeBuilder#setIgnoringComments(boolean) 321 */ 322 public void setIgnoringComments(boolean ignoreComments) { 323 xomTreeBuilder.setIgnoringComments(ignoreComments); 324 } 325 326 /** 327 * Sets whether the parser considers scripting to be enabled for noscript treatment. 328 * @param scriptingEnabled <code>true</code> to enable 329 * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean) 330 */ 331 public void setScriptingEnabled(boolean scriptingEnabled) { 332 xomTreeBuilder.setScriptingEnabled(scriptingEnabled); 333 } 334 335 /** 336 * Toggles the checking of the NFC normalization of source. 337 * @param enable <code>true</code> to check normalization 338 * @see nu.validator.htmlparser.impl.Tokenizer#setCheckingNormalization(boolean) 339 */ 340 public void setCheckingNormalization(boolean enable) { 341 tokenizer.setCheckingNormalization(enable); 342 } 343 344 /** 345 * Sets the policy for consecutive hyphens in comments. 346 * @param commentPolicy the policy 347 * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy) 348 */ 349 public void setCommentPolicy(XmlViolationPolicy commentPolicy) { 350 if (commentPolicy == XmlViolationPolicy.ALLOW) { 351 throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW."); 352 } 353 tokenizer.setCommentPolicy(commentPolicy); 354 } 355 356 /** 357 * Sets the policy for non-XML characters except white space. 358 * @param contentNonXmlCharPolicy the policy 359 * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy) 360 */ 361 public void setContentNonXmlCharPolicy( 362 XmlViolationPolicy contentNonXmlCharPolicy) { 363 if (contentNonXmlCharPolicy == XmlViolationPolicy.ALLOW) { 364 throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW."); 365 } 366 tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy); 367 } 368 369 /** 370 * Sets the policy for non-XML white space. 371 * @param contentSpacePolicy the policy 372 * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy) 373 */ 374 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) { 375 if (contentSpacePolicy == XmlViolationPolicy.ALLOW) { 376 throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW."); 377 } 378 tokenizer.setContentSpacePolicy(contentSpacePolicy); 379 } 380 381 382 /** 383 * Whether the HTML 4 mode reports boolean attributes in a way that repeats 384 * the name in the value. 385 * @param html4ModeCompatibleWithXhtml1Schemata 386 */ 387 public void setHtml4ModeCompatibleWithXhtml1Schemata( 388 boolean html4ModeCompatibleWithXhtml1Schemata) { 389 tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata); 390 } 391 392 /** 393 * @param mappingLangToXmlLang 394 * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean) 395 */ 396 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) { 397 tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang); 398 } 399 400 /** 401 * @param namePolicy 402 * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy) 403 */ 404 public void setNamePolicy(XmlViolationPolicy namePolicy) { 405 if (namePolicy == XmlViolationPolicy.ALLOW) { 406 throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW."); 407 } 408 tokenizer.setNamePolicy(namePolicy); 409 } 410 411 /** 412 * This is a catch-all convenience method for setting name, content space, 413 * content non-XML char and comment policies in one go. 414 * 415 * @param xmlPolicy 416 */ 417 public void setXmlPolicy(XmlViolationPolicy xmlPolicy) { 418 setNamePolicy(xmlPolicy); 419 setContentSpacePolicy(xmlPolicy); 420 setContentNonXmlCharPolicy(xmlPolicy); 421 setCommentPolicy(xmlPolicy); 422 setBogusXmlnsPolicy(xmlPolicy); 423 } 424 425 /** 426 * Sets the doctype expectation. 427 * 428 * @param doctypeExpectation 429 * the doctypeExpectation to set 430 * @see nu.validator.htmlparser.impl.TreeBuilder#setDoctypeExpectation(nu.validator.htmlparser.common.DoctypeExpectation) 431 */ 432 public void setDoctypeExpectation(DoctypeExpectation doctypeExpectation) { 433 xomTreeBuilder.setDoctypeExpectation(doctypeExpectation); 434 } 435 436 /** 437 * Sets the document mode handler. 438 * 439 * @param documentModeHandler 440 * @see nu.validator.htmlparser.impl.TreeBuilder#setDocumentModeHandler(nu.validator.htmlparser.common.DocumentModeHandler) 441 */ 442 public void setDocumentModeHandler(DocumentModeHandler documentModeHandler) { 443 xomTreeBuilder.setDocumentModeHandler(documentModeHandler); 444 } 445 446 /** 447 * Sets the policy for forbidden <code>xmlns</code> attributes. 448 * @param bogusXmlnsPolicy the policy 449 * @see nu.validator.htmlparser.impl.Tokenizer#setBogusXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy) 450 */ 451 public void setBogusXmlnsPolicy(XmlViolationPolicy bogusXmlnsPolicy) { 452 tokenizer.setBogusXmlnsPolicy(bogusXmlnsPolicy); 453 } 454 }