/*
 * Copyright (c) 2007 Henri Sivonen
 * Copyright (c) 2007 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.htmlparser.dom;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import nu.validator.htmlparser.common.DoctypeExpectation;
import nu.validator.htmlparser.common.DocumentModeHandler;
import nu.validator.htmlparser.common.XmlViolationPolicy;
import nu.validator.htmlparser.impl.Tokenizer;

import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * This class implements an HTML5 parser that exposes data through the DOM
 * interface.
 *
 * <p>By default, when using the constructor without arguments,
 * this parser treats XML 1.0-incompatible infosets as fatal errors.
 * This corresponds to
 * <code>FATAL</code> as the general XML violation policy. To make the parser
 * support non-conforming HTML fully per the HTML 5 spec while on the other
 * hand potentially violating the DOM API contract, set the general XML
 * violation policy to <code>ALLOW</code>. This does not work with a standard
 * DOM implementation. Handling all input without fatal errors and without
 * violating the DOM API contract is possible by setting
 * the general XML violation policy to <code>ALTER_INFOSET</code>. <em>This
 * makes the parser non-conforming</em> but is probably the most useful
 * setting for most applications.
 *
 * <p>The doctype is not represented in the tree.
 *
 * <p>The document mode is represented as user data <code>DocumentMode</code>
 * object with the key <code>nu.validator.document-mode</code> on the document
 * node.
 *
 * <p>The form pointer is also stored as user data with the key
 * <code>nu.validator.form-pointer</code>.
 *
 * @version $Id: HtmlDocumentBuilder.java 153 2007-09-11 07:41:33Z hsivonen $
 * @author hsivonen
 */
public class HtmlDocumentBuilder extends DocumentBuilder {

    /**
     * Obtains a DOM implementation from a freshly created, namespace-aware
     * JAXP <code>DocumentBuilder</code>.
     *
     * @return the JAXP DOM implementation
     * @throws RuntimeException wrapping the
     *         <code>ParserConfigurationException</code> if JAXP cannot
     *         create a document builder
     */
    private static DOMImplementation jaxpDOMImplementation() {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        DocumentBuilder builder;
        try {
            builder = factory.newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            throw new RuntimeException(e);
        }
        return builder.getDOMImplementation();
    }

    /** The tokenizer that drives parsing; it feeds {@link #domTreeBuilder}. */
    private final Tokenizer tokenizer;

    /** The tree builder that produces the DOM document or fragment. */
    private final DOMTreeBuilder domTreeBuilder;

    /** The DOM implementation handed to the constructor. */
    private final DOMImplementation implementation;

    /** Optional resolver consulted for inputs that carry only a system ID. */
    private EntityResolver entityResolver;

    /**
     * Instantiates the document builder with a specific DOM
     * implementation and XML violation policy.
     *
     * <p>The tokenizer's xmlns policy is always set to
     * <code>ALTER_INFOSET</code>; the remaining policies are set from
     * <code>xmlPolicy</code> via {@link #setXmlPolicy(XmlViolationPolicy)}.
     *
     * @param implementation
     *            the DOM implementation
     * @param xmlPolicy the policy
     */
    public HtmlDocumentBuilder(DOMImplementation implementation,
            XmlViolationPolicy xmlPolicy) {
        this.implementation = implementation;
        this.domTreeBuilder = new DOMTreeBuilder(implementation);
        this.tokenizer = new Tokenizer(domTreeBuilder);
        this.tokenizer.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
        setXmlPolicy(xmlPolicy);
    }

    /**
     * Instantiates the document builder with a specific DOM implementation
     * and fatal XML violation policy.
     *
     * @param implementation
     *            the DOM implementation
     */
    public HtmlDocumentBuilder(DOMImplementation implementation) {
        this(implementation, XmlViolationPolicy.FATAL);
    }

    /**
     * Instantiates the document builder with the JAXP DOM implementation
     * and fatal XML violation policy.
     */
    public HtmlDocumentBuilder() {
        this(XmlViolationPolicy.FATAL);
    }

    /**
     * Instantiates the document builder with the JAXP DOM implementation
     * and a specific XML violation policy.
     * @param xmlPolicy the policy
     */
    public HtmlDocumentBuilder(XmlViolationPolicy xmlPolicy) {
        this(jaxpDOMImplementation(), xmlPolicy);
    }

    /**
     * Returns the DOM implementation
     * @return the DOM implementation
     * @see javax.xml.parsers.DocumentBuilder#getDOMImplementation()
     */
    @Override
    public DOMImplementation getDOMImplementation() {
        return implementation;
    }

    /**
     * Returns <code>true</code>.
     * @return <code>true</code>
     * @see javax.xml.parsers.DocumentBuilder#isNamespaceAware()
     */
    @Override
    public boolean isNamespaceAware() {
        return true;
    }

    /**
     * Returns <code>false</code>
     * @return <code>false</code>
     * @see javax.xml.parsers.DocumentBuilder#isValidating()
     */
    @Override
    public boolean isValidating() {
        return false;
    }

    /**
     * For API compatibility. Creates a new document through the DOM
     * implementation with no namespace, no qualified name and no doctype.
     * @return the new document
     * @see javax.xml.parsers.DocumentBuilder#newDocument()
     */
    @Override
    public Document newDocument() {
        return implementation.createDocument(null, null, null);
    }

    /**
     * Parses a document from a SAX <code>InputSource</code>.
     * @param is the source
     * @return the doc
     * @throws IllegalArgumentException if <code>is</code> is
     *         <code>null</code> or has neither a stream nor a system ID
     * @see javax.xml.parsers.DocumentBuilder#parse(org.xml.sax.InputSource)
     */
    @Override
    public Document parse(InputSource is) throws SAXException, IOException {
        // A null fragment context selects whole-document parsing.
        domTreeBuilder.setFragmentContext(null);
        tokenize(is);
        return domTreeBuilder.getDocument();
    }

    /**
     * Parses a document fragment from a SAX <code>InputSource</code>.
     * @param is the source
     * @param context the context element name
     * @return the document fragment
     * @throws IOException
     * @throws SAXException
     * @throws IllegalArgumentException if <code>is</code> is
     *         <code>null</code> or has neither a stream nor a system ID
     */
    public DocumentFragment parseFragment(InputSource is, String context)
            throws IOException, SAXException {
        domTreeBuilder.setFragmentContext(context);
        tokenize(is);
        return domTreeBuilder.getDocumentFragment();
    }

    /**
     * Runs the tokenizer over the given input.
     *
     * <p>If the input has neither a byte stream nor a character stream, its
     * system ID is used: the entity resolver (if any) is asked first, and if
     * it declines (returns <code>null</code>) or supplies no stream, the
     * system ID is opened as a URL. A stream opened here is closed once
     * tokenization finishes; streams supplied by the caller or by the
     * resolver are left for their owner to manage.
     *
     * @param is the input source
     * @throws SAXException
     * @throws IOException
     * @throws MalformedURLException if the system ID has to be opened
     *         directly and is not a valid URL
     * @throws IllegalArgumentException if <code>is</code> is
     *         <code>null</code> or has neither a stream nor a system ID
     */
    private void tokenize(InputSource is) throws SAXException, IOException,
            MalformedURLException {
        if (is == null) {
            throw new IllegalArgumentException("Null input.");
        }
        InputStream openedHere = null;
        if (is.getByteStream() == null && is.getCharacterStream() == null) {
            String systemId = is.getSystemId();
            if (systemId == null) {
                throw new IllegalArgumentException(
                        "No byte stream, no character stream nor URI.");
            }
            InputSource resolved = null;
            if (entityResolver != null) {
                // Per the SAX contract the resolver may return null to
                // request default handling of the system ID.
                resolved = entityResolver.resolveEntity(is.getPublicId(),
                        systemId);
            }
            if (resolved != null
                    && (resolved.getByteStream() != null
                            || resolved.getCharacterStream() != null)) {
                // One usable stream is enough; do not require both.
                is = resolved;
            } else {
                openedHere = new URL(systemId).openStream();
                is = new InputSource();
                is.setSystemId(systemId);
                is.setByteStream(openedHere);
            }
        }
        try {
            tokenizer.tokenize(is);
        } finally {
            if (openedHere != null) {
                try {
                    openedHere.close();
                } catch (IOException ignored) {
                    // Read-only stream and parsing is already over; a close
                    // failure must not mask the outcome of tokenization.
                }
            }
        }
    }

    /**
     * Sets the entity resolver for URI-only inputs.
     * @param resolver the resolver
     * @see javax.xml.parsers.DocumentBuilder#setEntityResolver(org.xml.sax.EntityResolver)
     */
    @Override
    public void setEntityResolver(EntityResolver resolver) {
        this.entityResolver = resolver;
    }

    /**
     * Sets the error handler on both the tree builder and the tokenizer.
     * @param errorHandler the error handler
     * @see javax.xml.parsers.DocumentBuilder#setErrorHandler(org.xml.sax.ErrorHandler)
     */
    @Override
    public void setErrorHandler(ErrorHandler errorHandler) {
        domTreeBuilder.setErrorHandler(errorHandler);
        tokenizer.setErrorHandler(errorHandler);
    }

    /**
     * Sets whether comment nodes appear in the tree.
     * @param ignoreComments <code>true</code> to ignore comments
     * @see nu.validator.htmlparser.impl.TreeBuilder#setIgnoringComments(boolean)
     */
    public void setIgnoringComments(boolean ignoreComments) {
        domTreeBuilder.setIgnoringComments(ignoreComments);
    }

    /**
     * Sets whether the parser considers scripting to be enabled for noscript treatment.
     * @param scriptingEnabled <code>true</code> to enable
     * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
     */
    public void setScriptingEnabled(boolean scriptingEnabled) {
        domTreeBuilder.setScriptingEnabled(scriptingEnabled);
    }

    /**
     * Toggles the checking of the NFC normalization of source.
     * @param enable <code>true</code> to check normalization
     * @see nu.validator.htmlparser.impl.Tokenizer#setCheckingNormalization(boolean)
     */
    public void setCheckingNormalization(boolean enable) {
        tokenizer.setCheckingNormalization(enable);
    }

    /**
     * Sets the policy for consecutive hyphens in comments.
     * @param commentPolicy the policy
     * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
     */
    public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
        tokenizer.setCommentPolicy(commentPolicy);
    }

    /**
     * Sets the policy for non-XML characters except white space.
     * @param contentNonXmlCharPolicy the policy
     * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
     */
    public void setContentNonXmlCharPolicy(
            XmlViolationPolicy contentNonXmlCharPolicy) {
        tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
    }

    /**
     * Sets the policy for non-XML white space.
     * @param contentSpacePolicy the policy
     * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
     */
    public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
        tokenizer.setContentSpacePolicy(contentSpacePolicy);
    }

    /**
     * Whether the HTML 4 mode reports boolean attributes in a way that repeats
     * the name in the value.
     * @param html4ModeCompatibleWithXhtml1Schemata <code>true</code> to
     *            enable that reporting
     */
    public void setHtml4ModeCompatibleWithXhtml1Schemata(
            boolean html4ModeCompatibleWithXhtml1Schemata) {
        tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
    }

    /**
     * Forwards the <code>lang</code>-to-<code>xml:lang</code> mapping flag
     * to the tokenizer. (Presumably controls whether <code>lang</code>
     * attributes are reported as <code>xml:lang</code>; see the tokenizer
     * for the exact semantics.)
     * @param mappingLangToXmlLang the flag value
     * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
     */
    public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
        tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
    }

    /**
     * Forwards the name policy to the tokenizer. (Presumably governs
     * element and attribute names that are not valid XML names; see the
     * tokenizer for the exact semantics.)
     * @param namePolicy the policy
     * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
     */
    public void setNamePolicy(XmlViolationPolicy namePolicy) {
        tokenizer.setNamePolicy(namePolicy);
    }

    /**
     * This is a catch-all convenience method for setting name, content space,
     * content non-XML char, comment and bogus <code>xmlns</code> policies in
     * one go.
     *
     * @param xmlPolicy the policy to apply to all of them
     */
    public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
        setNamePolicy(xmlPolicy);
        setContentSpacePolicy(xmlPolicy);
        setContentNonXmlCharPolicy(xmlPolicy);
        setCommentPolicy(xmlPolicy);
        setBogusXmlnsPolicy(xmlPolicy);
    }

    /**
     * Sets the doctype expectation.
     *
     * @param doctypeExpectation
     *            the doctypeExpectation to set
     * @see nu.validator.htmlparser.impl.TreeBuilder#setDoctypeExpectation(nu.validator.htmlparser.common.DoctypeExpectation)
     */
    public void setDoctypeExpectation(DoctypeExpectation doctypeExpectation) {
        domTreeBuilder.setDoctypeExpectation(doctypeExpectation);
    }

    /**
     * Sets the document mode handler.
     *
     * @param documentModeHandler the handler
     * @see nu.validator.htmlparser.impl.TreeBuilder#setDocumentModeHandler(nu.validator.htmlparser.common.DocumentModeHandler)
     */
    public void setDocumentModeHandler(DocumentModeHandler documentModeHandler) {
        domTreeBuilder.setDocumentModeHandler(documentModeHandler);
    }

    /**
     * Sets the policy for forbidden <code>xmlns</code> attributes.
     * @param bogusXmlnsPolicy the policy
     * @see nu.validator.htmlparser.impl.Tokenizer#setBogusXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
     */
    public void setBogusXmlnsPolicy(XmlViolationPolicy bogusXmlnsPolicy) {
        tokenizer.setBogusXmlnsPolicy(bogusXmlnsPolicy);
    }

}