001 /*
002 * Copyright (c) 2007 Henri Sivonen
003 * Copyright (c) 2007 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.htmlparser.dom;
025
026 import java.io.IOException;
027 import java.net.MalformedURLException;
028 import java.net.URL;
029
030 import javax.xml.parsers.DocumentBuilder;
031 import javax.xml.parsers.DocumentBuilderFactory;
032 import javax.xml.parsers.ParserConfigurationException;
033
034 import nu.validator.htmlparser.common.DoctypeExpectation;
035 import nu.validator.htmlparser.common.DocumentModeHandler;
036 import nu.validator.htmlparser.common.XmlViolationPolicy;
037 import nu.validator.htmlparser.impl.Tokenizer;
038
039 import org.w3c.dom.DOMImplementation;
040 import org.w3c.dom.Document;
041 import org.w3c.dom.DocumentFragment;
042 import org.xml.sax.EntityResolver;
043 import org.xml.sax.ErrorHandler;
044 import org.xml.sax.InputSource;
045 import org.xml.sax.SAXException;
046
047 /**
048 * This class implements an HTML5 parser that exposes data through the DOM
049 * interface.
050 *
051 * <p>By default, when using the constructor without arguments,
052 * this parser treats XML 1.0-incompatible infosets as fatal errors.
053 * This corresponds to
054 * <code>FATAL</code> as the general XML violation policy. To make the parser
055 * support non-conforming HTML fully per the HTML 5 spec while on the other
056 * hand potentially violating the DOM API contract, set the general XML
057 * violation policy to <code>ALLOW</code>. This does not work with a standard
058 * DOM implementation. Handling all input without fatal errors and without
059 * violating the DOM API contract is possible by setting
060 * the general XML violation policy to <code>ALTER_INFOSET</code>. <em>This
061 * makes the parser non-conforming</em> but is probably the most useful
062 * setting for most applications.
063 *
064 * <p>The doctype is not represented in the tree.
065 *
066 * <p>The document mode is represented as user data <code>DocumentMode</code>
067 * object with the key <code>nu.validator.document-mode</code> on the document
068 * node.
069 *
070 * <p>The form pointer is also stored as user data with the key
071 * <code>nu.validator.form-pointer</code>.
072 *
073 * @version $Id: HtmlDocumentBuilder.java 153 2007-09-11 07:41:33Z hsivonen $
074 * @author hsivonen
075 */
076 public class HtmlDocumentBuilder extends DocumentBuilder {
077
078 /**
079 * @return the JAXP DOM implementation
080 */
081 private static DOMImplementation jaxpDOMImplementation() {
082 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
083 factory.setNamespaceAware(true);
084 DocumentBuilder builder;
085 try {
086 builder = factory.newDocumentBuilder();
087 } catch (ParserConfigurationException e) {
088 throw new RuntimeException(e);
089 }
090 return builder.getDOMImplementation();
091 }
092
093 private final Tokenizer tokenizer;
094
095 private final DOMTreeBuilder domTreeBuilder;
096
097 private final DOMImplementation implementation;
098
099 private EntityResolver entityResolver;
100
101 /**
102 * Instantiates the document builder with a specific DOM
103 * implementation and XML violation policy.
104 *
105 * @param implementation
106 * the DOM implementation
107 * @param xmlPolicy the policy
108 */
109 public HtmlDocumentBuilder(DOMImplementation implementation,
110 XmlViolationPolicy xmlPolicy) {
111 this.implementation = implementation;
112 this.domTreeBuilder = new DOMTreeBuilder(implementation);
113 this.tokenizer = new Tokenizer(domTreeBuilder);
114 this.tokenizer.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
115 setXmlPolicy(xmlPolicy);
116 }
117
118 /**
119 * Instantiates the document builder with a specific DOM implementation
120 * and fatal XML violation policy.
121 *
122 * @param implementation
123 * the DOM implementation
124 */
125 public HtmlDocumentBuilder(DOMImplementation implementation) {
126 this(implementation, XmlViolationPolicy.FATAL);
127 }
128
129 /**
130 * Instantiates the document builder with the JAXP DOM implementation
131 * and fatal XML violation policy.
132 */
133 public HtmlDocumentBuilder() {
134 this(XmlViolationPolicy.FATAL);
135 }
136
137 /**
138 * Instantiates the document builder with the JAXP DOM implementation
139 * and a specific XML violation policy.
140 * @param xmlPolicy the policy
141 */
142 public HtmlDocumentBuilder(XmlViolationPolicy xmlPolicy) {
143 this(jaxpDOMImplementation(), xmlPolicy);
144 }
145
146 /**
147 * Returns the DOM implementation
148 * @return the DOM implementation
149 * @see javax.xml.parsers.DocumentBuilder#getDOMImplementation()
150 */
151 @Override
152 public DOMImplementation getDOMImplementation() {
153 return implementation;
154 }
155
156 /**
157 * Returns <code>true</code>.
158 * @return <code>true</code>
159 * @see javax.xml.parsers.DocumentBuilder#isNamespaceAware()
160 */
161 @Override
162 public boolean isNamespaceAware() {
163 return true;
164 }
165
166 /**
167 * Returns <code>false</code>
168 * @return <code>false</code>
169 * @see javax.xml.parsers.DocumentBuilder#isValidating()
170 */
171 @Override
172 public boolean isValidating() {
173 return false;
174 }
175
176 /**
177 * For API compatibility.
178 * @see javax.xml.parsers.DocumentBuilder#newDocument()
179 */
180 @Override
181 public Document newDocument() {
182 return implementation.createDocument(null, null, null);
183 }
184
185 /**
186 * Parses a document from a SAX <code>InputSource</code>.
187 * @param is the source
188 * @return the doc
189 * @see javax.xml.parsers.DocumentBuilder#parse(org.xml.sax.InputSource)
190 */
191 @Override
192 public Document parse(InputSource is) throws SAXException, IOException {
193 domTreeBuilder.setFragmentContext(null);
194 tokenize(is);
195 return domTreeBuilder.getDocument();
196 }
197
198 /**
199 * Parses a document fragment from a SAX <code>InputSource</code>.
200 * @param is the source
201 * @param context the context element name
202 * @return the doc
203 * @throws IOException
204 * @throws SAXException
205 */
206 public DocumentFragment parseFragment(InputSource is, String context)
207 throws IOException, SAXException {
208 domTreeBuilder.setFragmentContext(context);
209 tokenize(is);
210 return domTreeBuilder.getDocumentFragment();
211 }
212
213 /**
214 * @param is
215 * @throws SAXException
216 * @throws IOException
217 * @throws MalformedURLException
218 */
219 private void tokenize(InputSource is) throws SAXException, IOException,
220 MalformedURLException {
221 if (is == null) {
222 throw new IllegalArgumentException("Null input.");
223 }
224 if (is.getByteStream() == null && is.getCharacterStream() == null) {
225 String systemId = is.getSystemId();
226 if (systemId == null) {
227 throw new IllegalArgumentException(
228 "No byte stream, no character stream nor URI.");
229 }
230 if (entityResolver != null) {
231 is = entityResolver.resolveEntity(is.getPublicId(), systemId);
232 }
233 if (is.getByteStream() == null || is.getCharacterStream() == null) {
234 is = new InputSource();
235 is.setSystemId(systemId);
236 is.setByteStream(new URL(systemId).openStream());
237 }
238 }
239 tokenizer.tokenize(is);
240 }
241
242 /**
243 * Sets the entity resolver for URI-only inputs.
244 * @param resolver the resolver
245 * @see javax.xml.parsers.DocumentBuilder#setEntityResolver(org.xml.sax.EntityResolver)
246 */
247 @Override
248 public void setEntityResolver(EntityResolver resolver) {
249 this.entityResolver = resolver;
250 }
251
252 /**
253 * @see javax.xml.parsers.DocumentBuilder#setErrorHandler(org.xml.sax.ErrorHandler)
254 */
255 @Override
256 public void setErrorHandler(ErrorHandler errorHandler) {
257 domTreeBuilder.setErrorHandler(errorHandler);
258 tokenizer.setErrorHandler(errorHandler);
259 }
260
261 /**
262 * Sets whether comment nodes appear in the tree.
263 * @param ignoreComments <code>true</code> to ignore comments
264 * @see nu.validator.htmlparser.impl.TreeBuilder#setIgnoringComments(boolean)
265 */
266 public void setIgnoringComments(boolean ignoreComments) {
267 domTreeBuilder.setIgnoringComments(ignoreComments);
268 }
269
270 /**
271 * Sets whether the parser considers scripting to be enabled for noscript treatment.
272 * @param scriptingEnabled <code>true</code> to enable
273 * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
274 */
275 public void setScriptingEnabled(boolean scriptingEnabled) {
276 domTreeBuilder.setScriptingEnabled(scriptingEnabled);
277 }
278
279 /**
280 * Toggles the checking of the NFC normalization of source.
281 * @param enable <code>true</code> to check normalization
282 * @see nu.validator.htmlparser.impl.Tokenizer#setCheckingNormalization(boolean)
283 */
284 public void setCheckingNormalization(boolean enable) {
285 tokenizer.setCheckingNormalization(enable);
286 }
287
288 /**
289 * Sets the policy for consecutive hyphens in comments.
290 * @param commentPolicy the policy
291 * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
292 */
293 public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
294 tokenizer.setCommentPolicy(commentPolicy);
295 }
296
297 /**
298 * Sets the policy for non-XML characters except white space.
299 * @param contentNonXmlCharPolicy the policy
300 * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
301 */
302 public void setContentNonXmlCharPolicy(
303 XmlViolationPolicy contentNonXmlCharPolicy) {
304 tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
305 }
306
307 /**
308 * Sets the policy for non-XML white space.
309 * @param contentSpacePolicy the policy
310 * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
311 */
312 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
313 tokenizer.setContentSpacePolicy(contentSpacePolicy);
314 }
315
316
317 /**
318 * Whether the HTML 4 mode reports boolean attributes in a way that repeats
319 * the name in the value.
320 * @param html4ModeCompatibleWithXhtml1Schemata
321 */
322 public void setHtml4ModeCompatibleWithXhtml1Schemata(
323 boolean html4ModeCompatibleWithXhtml1Schemata) {
324 tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
325 }
326
327 /**
328 * @param mappingLangToXmlLang
329 * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
330 */
331 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
332 tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
333 }
334
335 /**
336 * @param namePolicy
337 * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
338 */
339 public void setNamePolicy(XmlViolationPolicy namePolicy) {
340 tokenizer.setNamePolicy(namePolicy);
341 }
342
343 /**
344 * This is a catch-all convenience method for setting name, content space,
345 * content non-XML char and comment policies in one go.
346 *
347 * @param xmlPolicy
348 */
349 public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
350 setNamePolicy(xmlPolicy);
351 setContentSpacePolicy(xmlPolicy);
352 setContentNonXmlCharPolicy(xmlPolicy);
353 setCommentPolicy(xmlPolicy);
354 setBogusXmlnsPolicy(xmlPolicy);
355 }
356
357 /**
358 * Sets the doctype expectation.
359 *
360 * @param doctypeExpectation
361 * the doctypeExpectation to set
362 * @see nu.validator.htmlparser.impl.TreeBuilder#setDoctypeExpectation(nu.validator.htmlparser.common.DoctypeExpectation)
363 */
364 public void setDoctypeExpectation(DoctypeExpectation doctypeExpectation) {
365 domTreeBuilder.setDoctypeExpectation(doctypeExpectation);
366 }
367
368 /**
369 * Sets the document mode handler.
370 *
371 * @param documentModeHandler
372 * @see nu.validator.htmlparser.impl.TreeBuilder#setDocumentModeHandler(nu.validator.htmlparser.common.DocumentModeHandler)
373 */
374 public void setDocumentModeHandler(DocumentModeHandler documentModeHandler) {
375 domTreeBuilder.setDocumentModeHandler(documentModeHandler);
376 }
377
378 /**
379 * Sets the policy for forbidden <code>xmlns</code> attributes.
380 * @param bogusXmlnsPolicy the policy
381 * @see nu.validator.htmlparser.impl.Tokenizer#setBogusXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
382 */
383 public void setBogusXmlnsPolicy(XmlViolationPolicy bogusXmlnsPolicy) {
384 tokenizer.setBogusXmlnsPolicy(bogusXmlnsPolicy);
385 }
386
387 }