001 /*
002 * Copyright (c) 2007 Henri Sivonen
003 * Copyright (c) 2007-2008 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.htmlparser.xom;
025
026 import java.io.File;
027 import java.io.FileInputStream;
028 import java.io.IOException;
029 import java.io.InputStream;
030 import java.io.Reader;
031 import java.io.StringReader;
032 import java.net.MalformedURLException;
033 import java.net.URL;
034
035 import nu.validator.htmlparser.common.DoctypeExpectation;
036 import nu.validator.htmlparser.common.DocumentModeHandler;
037 import nu.validator.htmlparser.common.Heuristics;
038 import nu.validator.htmlparser.common.XmlViolationPolicy;
039 import nu.validator.htmlparser.io.Driver;
040 import nu.xom.Builder;
041 import nu.xom.Document;
042 import nu.xom.Nodes;
043 import nu.xom.ParsingException;
044 import nu.xom.ValidityException;
045
046 import org.xml.sax.EntityResolver;
047 import org.xml.sax.ErrorHandler;
048 import org.xml.sax.InputSource;
049 import org.xml.sax.SAXException;
050 import org.xml.sax.SAXParseException;
051
052 /**
053 * This class implements an HTML5 parser that exposes data through the XOM
054 * interface.
055 *
056 * <p>By default, when using the constructor without arguments, the
057 * this parser coerces XML 1.0-incompatible infosets into XML 1.0-compatible
058 * infosets. This corresponds to <code>ALTER_INFOSET</code> as the general
059 * XML violation policy. It is possible to treat XML 1.0 infoset violations
060 * as fatal by setting the general XML violation policy to <code>FATAL</code>.
061 *
062 * <p>The doctype is not represented in the tree.
063 *
064 * <p>The document mode is represented via the <code>Mode</code>
065 * interface on the <code>Document</code> node if the node implements
066 * that interface (depends on the used node factory).
067 *
068 * <p>The form pointer is stored if the node factory supports storing it.
069 *
070 * <p>This package has its own node factory class because the official
071 * XOM node factory may return multiple nodes instead of one confusing
072 * the assumptions of the DOM-oriented HTML5 parsing algorithm.
073 *
074 * @version $Id: HtmlBuilder.java 463 2008-10-03 11:46:38Z hsivonen $
075 * @author hsivonen
076 */
077 public class HtmlBuilder extends Builder {
078
079 private final Driver tokenizer;
080
081 private final XOMTreeBuilder xomTreeBuilder;
082
083 private final SimpleNodeFactory simpleNodeFactory;
084
085 private EntityResolver entityResolver;
086
087 /**
088 * Constructor with default node factory and fatal XML violation policy.
089 */
090 public HtmlBuilder() {
091 this(new SimpleNodeFactory(), XmlViolationPolicy.FATAL);
092 }
093
094 /**
095 * Constructor with given node factory and fatal XML violation policy.
096 * @param nodeFactory the factory
097 */
098 public HtmlBuilder(SimpleNodeFactory nodeFactory) {
099 this(nodeFactory, XmlViolationPolicy.FATAL);
100 }
101
102 /**
103 * Constructor with default node factory and given XML violation policy.
104 * @param xmlPolicy the policy
105 */
106 public HtmlBuilder(XmlViolationPolicy xmlPolicy) {
107 this(new SimpleNodeFactory(), xmlPolicy);
108 }
109
110 /**
111 * Constructor with given node factory and given XML violation policy.
112 * @param nodeFactory the factory
113 * @param xmlPolicy the policy
114 */
115 public HtmlBuilder(SimpleNodeFactory nodeFactory, XmlViolationPolicy xmlPolicy) {
116 super();
117 this.simpleNodeFactory = nodeFactory;
118 this.xomTreeBuilder = new XOMTreeBuilder(nodeFactory);
119 this.tokenizer = new Driver(xomTreeBuilder);
120 this.tokenizer.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
121 setXmlPolicy(xmlPolicy);
122 }
123
124 private void tokenize(InputSource is) throws ParsingException, IOException,
125 MalformedURLException {
126 try {
127 if (is == null) {
128 throw new IllegalArgumentException("Null input.");
129 }
130 if (is.getByteStream() == null && is.getCharacterStream() == null) {
131 String systemId = is.getSystemId();
132 if (systemId == null) {
133 throw new IllegalArgumentException(
134 "No byte stream, no character stream nor URI.");
135 }
136 if (entityResolver != null) {
137 is = entityResolver.resolveEntity(is.getPublicId(),
138 systemId);
139 }
140 if (is.getByteStream() == null
141 || is.getCharacterStream() == null) {
142 is = new InputSource();
143 is.setSystemId(systemId);
144 is.setByteStream(new URL(systemId).openStream());
145 }
146 }
147 tokenizer.tokenize(is);
148 } catch (SAXParseException e) {
149 throw new ParsingException(e.getMessage(), e.getSystemId(), e.getLineNumber(),
150 e.getColumnNumber(), e);
151 } catch (SAXException e) {
152 throw new ParsingException(e.getMessage(), e);
153 }
154 }
155
156 /**
157 * Parse from SAX <code>InputSource</code>.
158 * @param is the <code>InputSource</code>
159 * @return the document
160 * @throws ParsingException in case of an XML violation
161 * @throws IOException if IO goes wrang
162 */
163 public Document build(InputSource is) throws ParsingException, IOException {
164 xomTreeBuilder.setFragmentContext(null);
165 tokenize(is);
166 return xomTreeBuilder.getDocument();
167 }
168
169 /**
170 * Parse a fragment from SAX <code>InputSource</code>.
171 * @param is the <code>InputSource</code>
172 * @param context the name of the context element
173 * @return the fragment
174 * @throws ParsingException in case of an XML violation
175 * @throws IOException if IO goes wrang
176 */
177 public Nodes buildFragment(InputSource is, String context)
178 throws IOException, ParsingException {
179 xomTreeBuilder.setFragmentContext(context.intern());
180 tokenize(is);
181 return xomTreeBuilder.getDocumentFragment();
182 }
183
184
185 /**
186 * Parse from <code>File</code>.
187 * @param file the file
188 * @return the document
189 * @throws ParsingException in case of an XML violation
190 * @throws IOException if IO goes wrang
191 * @see nu.xom.Builder#build(java.io.File)
192 */
193 @Override
194 public Document build(File file) throws ParsingException,
195 ValidityException, IOException {
196 return build(new FileInputStream(file), file.toURI().toASCIIString());
197 }
198
199 /**
200 * Parse from <code>InputStream</code>.
201 * @param stream the stream
202 * @param uri the base URI
203 * @return the document
204 * @throws ParsingException in case of an XML violation
205 * @throws IOException if IO goes wrang
206 * @see nu.xom.Builder#build(java.io.InputStream, java.lang.String)
207 */
208 @Override
209 public Document build(InputStream stream, String uri)
210 throws ParsingException, ValidityException, IOException {
211 InputSource is = new InputSource(stream);
212 is.setSystemId(uri);
213 return build(is);
214 }
215
216 /**
217 * Parse from <code>InputStream</code>.
218 * @param stream the stream
219 * @return the document
220 * @throws ParsingException in case of an XML violation
221 * @throws IOException if IO goes wrang
222 * @see nu.xom.Builder#build(java.io.InputStream)
223 */
224 @Override
225 public Document build(InputStream stream) throws ParsingException,
226 ValidityException, IOException {
227 return build(new InputSource(stream));
228 }
229
230 /**
231 * Parse from <code>Reader</code>.
232 * @param stream the reader
233 * @param uri the base URI
234 * @return the document
235 * @throws ParsingException in case of an XML violation
236 * @throws IOException if IO goes wrang
237 * @see nu.xom.Builder#build(java.io.Reader, java.lang.String)
238 */
239 @Override
240 public Document build(Reader stream, String uri) throws ParsingException,
241 ValidityException, IOException {
242 InputSource is = new InputSource(stream);
243 is.setSystemId(uri);
244 return build(is);
245 }
246
247 /**
248 * Parse from <code>Reader</code>.
249 * @param stream the reader
250 * @return the document
251 * @throws ParsingException in case of an XML violation
252 * @throws IOException if IO goes wrang
253 * @see nu.xom.Builder#build(java.io.Reader)
254 */
255 @Override
256 public Document build(Reader stream) throws ParsingException,
257 ValidityException, IOException {
258 return build(new InputSource(stream));
259 }
260
261 /**
262 * Parse from <code>String</code>.
263 * @param content the HTML source as string
264 * @param uri the base URI
265 * @return the document
266 * @throws ParsingException in case of an XML violation
267 * @throws IOException if IO goes wrang
268 * @see nu.xom.Builder#build(java.lang.String, java.lang.String)
269 */
270 @Override
271 public Document build(String content, String uri) throws ParsingException,
272 ValidityException, IOException {
273 return build(new StringReader(content), uri);
274 }
275
276 /**
277 * Parse from URI.
278 * @param uri the URI of the document
279 * @return the document
280 * @throws ParsingException in case of an XML violation
281 * @throws IOException if IO goes wrang
282 * @see nu.xom.Builder#build(java.lang.String)
283 */
284 @Override
285 public Document build(String uri) throws ParsingException,
286 ValidityException, IOException {
287 return build(new InputSource(uri));
288 }
289
290 /**
291 * Gets the node factory
292 */
293 public SimpleNodeFactory getSimpleNodeFactory() {
294 return simpleNodeFactory;
295 }
296
297 /**
298 * Sets the entity resolver for URI-only inputs.
299 * @param resolver the resolver
300 * @see javax.xml.parsers.DocumentBuilder#setEntityResolver(org.xml.sax.EntityResolver)
301 */
302 public void setEntityResolver(EntityResolver resolver) {
303 this.entityResolver = resolver;
304 }
305
306 /**
307 * @see javax.xml.parsers.DocumentBuilder#setErrorHandler(org.xml.sax.ErrorHandler)
308 */
309 public void setErrorHandler(ErrorHandler errorHandler) {
310 xomTreeBuilder.setErrorHandler(errorHandler);
311 tokenizer.setErrorHandler(errorHandler);
312 }
313
314 /**
315 * Sets whether comment nodes appear in the tree.
316 * @param ignoreComments <code>true</code> to ignore comments
317 * @see nu.validator.htmlparser.impl.TreeBuilder#setIgnoringComments(boolean)
318 */
319 public void setIgnoringComments(boolean ignoreComments) {
320 xomTreeBuilder.setIgnoringComments(ignoreComments);
321 }
322
323 /**
324 * Sets whether the parser considers scripting to be enabled for noscript treatment.
325 * @param scriptingEnabled <code>true</code> to enable
326 * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
327 */
328 public void setScriptingEnabled(boolean scriptingEnabled) {
329 xomTreeBuilder.setScriptingEnabled(scriptingEnabled);
330 }
331
332 /**
333 * Toggles the checking of the NFC normalization of source.
334 * @param enable <code>true</code> to check normalization
335 * @see nu.validator.htmlparser.impl.Tokenizer#setCheckingNormalization(boolean)
336 */
337 public void setCheckingNormalization(boolean enable) {
338 tokenizer.setCheckingNormalization(enable);
339 }
340
341 /**
342 * Sets the policy for consecutive hyphens in comments.
343 * @param commentPolicy the policy
344 * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
345 */
346 public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
347 if (commentPolicy == XmlViolationPolicy.ALLOW) {
348 throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
349 }
350 tokenizer.setCommentPolicy(commentPolicy);
351 }
352
353 /**
354 * Sets the policy for non-XML characters except white space.
355 * @param contentNonXmlCharPolicy the policy
356 * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
357 */
358 public void setContentNonXmlCharPolicy(
359 XmlViolationPolicy contentNonXmlCharPolicy) {
360 if (contentNonXmlCharPolicy == XmlViolationPolicy.ALLOW) {
361 throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
362 }
363 tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
364 }
365
366 /**
367 * Sets the policy for non-XML white space.
368 * @param contentSpacePolicy the policy
369 * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
370 */
371 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
372 if (contentSpacePolicy == XmlViolationPolicy.ALLOW) {
373 throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
374 }
375 tokenizer.setContentSpacePolicy(contentSpacePolicy);
376 }
377
378
379 /**
380 * Whether the HTML 4 mode reports boolean attributes in a way that repeats
381 * the name in the value.
382 * @param html4ModeCompatibleWithXhtml1Schemata
383 */
384 public void setHtml4ModeCompatibleWithXhtml1Schemata(
385 boolean html4ModeCompatibleWithXhtml1Schemata) {
386 tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
387 }
388
389 /**
390 * @param mappingLangToXmlLang
391 * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
392 */
393 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
394 tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
395 }
396
397 /**
398 * @param namePolicy
399 * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
400 */
401 public void setNamePolicy(XmlViolationPolicy namePolicy) {
402 if (namePolicy == XmlViolationPolicy.ALLOW) {
403 throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
404 }
405 tokenizer.setNamePolicy(namePolicy);
406 xomTreeBuilder.setNamePolicy(namePolicy);
407 }
408
409 /**
410 * This is a catch-all convenience method for setting name, content space,
411 * content non-XML char and comment policies in one go.
412 *
413 * @param xmlPolicy
414 */
415 public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
416 setNamePolicy(xmlPolicy);
417 setContentSpacePolicy(xmlPolicy);
418 setContentNonXmlCharPolicy(xmlPolicy);
419 setCommentPolicy(xmlPolicy);
420 }
421
422 /**
423 * Does nothing.
424 * @deprecated
425 */
426 public void setBogusXmlnsPolicy(
427 XmlViolationPolicy bogusXmlnsPolicy) {
428 }
429
430 /**
431 * Sets the doctype expectation.
432 *
433 * @param doctypeExpectation
434 * the doctypeExpectation to set
435 * @see nu.validator.htmlparser.impl.TreeBuilder#setDoctypeExpectation(nu.validator.htmlparser.common.DoctypeExpectation)
436 */
437 public void setDoctypeExpectation(DoctypeExpectation doctypeExpectation) {
438 xomTreeBuilder.setDoctypeExpectation(doctypeExpectation);
439 }
440
441 /**
442 * Sets the document mode handler.
443 *
444 * @param documentModeHandler
445 * @see nu.validator.htmlparser.impl.TreeBuilder#setDocumentModeHandler(nu.validator.htmlparser.common.DocumentModeHandler)
446 */
447 public void setDocumentModeHandler(DocumentModeHandler documentModeHandler) {
448 xomTreeBuilder.setDocumentModeHandler(documentModeHandler);
449 }
450
451 /**
452 * Sets the encoding sniffing heuristics.
453 *
454 * @param heuristics the heuristics to set
455 * @see nu.validator.htmlparser.impl.Tokenizer#setHeuristics(nu.validator.htmlparser.common.Heuristics)
456 */
457 public void setHeuristics(Heuristics heuristics) {
458 tokenizer.setHeuristics(heuristics);
459 }
460 }