001 /*
002 * Copyright (c) 2007 Henri Sivonen
003 * Copyright (c) 2007 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.htmlparser.xom;
025
026 import java.io.File;
027 import java.io.FileInputStream;
028 import java.io.IOException;
029 import java.io.InputStream;
030 import java.io.Reader;
031 import java.io.StringReader;
032 import java.net.MalformedURLException;
033 import java.net.URL;
034
035 import nu.validator.htmlparser.common.DoctypeExpectation;
036 import nu.validator.htmlparser.common.DocumentModeHandler;
037 import nu.validator.htmlparser.common.XmlViolationPolicy;
038 import nu.validator.htmlparser.impl.Tokenizer;
039 import nu.xom.Builder;
040 import nu.xom.Document;
041 import nu.xom.Nodes;
042 import nu.xom.ParsingException;
043 import nu.xom.ValidityException;
044
045 import org.xml.sax.EntityResolver;
046 import org.xml.sax.ErrorHandler;
047 import org.xml.sax.InputSource;
048 import org.xml.sax.SAXException;
049 import org.xml.sax.SAXParseException;
050
051 /**
052 * This class implements an HTML5 parser that exposes data through the XOM
053 * interface.
054 *
055 * <p>By default, when using the constructor without arguments,
056 * this parser treats XML 1.0-incompatible infosets as fatal errors.
057 * This corresponds to
058 * <code>FATAL</code> as the general XML violation policy. Handling
059 * all input without fatal errors and without
060 * violating the XOM API contract is possible by setting
061 * the general XML violation policy to <code>ALTER_INFOSET</code>. <em>This
062 * makes the parser non-conforming</em> but is probably the most useful
063 * setting for most applications.
064 *
065 * <p>The doctype is not represented in the tree.
066 *
067 * <p>The document mode is represented via the <code>Mode</code>
068 * interface on the <code>Document</code> node if the node implements
069 * that interface (depends on the used node factory).
070 *
071 * <p>The form pointer is stored if the node factory supports storing it.
072 *
073 * <p>This package has its own node factory class because the official
074 * XOM node factory may return multiple nodes instead of one confusing
075 * the assumptions of the DOM-oriented HTML5 parsing algorithm.
076 *
077 * @version $Id: HtmlBuilder.java 153 2007-09-11 07:41:33Z hsivonen $
078 * @author hsivonen
079 */
080 public class HtmlBuilder extends Builder {
081
082 private final Tokenizer tokenizer;
083
084 private final XOMTreeBuilder xomTreeBuilder;
085
086 private final SimpleNodeFactory simpleNodeFactory;
087
088 private EntityResolver entityResolver;
089
090 /**
091 * Constructor with default node factory and fatal XML violation policy.
092 */
093 public HtmlBuilder() {
094 this(new SimpleNodeFactory(), XmlViolationPolicy.FATAL);
095 }
096
097 /**
098 * Constructor with given node factory and fatal XML violation policy.
099 * @param nodeFactory the factory
100 */
101 public HtmlBuilder(SimpleNodeFactory nodeFactory) {
102 this(nodeFactory, XmlViolationPolicy.FATAL);
103 }
104
105 /**
106 * Constructor with default node factory and given XML violation policy.
107 * @param xmlPolicy the policy
108 */
109 public HtmlBuilder(XmlViolationPolicy xmlPolicy) {
110 this(new SimpleNodeFactory(), xmlPolicy);
111 }
112
113 /**
114 * Constructor with given node factory and given XML violation policy.
115 * @param nodeFactory the factory
116 * @param xmlPolicy the policy
117 */
118 public HtmlBuilder(SimpleNodeFactory nodeFactory, XmlViolationPolicy xmlPolicy) {
119 super();
120 this.simpleNodeFactory = nodeFactory;
121 this.xomTreeBuilder = new XOMTreeBuilder(nodeFactory);
122 this.tokenizer = new Tokenizer(xomTreeBuilder);
123 this.tokenizer.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
124 setXmlPolicy(xmlPolicy);
125 }
126
127 private void tokenize(InputSource is) throws ParsingException, IOException,
128 MalformedURLException {
129 try {
130 if (is == null) {
131 throw new IllegalArgumentException("Null input.");
132 }
133 if (is.getByteStream() == null && is.getCharacterStream() == null) {
134 String systemId = is.getSystemId();
135 if (systemId == null) {
136 throw new IllegalArgumentException(
137 "No byte stream, no character stream nor URI.");
138 }
139 if (entityResolver != null) {
140 is = entityResolver.resolveEntity(is.getPublicId(),
141 systemId);
142 }
143 if (is.getByteStream() == null
144 || is.getCharacterStream() == null) {
145 is = new InputSource();
146 is.setSystemId(systemId);
147 is.setByteStream(new URL(systemId).openStream());
148 }
149 }
150 tokenizer.tokenize(is);
151 } catch (SAXParseException e) {
152 throw new ParsingException(e.getMessage(), e.getSystemId(), e.getLineNumber(),
153 e.getColumnNumber(), e);
154 } catch (SAXException e) {
155 throw new ParsingException(e.getMessage(), e);
156 }
157 }
158
159 /**
160 * Parse from SAX <code>InputSource</code>.
161 * @param is the <code>InputSource</code>
162 * @return the document
163 * @throws ParsingException in case of an XML violation
164 * @throws IOException if IO goes wrang
165 */
166 public Document build(InputSource is) throws ParsingException, IOException {
167 xomTreeBuilder.setFragmentContext(null);
168 tokenize(is);
169 return xomTreeBuilder.getDocument();
170 }
171
172 /**
173 * Parse a fragment from SAX <code>InputSource</code>.
174 * @param is the <code>InputSource</code>
175 * @param context the name of the context element
176 * @return the fragment
177 * @throws ParsingException in case of an XML violation
178 * @throws IOException if IO goes wrang
179 */
180 public Nodes buildFragment(InputSource is, String context)
181 throws IOException, ParsingException {
182 xomTreeBuilder.setFragmentContext(context);
183 tokenize(is);
184 return xomTreeBuilder.getDocumentFragment();
185 }
186
187
188 /**
189 * Parse from <code>File</code>.
190 * @param file the file
191 * @return the document
192 * @throws ParsingException in case of an XML violation
193 * @throws IOException if IO goes wrang
194 * @see nu.xom.Builder#build(java.io.File)
195 */
196 @Override
197 public Document build(File file) throws ParsingException,
198 ValidityException, IOException {
199 return build(new FileInputStream(file), file.toURI().toASCIIString());
200 }
201
202 /**
203 * Parse from <code>InputStream</code>.
204 * @param stream the stream
205 * @param uri the base URI
206 * @return the document
207 * @throws ParsingException in case of an XML violation
208 * @throws IOException if IO goes wrang
209 * @see nu.xom.Builder#build(java.io.InputStream, java.lang.String)
210 */
211 @Override
212 public Document build(InputStream stream, String uri)
213 throws ParsingException, ValidityException, IOException {
214 InputSource is = new InputSource(stream);
215 is.setSystemId(uri);
216 return build(is);
217 }
218
219 /**
220 * Parse from <code>InputStream</code>.
221 * @param stream the stream
222 * @return the document
223 * @throws ParsingException in case of an XML violation
224 * @throws IOException if IO goes wrang
225 * @see nu.xom.Builder#build(java.io.InputStream)
226 */
227 @Override
228 public Document build(InputStream stream) throws ParsingException,
229 ValidityException, IOException {
230 return build(new InputSource(stream));
231 }
232
233 /**
234 * Parse from <code>Reader</code>.
235 * @param stream the reader
236 * @param uri the base URI
237 * @return the document
238 * @throws ParsingException in case of an XML violation
239 * @throws IOException if IO goes wrang
240 * @see nu.xom.Builder#build(java.io.Reader, java.lang.String)
241 */
242 @Override
243 public Document build(Reader stream, String uri) throws ParsingException,
244 ValidityException, IOException {
245 InputSource is = new InputSource(stream);
246 is.setSystemId(uri);
247 return build(is);
248 }
249
250 /**
251 * Parse from <code>Reader</code>.
252 * @param stream the reader
253 * @return the document
254 * @throws ParsingException in case of an XML violation
255 * @throws IOException if IO goes wrang
256 * @see nu.xom.Builder#build(java.io.Reader)
257 */
258 @Override
259 public Document build(Reader stream) throws ParsingException,
260 ValidityException, IOException {
261 return build(new InputSource(stream));
262 }
263
264 /**
265 * Parse from <code>String</code>.
266 * @param content the HTML source as string
267 * @param uri the base URI
268 * @return the document
269 * @throws ParsingException in case of an XML violation
270 * @throws IOException if IO goes wrang
271 * @see nu.xom.Builder#build(java.lang.String, java.lang.String)
272 */
273 @Override
274 public Document build(String content, String uri) throws ParsingException,
275 ValidityException, IOException {
276 return build(new StringReader(content), uri);
277 }
278
279 /**
280 * Parse from URI.
281 * @param uri the URI of the document
282 * @return the document
283 * @throws ParsingException in case of an XML violation
284 * @throws IOException if IO goes wrang
285 * @see nu.xom.Builder#build(java.lang.String)
286 */
287 @Override
288 public Document build(String uri) throws ParsingException,
289 ValidityException, IOException {
290 return build(new InputSource(uri));
291 }
292
293 /**
294 * Gets the node factory
295 */
296 public SimpleNodeFactory getSimpleNodeFactory() {
297 return simpleNodeFactory;
298 }
299
300 /**
301 * Sets the entity resolver for URI-only inputs.
302 * @param resolver the resolver
303 * @see javax.xml.parsers.DocumentBuilder#setEntityResolver(org.xml.sax.EntityResolver)
304 */
305 public void setEntityResolver(EntityResolver resolver) {
306 this.entityResolver = resolver;
307 }
308
309 /**
310 * @see javax.xml.parsers.DocumentBuilder#setErrorHandler(org.xml.sax.ErrorHandler)
311 */
312 public void setErrorHandler(ErrorHandler errorHandler) {
313 xomTreeBuilder.setErrorHandler(errorHandler);
314 tokenizer.setErrorHandler(errorHandler);
315 }
316
317 /**
318 * Sets whether comment nodes appear in the tree.
319 * @param ignoreComments <code>true</code> to ignore comments
320 * @see nu.validator.htmlparser.impl.TreeBuilder#setIgnoringComments(boolean)
321 */
322 public void setIgnoringComments(boolean ignoreComments) {
323 xomTreeBuilder.setIgnoringComments(ignoreComments);
324 }
325
326 /**
327 * Sets whether the parser considers scripting to be enabled for noscript treatment.
328 * @param scriptingEnabled <code>true</code> to enable
329 * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
330 */
331 public void setScriptingEnabled(boolean scriptingEnabled) {
332 xomTreeBuilder.setScriptingEnabled(scriptingEnabled);
333 }
334
335 /**
336 * Toggles the checking of the NFC normalization of source.
337 * @param enable <code>true</code> to check normalization
338 * @see nu.validator.htmlparser.impl.Tokenizer#setCheckingNormalization(boolean)
339 */
340 public void setCheckingNormalization(boolean enable) {
341 tokenizer.setCheckingNormalization(enable);
342 }
343
344 /**
345 * Sets the policy for consecutive hyphens in comments.
346 * @param commentPolicy the policy
347 * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
348 */
349 public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
350 if (commentPolicy == XmlViolationPolicy.ALLOW) {
351 throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
352 }
353 tokenizer.setCommentPolicy(commentPolicy);
354 }
355
356 /**
357 * Sets the policy for non-XML characters except white space.
358 * @param contentNonXmlCharPolicy the policy
359 * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
360 */
361 public void setContentNonXmlCharPolicy(
362 XmlViolationPolicy contentNonXmlCharPolicy) {
363 if (contentNonXmlCharPolicy == XmlViolationPolicy.ALLOW) {
364 throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
365 }
366 tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
367 }
368
369 /**
370 * Sets the policy for non-XML white space.
371 * @param contentSpacePolicy the policy
372 * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
373 */
374 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
375 if (contentSpacePolicy == XmlViolationPolicy.ALLOW) {
376 throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
377 }
378 tokenizer.setContentSpacePolicy(contentSpacePolicy);
379 }
380
381
382 /**
383 * Whether the HTML 4 mode reports boolean attributes in a way that repeats
384 * the name in the value.
385 * @param html4ModeCompatibleWithXhtml1Schemata
386 */
387 public void setHtml4ModeCompatibleWithXhtml1Schemata(
388 boolean html4ModeCompatibleWithXhtml1Schemata) {
389 tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
390 }
391
392 /**
393 * @param mappingLangToXmlLang
394 * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
395 */
396 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
397 tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
398 }
399
400 /**
401 * @param namePolicy
402 * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
403 */
404 public void setNamePolicy(XmlViolationPolicy namePolicy) {
405 if (namePolicy == XmlViolationPolicy.ALLOW) {
406 throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
407 }
408 tokenizer.setNamePolicy(namePolicy);
409 }
410
411 /**
412 * This is a catch-all convenience method for setting name, content space,
413 * content non-XML char and comment policies in one go.
414 *
415 * @param xmlPolicy
416 */
417 public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
418 setNamePolicy(xmlPolicy);
419 setContentSpacePolicy(xmlPolicy);
420 setContentNonXmlCharPolicy(xmlPolicy);
421 setCommentPolicy(xmlPolicy);
422 setBogusXmlnsPolicy(xmlPolicy);
423 }
424
425 /**
426 * Sets the doctype expectation.
427 *
428 * @param doctypeExpectation
429 * the doctypeExpectation to set
430 * @see nu.validator.htmlparser.impl.TreeBuilder#setDoctypeExpectation(nu.validator.htmlparser.common.DoctypeExpectation)
431 */
432 public void setDoctypeExpectation(DoctypeExpectation doctypeExpectation) {
433 xomTreeBuilder.setDoctypeExpectation(doctypeExpectation);
434 }
435
436 /**
437 * Sets the document mode handler.
438 *
439 * @param documentModeHandler
440 * @see nu.validator.htmlparser.impl.TreeBuilder#setDocumentModeHandler(nu.validator.htmlparser.common.DocumentModeHandler)
441 */
442 public void setDocumentModeHandler(DocumentModeHandler documentModeHandler) {
443 xomTreeBuilder.setDocumentModeHandler(documentModeHandler);
444 }
445
446 /**
447 * Sets the policy for forbidden <code>xmlns</code> attributes.
448 * @param bogusXmlnsPolicy the policy
449 * @see nu.validator.htmlparser.impl.Tokenizer#setBogusXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
450 */
451 public void setBogusXmlnsPolicy(XmlViolationPolicy bogusXmlnsPolicy) {
452 tokenizer.setBogusXmlnsPolicy(bogusXmlnsPolicy);
453 }
454 }