001    /*
002     * Copyright (c) 2007 Henri Sivonen
003     * Copyright (c) 2007-2008 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.htmlparser.xom;
025    
026    import java.io.File;
027    import java.io.FileInputStream;
028    import java.io.IOException;
029    import java.io.InputStream;
030    import java.io.Reader;
031    import java.io.StringReader;
032    import java.net.MalformedURLException;
033    import java.net.URL;
034    
035    import nu.validator.htmlparser.common.DoctypeExpectation;
036    import nu.validator.htmlparser.common.DocumentModeHandler;
037    import nu.validator.htmlparser.common.Heuristics;
038    import nu.validator.htmlparser.common.XmlViolationPolicy;
039    import nu.validator.htmlparser.io.Driver;
040    import nu.xom.Builder;
041    import nu.xom.Document;
042    import nu.xom.Nodes;
043    import nu.xom.ParsingException;
044    import nu.xom.ValidityException;
045    
046    import org.xml.sax.EntityResolver;
047    import org.xml.sax.ErrorHandler;
048    import org.xml.sax.InputSource;
049    import org.xml.sax.SAXException;
050    import org.xml.sax.SAXParseException;
051    
052    /**
053     * This class implements an HTML5 parser that exposes data through the XOM 
054     * interface. 
055     * 
056     * <p>By default, when using the constructor without arguments, the 
057     * this parser coerces XML 1.0-incompatible infosets into XML 1.0-compatible
058     * infosets. This corresponds to <code>ALTER_INFOSET</code> as the general 
059     * XML violation policy. It is possible to treat XML 1.0 infoset violations 
060     * as fatal by setting the general XML violation policy to <code>FATAL</code>. 
061     * 
062     * <p>The doctype is not represented in the tree.
063     * 
064     * <p>The document mode is represented via the <code>Mode</code> 
065     * interface on the <code>Document</code> node if the node implements 
066     * that interface (depends on the used node factory).
067     * 
068     * <p>The form pointer is stored if the node factory supports storing it.
069     * 
070     * <p>This package has its own node factory class because the official 
071     * XOM node factory may return multiple nodes instead of one confusing 
072     * the assumptions of the DOM-oriented HTML5 parsing algorithm.
073     * 
074     * @version $Id: HtmlBuilder.java 463 2008-10-03 11:46:38Z hsivonen $
075     * @author hsivonen
076     */
077    public class HtmlBuilder extends Builder {
078    
079        private final Driver tokenizer;
080    
081        private final XOMTreeBuilder xomTreeBuilder;
082    
083        private final SimpleNodeFactory simpleNodeFactory;
084    
085        private EntityResolver entityResolver;
086    
087        /**
088         * Constructor with default node factory and fatal XML violation policy.
089         */
090        public HtmlBuilder() {
091            this(new SimpleNodeFactory(), XmlViolationPolicy.FATAL);
092        }
093        
094        /**
095         * Constructor with given node factory and fatal XML violation policy.
096         * @param nodeFactory the factory
097         */
098        public HtmlBuilder(SimpleNodeFactory nodeFactory) {
099            this(nodeFactory, XmlViolationPolicy.FATAL);
100        }
101    
102        /**
103         * Constructor with default node factory and given XML violation policy.
104         * @param xmlPolicy the policy
105         */
106        public HtmlBuilder(XmlViolationPolicy xmlPolicy) {
107            this(new SimpleNodeFactory(), xmlPolicy);
108        }
109        
110        /**
111         * Constructor with given node factory and given XML violation policy.
112         * @param nodeFactory the factory
113         * @param xmlPolicy the policy
114         */
115        public HtmlBuilder(SimpleNodeFactory nodeFactory, XmlViolationPolicy xmlPolicy) {
116            super();
117            this.simpleNodeFactory = nodeFactory;
118            this.xomTreeBuilder = new XOMTreeBuilder(nodeFactory);
119            this.tokenizer = new Driver(xomTreeBuilder);
120            this.tokenizer.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
121            setXmlPolicy(xmlPolicy);
122        }
123    
124        private void tokenize(InputSource is) throws ParsingException, IOException,
125                MalformedURLException {
126            try {
127                if (is == null) {
128                    throw new IllegalArgumentException("Null input.");
129                }
130                if (is.getByteStream() == null && is.getCharacterStream() == null) {
131                    String systemId = is.getSystemId();
132                    if (systemId == null) {
133                        throw new IllegalArgumentException(
134                                "No byte stream, no character stream nor URI.");
135                    }
136                    if (entityResolver != null) {
137                        is = entityResolver.resolveEntity(is.getPublicId(),
138                                systemId);
139                    }
140                    if (is.getByteStream() == null
141                            || is.getCharacterStream() == null) {
142                        is = new InputSource();
143                        is.setSystemId(systemId);
144                        is.setByteStream(new URL(systemId).openStream());
145                    }
146                }
147                tokenizer.tokenize(is);
148            } catch (SAXParseException e) {
149                throw new ParsingException(e.getMessage(), e.getSystemId(), e.getLineNumber(),
150                        e.getColumnNumber(), e);
151            } catch (SAXException e) {
152                throw new ParsingException(e.getMessage(), e);
153            }
154        }
155    
156        /**
157         * Parse from SAX <code>InputSource</code>.
158         * @param is the <code>InputSource</code>
159         * @return the document
160         * @throws ParsingException in case of an XML violation
161         * @throws IOException if IO goes wrang
162         */
163        public Document build(InputSource is) throws ParsingException, IOException {
164            xomTreeBuilder.setFragmentContext(null);
165            tokenize(is);
166            return xomTreeBuilder.getDocument();
167        }
168    
169        /**
170         * Parse a fragment from SAX <code>InputSource</code>.
171         * @param is the <code>InputSource</code>
172         * @param context the name of the context element
173         * @return the fragment
174         * @throws ParsingException in case of an XML violation
175         * @throws IOException if IO goes wrang
176         */
177        public Nodes buildFragment(InputSource is, String context)
178                throws IOException, ParsingException {
179            xomTreeBuilder.setFragmentContext(context.intern());
180            tokenize(is);
181            return xomTreeBuilder.getDocumentFragment();
182        }
183    
184        
185        /**
186         * Parse from <code>File</code>.
187         * @param file the file
188         * @return the document
189         * @throws ParsingException in case of an XML violation
190         * @throws IOException if IO goes wrang
191         * @see nu.xom.Builder#build(java.io.File)
192         */
193        @Override
194        public Document build(File file) throws ParsingException,
195                ValidityException, IOException {
196            return build(new FileInputStream(file), file.toURI().toASCIIString());
197        }
198    
199        /**
200         * Parse from <code>InputStream</code>.
201         * @param stream the stream
202         * @param uri the base URI
203         * @return the document
204         * @throws ParsingException in case of an XML violation
205         * @throws IOException if IO goes wrang
206         * @see nu.xom.Builder#build(java.io.InputStream, java.lang.String)
207         */
208        @Override
209        public Document build(InputStream stream, String uri)
210                throws ParsingException, ValidityException, IOException {
211            InputSource is = new InputSource(stream);
212            is.setSystemId(uri);
213            return build(is);
214        }
215    
216        /**
217         * Parse from <code>InputStream</code>.
218         * @param stream the stream
219         * @return the document
220         * @throws ParsingException in case of an XML violation
221         * @throws IOException if IO goes wrang
222         * @see nu.xom.Builder#build(java.io.InputStream)
223         */
224        @Override
225        public Document build(InputStream stream) throws ParsingException,
226                ValidityException, IOException {
227            return build(new InputSource(stream));
228        }
229    
230        /**
231         * Parse from <code>Reader</code>.
232         * @param stream the reader
233         * @param uri the base URI
234         * @return the document
235         * @throws ParsingException in case of an XML violation
236         * @throws IOException if IO goes wrang
237         * @see nu.xom.Builder#build(java.io.Reader, java.lang.String)
238         */
239        @Override
240        public Document build(Reader stream, String uri) throws ParsingException,
241                ValidityException, IOException {
242            InputSource is = new InputSource(stream);
243            is.setSystemId(uri);
244            return build(is);
245        }
246    
247        /**
248         * Parse from <code>Reader</code>.
249         * @param stream the reader
250         * @return the document
251         * @throws ParsingException in case of an XML violation
252         * @throws IOException if IO goes wrang
253         * @see nu.xom.Builder#build(java.io.Reader)
254         */
255        @Override
256        public Document build(Reader stream) throws ParsingException,
257                ValidityException, IOException {
258            return build(new InputSource(stream));
259        }
260    
261        /**
262         * Parse from <code>String</code>.
263         * @param content the HTML source as string
264         * @param uri the base URI
265         * @return the document
266         * @throws ParsingException in case of an XML violation
267         * @throws IOException if IO goes wrang
268         * @see nu.xom.Builder#build(java.lang.String, java.lang.String)
269         */
270        @Override
271        public Document build(String content, String uri) throws ParsingException,
272                ValidityException, IOException {
273            return build(new StringReader(content), uri);
274        }
275    
276        /**
277         * Parse from URI.
278         * @param uri the URI of the document
279         * @return the document
280         * @throws ParsingException in case of an XML violation
281         * @throws IOException if IO goes wrang
282         * @see nu.xom.Builder#build(java.lang.String)
283         */
284        @Override
285        public Document build(String uri) throws ParsingException,
286                ValidityException, IOException {
287            return build(new InputSource(uri));
288        }
289    
290        /**
291         * Gets the node factory
292         */
293        public SimpleNodeFactory getSimpleNodeFactory() {
294            return simpleNodeFactory;
295        }
296    
297        /**
298         * Sets the entity resolver for URI-only inputs.
299         * @param resolver the resolver
300         * @see javax.xml.parsers.DocumentBuilder#setEntityResolver(org.xml.sax.EntityResolver)
301         */
302        public void setEntityResolver(EntityResolver resolver) {
303            this.entityResolver = resolver;
304        }
305    
306        /**
307         * @see javax.xml.parsers.DocumentBuilder#setErrorHandler(org.xml.sax.ErrorHandler)
308         */
309        public void setErrorHandler(ErrorHandler errorHandler) {
310            xomTreeBuilder.setErrorHandler(errorHandler);
311            tokenizer.setErrorHandler(errorHandler);
312        }
313    
314        /**
315         * Sets whether comment nodes appear in the tree.
316         * @param ignoreComments <code>true</code> to ignore comments
317         * @see nu.validator.htmlparser.impl.TreeBuilder#setIgnoringComments(boolean)
318         */
319        public void setIgnoringComments(boolean ignoreComments) {
320            xomTreeBuilder.setIgnoringComments(ignoreComments);
321        }
322    
323        /**
324         * Sets whether the parser considers scripting to be enabled for noscript treatment.
325         * @param scriptingEnabled <code>true</code> to enable
326         * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
327         */
328        public void setScriptingEnabled(boolean scriptingEnabled) {
329            xomTreeBuilder.setScriptingEnabled(scriptingEnabled);
330        }
331    
332        /**
333         * Toggles the checking of the NFC normalization of source.
334         * @param enable <code>true</code> to check normalization
335         * @see nu.validator.htmlparser.impl.Tokenizer#setCheckingNormalization(boolean)
336         */
337        public void setCheckingNormalization(boolean enable) {
338            tokenizer.setCheckingNormalization(enable);
339        }
340    
341        /**
342         * Sets the policy for consecutive hyphens in comments.
343         * @param commentPolicy the policy
344         * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
345         */
346        public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
347            if (commentPolicy == XmlViolationPolicy.ALLOW) {
348                throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
349            }
350            tokenizer.setCommentPolicy(commentPolicy);
351        }
352    
353        /**
354         * Sets the policy for non-XML characters except white space.
355         * @param contentNonXmlCharPolicy the policy
356         * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
357         */
358        public void setContentNonXmlCharPolicy(
359                XmlViolationPolicy contentNonXmlCharPolicy) {
360            if (contentNonXmlCharPolicy == XmlViolationPolicy.ALLOW) {
361                throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
362            }
363            tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
364        }
365    
366        /**
367         * Sets the policy for non-XML white space.
368         * @param contentSpacePolicy the policy
369         * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
370         */
371        public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
372            if (contentSpacePolicy == XmlViolationPolicy.ALLOW) {
373                throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
374            }
375            tokenizer.setContentSpacePolicy(contentSpacePolicy);
376        }
377    
378    
379        /**
380         * Whether the HTML 4 mode reports boolean attributes in a way that repeats
381         * the name in the value.
382         * @param html4ModeCompatibleWithXhtml1Schemata
383         */
384        public void setHtml4ModeCompatibleWithXhtml1Schemata(
385                boolean html4ModeCompatibleWithXhtml1Schemata) {
386            tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
387        }
388    
389        /**
390         * @param mappingLangToXmlLang
391         * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
392         */
393        public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
394            tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
395        }
396    
397        /**
398         * @param namePolicy
399         * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
400         */
401        public void setNamePolicy(XmlViolationPolicy namePolicy) {
402            if (namePolicy == XmlViolationPolicy.ALLOW) {
403                throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
404            }
405            tokenizer.setNamePolicy(namePolicy);
406            xomTreeBuilder.setNamePolicy(namePolicy);
407        }
408    
409        /**
410         * This is a catch-all convenience method for setting name, content space,
411         * content non-XML char and comment policies in one go.
412         * 
413         * @param xmlPolicy
414         */
415        public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
416            setNamePolicy(xmlPolicy);
417            setContentSpacePolicy(xmlPolicy);
418            setContentNonXmlCharPolicy(xmlPolicy);
419            setCommentPolicy(xmlPolicy);
420        }
421    
422        /**
423         * Does nothing.
424         * @deprecated
425         */
426        public void setBogusXmlnsPolicy(
427                XmlViolationPolicy bogusXmlnsPolicy) {
428        }
429        
430        /**
431         * Sets the doctype expectation.
432         * 
433         * @param doctypeExpectation
434         *            the doctypeExpectation to set
435         * @see nu.validator.htmlparser.impl.TreeBuilder#setDoctypeExpectation(nu.validator.htmlparser.common.DoctypeExpectation)
436         */
437        public void setDoctypeExpectation(DoctypeExpectation doctypeExpectation) {
438            xomTreeBuilder.setDoctypeExpectation(doctypeExpectation);
439        }
440    
441        /**
442         * Sets the document mode handler.
443         * 
444         * @param documentModeHandler
445         * @see nu.validator.htmlparser.impl.TreeBuilder#setDocumentModeHandler(nu.validator.htmlparser.common.DocumentModeHandler)
446         */
447        public void setDocumentModeHandler(DocumentModeHandler documentModeHandler) {
448            xomTreeBuilder.setDocumentModeHandler(documentModeHandler);
449        }
450    
451        /**
452         * Sets the encoding sniffing heuristics.
453         * 
454         * @param heuristics the heuristics to set
455         * @see nu.validator.htmlparser.impl.Tokenizer#setHeuristics(nu.validator.htmlparser.common.Heuristics)
456         */
457        public void setHeuristics(Heuristics heuristics) {
458            tokenizer.setHeuristics(heuristics);
459        }
460    }