001    /*
002     * Copyright (c) 2007 Henri Sivonen
003     * Copyright (c) 2007 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.htmlparser.xom;
025    
026    import java.io.File;
027    import java.io.FileInputStream;
028    import java.io.IOException;
029    import java.io.InputStream;
030    import java.io.Reader;
031    import java.io.StringReader;
032    import java.net.MalformedURLException;
033    import java.net.URL;
034    
035    import nu.validator.htmlparser.common.DoctypeExpectation;
036    import nu.validator.htmlparser.common.DocumentModeHandler;
037    import nu.validator.htmlparser.common.XmlViolationPolicy;
038    import nu.validator.htmlparser.impl.Tokenizer;
039    import nu.xom.Builder;
040    import nu.xom.Document;
041    import nu.xom.Nodes;
042    import nu.xom.ParsingException;
043    import nu.xom.ValidityException;
044    
045    import org.xml.sax.EntityResolver;
046    import org.xml.sax.ErrorHandler;
047    import org.xml.sax.InputSource;
048    import org.xml.sax.SAXException;
049    import org.xml.sax.SAXParseException;
050    
051    /**
052     * This class implements an HTML5 parser that exposes data through the XOM 
053     * interface. 
054     * 
055     * <p>By default, when using the constructor without arguments, the 
056     * this parser treats XML 1.0-incompatible infosets as fatal errors. 
057     * This corresponds to 
058     * <code>FATAL</code> as the general XML violation policy. Handling 
059     * all input without fatal errors and without 
060     * violating the XOM API contract is possible by setting 
061     * the general XML violation policy to <code>ALTER_INFOSET</code>. <em>This 
062     * makes the parser non-conforming</em> but is probably the most useful 
063     * setting for most applications.
064     * 
065     * <p>The doctype is not represented in the tree.
066     * 
067     * <p>The document mode is represented via the <code>Mode</code> 
068     * interface on the <code>Document</code> node if the node implements 
069     * that interface (depends on the used node factory).
070     * 
071     * <p>The form pointer is stored if the node factory supports storing it.
072     * 
073     * <p>This package has its own node factory class because the official 
074     * XOM node factory may return multiple nodes instead of one confusing 
075     * the assumptions of the DOM-oriented HTML5 parsing algorithm.
076     * 
077     * @version $Id: HtmlBuilder.java 153 2007-09-11 07:41:33Z hsivonen $
078     * @author hsivonen
079     */
080    public class HtmlBuilder extends Builder {
081    
082        private final Tokenizer tokenizer;
083    
084        private final XOMTreeBuilder xomTreeBuilder;
085    
086        private final SimpleNodeFactory simpleNodeFactory;
087    
088        private EntityResolver entityResolver;
089    
090        /**
091         * Constructor with default node factory and fatal XML violation policy.
092         */
093        public HtmlBuilder() {
094            this(new SimpleNodeFactory(), XmlViolationPolicy.FATAL);
095        }
096        
097        /**
098         * Constructor with given node factory and fatal XML violation policy.
099         * @param nodeFactory the factory
100         */
101        public HtmlBuilder(SimpleNodeFactory nodeFactory) {
102            this(nodeFactory, XmlViolationPolicy.FATAL);
103        }
104    
105        /**
106         * Constructor with default node factory and given XML violation policy.
107         * @param xmlPolicy the policy
108         */
109        public HtmlBuilder(XmlViolationPolicy xmlPolicy) {
110            this(new SimpleNodeFactory(), xmlPolicy);
111        }
112        
113        /**
114         * Constructor with given node factory and given XML violation policy.
115         * @param nodeFactory the factory
116         * @param xmlPolicy the policy
117         */
118        public HtmlBuilder(SimpleNodeFactory nodeFactory, XmlViolationPolicy xmlPolicy) {
119            super();
120            this.simpleNodeFactory = nodeFactory;
121            this.xomTreeBuilder = new XOMTreeBuilder(nodeFactory);
122            this.tokenizer = new Tokenizer(xomTreeBuilder);
123            this.tokenizer.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
124            setXmlPolicy(xmlPolicy);
125        }
126    
127        private void tokenize(InputSource is) throws ParsingException, IOException,
128                MalformedURLException {
129            try {
130                if (is == null) {
131                    throw new IllegalArgumentException("Null input.");
132                }
133                if (is.getByteStream() == null && is.getCharacterStream() == null) {
134                    String systemId = is.getSystemId();
135                    if (systemId == null) {
136                        throw new IllegalArgumentException(
137                                "No byte stream, no character stream nor URI.");
138                    }
139                    if (entityResolver != null) {
140                        is = entityResolver.resolveEntity(is.getPublicId(),
141                                systemId);
142                    }
143                    if (is.getByteStream() == null
144                            || is.getCharacterStream() == null) {
145                        is = new InputSource();
146                        is.setSystemId(systemId);
147                        is.setByteStream(new URL(systemId).openStream());
148                    }
149                }
150                tokenizer.tokenize(is);
151            } catch (SAXParseException e) {
152                throw new ParsingException(e.getMessage(), e.getSystemId(), e.getLineNumber(),
153                        e.getColumnNumber(), e);
154            } catch (SAXException e) {
155                throw new ParsingException(e.getMessage(), e);
156            }
157        }
158    
159        /**
160         * Parse from SAX <code>InputSource</code>.
161         * @param is the <code>InputSource</code>
162         * @return the document
163         * @throws ParsingException in case of an XML violation
164         * @throws IOException if IO goes wrang
165         */
166        public Document build(InputSource is) throws ParsingException, IOException {
167            xomTreeBuilder.setFragmentContext(null);
168            tokenize(is);
169            return xomTreeBuilder.getDocument();
170        }
171    
172        /**
173         * Parse a fragment from SAX <code>InputSource</code>.
174         * @param is the <code>InputSource</code>
175         * @param context the name of the context element
176         * @return the fragment
177         * @throws ParsingException in case of an XML violation
178         * @throws IOException if IO goes wrang
179         */
180        public Nodes buildFragment(InputSource is, String context)
181                throws IOException, ParsingException {
182            xomTreeBuilder.setFragmentContext(context);
183            tokenize(is);
184            return xomTreeBuilder.getDocumentFragment();
185        }
186    
187        
188        /**
189         * Parse from <code>File</code>.
190         * @param file the file
191         * @return the document
192         * @throws ParsingException in case of an XML violation
193         * @throws IOException if IO goes wrang
194         * @see nu.xom.Builder#build(java.io.File)
195         */
196        @Override
197        public Document build(File file) throws ParsingException,
198                ValidityException, IOException {
199            return build(new FileInputStream(file), file.toURI().toASCIIString());
200        }
201    
202        /**
203         * Parse from <code>InputStream</code>.
204         * @param stream the stream
205         * @param uri the base URI
206         * @return the document
207         * @throws ParsingException in case of an XML violation
208         * @throws IOException if IO goes wrang
209         * @see nu.xom.Builder#build(java.io.InputStream, java.lang.String)
210         */
211        @Override
212        public Document build(InputStream stream, String uri)
213                throws ParsingException, ValidityException, IOException {
214            InputSource is = new InputSource(stream);
215            is.setSystemId(uri);
216            return build(is);
217        }
218    
219        /**
220         * Parse from <code>InputStream</code>.
221         * @param stream the stream
222         * @return the document
223         * @throws ParsingException in case of an XML violation
224         * @throws IOException if IO goes wrang
225         * @see nu.xom.Builder#build(java.io.InputStream)
226         */
227        @Override
228        public Document build(InputStream stream) throws ParsingException,
229                ValidityException, IOException {
230            return build(new InputSource(stream));
231        }
232    
233        /**
234         * Parse from <code>Reader</code>.
235         * @param stream the reader
236         * @param uri the base URI
237         * @return the document
238         * @throws ParsingException in case of an XML violation
239         * @throws IOException if IO goes wrang
240         * @see nu.xom.Builder#build(java.io.Reader, java.lang.String)
241         */
242        @Override
243        public Document build(Reader stream, String uri) throws ParsingException,
244                ValidityException, IOException {
245            InputSource is = new InputSource(stream);
246            is.setSystemId(uri);
247            return build(is);
248        }
249    
250        /**
251         * Parse from <code>Reader</code>.
252         * @param stream the reader
253         * @return the document
254         * @throws ParsingException in case of an XML violation
255         * @throws IOException if IO goes wrang
256         * @see nu.xom.Builder#build(java.io.Reader)
257         */
258        @Override
259        public Document build(Reader stream) throws ParsingException,
260                ValidityException, IOException {
261            return build(new InputSource(stream));
262        }
263    
264        /**
265         * Parse from <code>String</code>.
266         * @param content the HTML source as string
267         * @param uri the base URI
268         * @return the document
269         * @throws ParsingException in case of an XML violation
270         * @throws IOException if IO goes wrang
271         * @see nu.xom.Builder#build(java.lang.String, java.lang.String)
272         */
273        @Override
274        public Document build(String content, String uri) throws ParsingException,
275                ValidityException, IOException {
276            return build(new StringReader(content), uri);
277        }
278    
279        /**
280         * Parse from URI.
281         * @param uri the URI of the document
282         * @return the document
283         * @throws ParsingException in case of an XML violation
284         * @throws IOException if IO goes wrang
285         * @see nu.xom.Builder#build(java.lang.String)
286         */
287        @Override
288        public Document build(String uri) throws ParsingException,
289                ValidityException, IOException {
290            return build(new InputSource(uri));
291        }
292    
293        /**
294         * Gets the node factory
295         */
296        public SimpleNodeFactory getSimpleNodeFactory() {
297            return simpleNodeFactory;
298        }
299    
300        /**
301         * Sets the entity resolver for URI-only inputs.
302         * @param resolver the resolver
303         * @see javax.xml.parsers.DocumentBuilder#setEntityResolver(org.xml.sax.EntityResolver)
304         */
305        public void setEntityResolver(EntityResolver resolver) {
306            this.entityResolver = resolver;
307        }
308    
309        /**
310         * @see javax.xml.parsers.DocumentBuilder#setErrorHandler(org.xml.sax.ErrorHandler)
311         */
312        public void setErrorHandler(ErrorHandler errorHandler) {
313            xomTreeBuilder.setErrorHandler(errorHandler);
314            tokenizer.setErrorHandler(errorHandler);
315        }
316    
317        /**
318         * Sets whether comment nodes appear in the tree.
319         * @param ignoreComments <code>true</code> to ignore comments
320         * @see nu.validator.htmlparser.impl.TreeBuilder#setIgnoringComments(boolean)
321         */
322        public void setIgnoringComments(boolean ignoreComments) {
323            xomTreeBuilder.setIgnoringComments(ignoreComments);
324        }
325    
326        /**
327         * Sets whether the parser considers scripting to be enabled for noscript treatment.
328         * @param scriptingEnabled <code>true</code> to enable
329         * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
330         */
331        public void setScriptingEnabled(boolean scriptingEnabled) {
332            xomTreeBuilder.setScriptingEnabled(scriptingEnabled);
333        }
334    
335        /**
336         * Toggles the checking of the NFC normalization of source.
337         * @param enable <code>true</code> to check normalization
338         * @see nu.validator.htmlparser.impl.Tokenizer#setCheckingNormalization(boolean)
339         */
340        public void setCheckingNormalization(boolean enable) {
341            tokenizer.setCheckingNormalization(enable);
342        }
343    
344        /**
345         * Sets the policy for consecutive hyphens in comments.
346         * @param commentPolicy the policy
347         * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
348         */
349        public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
350            if (commentPolicy == XmlViolationPolicy.ALLOW) {
351                throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
352            }
353            tokenizer.setCommentPolicy(commentPolicy);
354        }
355    
356        /**
357         * Sets the policy for non-XML characters except white space.
358         * @param contentNonXmlCharPolicy the policy
359         * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
360         */
361        public void setContentNonXmlCharPolicy(
362                XmlViolationPolicy contentNonXmlCharPolicy) {
363            if (contentNonXmlCharPolicy == XmlViolationPolicy.ALLOW) {
364                throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
365            }
366            tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
367        }
368    
369        /**
370         * Sets the policy for non-XML white space.
371         * @param contentSpacePolicy the policy
372         * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
373         */
374        public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
375            if (contentSpacePolicy == XmlViolationPolicy.ALLOW) {
376                throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
377            }
378            tokenizer.setContentSpacePolicy(contentSpacePolicy);
379        }
380    
381    
382        /**
383         * Whether the HTML 4 mode reports boolean attributes in a way that repeats
384         * the name in the value.
385         * @param html4ModeCompatibleWithXhtml1Schemata
386         */
387        public void setHtml4ModeCompatibleWithXhtml1Schemata(
388                boolean html4ModeCompatibleWithXhtml1Schemata) {
389            tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
390        }
391    
392        /**
393         * @param mappingLangToXmlLang
394         * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
395         */
396        public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
397            tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
398        }
399    
400        /**
401         * @param namePolicy
402         * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
403         */
404        public void setNamePolicy(XmlViolationPolicy namePolicy) {
405            if (namePolicy == XmlViolationPolicy.ALLOW) {
406                throw new IllegalArgumentException("Only XML 1.0-compatible policies allowed. Cannot use ALLOW.");
407            }
408            tokenizer.setNamePolicy(namePolicy);
409        }
410    
411        /**
412         * This is a catch-all convenience method for setting name, content space,
413         * content non-XML char and comment policies in one go.
414         * 
415         * @param xmlPolicy
416         */
417        public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
418            setNamePolicy(xmlPolicy);
419            setContentSpacePolicy(xmlPolicy);
420            setContentNonXmlCharPolicy(xmlPolicy);
421            setCommentPolicy(xmlPolicy);
422            setBogusXmlnsPolicy(xmlPolicy);
423        }
424    
425        /**
426         * Sets the doctype expectation.
427         * 
428         * @param doctypeExpectation
429         *            the doctypeExpectation to set
430         * @see nu.validator.htmlparser.impl.TreeBuilder#setDoctypeExpectation(nu.validator.htmlparser.common.DoctypeExpectation)
431         */
432        public void setDoctypeExpectation(DoctypeExpectation doctypeExpectation) {
433            xomTreeBuilder.setDoctypeExpectation(doctypeExpectation);
434        }
435    
436        /**
437         * Sets the document mode handler.
438         * 
439         * @param documentModeHandler
440         * @see nu.validator.htmlparser.impl.TreeBuilder#setDocumentModeHandler(nu.validator.htmlparser.common.DocumentModeHandler)
441         */
442        public void setDocumentModeHandler(DocumentModeHandler documentModeHandler) {
443            xomTreeBuilder.setDocumentModeHandler(documentModeHandler);
444        }
445        
446        /**
447         * Sets the policy for forbidden <code>xmlns</code> attributes.
448         * @param bogusXmlnsPolicy the policy
449         * @see nu.validator.htmlparser.impl.Tokenizer#setBogusXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
450         */
451        public void setBogusXmlnsPolicy(XmlViolationPolicy bogusXmlnsPolicy) {
452            tokenizer.setBogusXmlnsPolicy(bogusXmlnsPolicy);
453        }
454    }