001    /*
002     * Copyright (c) 2007 Henri Sivonen
003     * Copyright (c) 2007 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.htmlparser.dom;
025    
026    import java.io.IOException;
027    import java.net.MalformedURLException;
028    import java.net.URL;
029    
030    import javax.xml.parsers.DocumentBuilder;
031    import javax.xml.parsers.DocumentBuilderFactory;
032    import javax.xml.parsers.ParserConfigurationException;
033    
034    import nu.validator.htmlparser.common.DoctypeExpectation;
035    import nu.validator.htmlparser.common.DocumentModeHandler;
036    import nu.validator.htmlparser.common.XmlViolationPolicy;
037    import nu.validator.htmlparser.impl.Tokenizer;
038    
039    import org.w3c.dom.DOMImplementation;
040    import org.w3c.dom.Document;
041    import org.w3c.dom.DocumentFragment;
042    import org.xml.sax.EntityResolver;
043    import org.xml.sax.ErrorHandler;
044    import org.xml.sax.InputSource;
045    import org.xml.sax.SAXException;
046    
047    /**
048     * This class implements an HTML5 parser that exposes data through the DOM 
049     * interface. 
050     * 
051     * <p>By default, when using the constructor without arguments, the 
052     * this parser treats XML 1.0-incompatible infosets as fatal errors. 
053     * This corresponds to 
054     * <code>FATAL</code> as the general XML violation policy. To make the parser 
055     * support non-conforming HTML fully per the HTML 5 spec while on the other 
056     * hand potentially violating the DOM API contract, set the general XML 
057     * violation policy to <code>ALLOW</code>. This does not work with a standard 
058     * DOM implementation. Handling all input without fatal errors and without 
059     * violating the DOM API contract is possible by setting 
060     * the general XML violation policy to <code>ALTER_INFOSET</code>. <em>This 
061     * makes the parser non-conforming</em> but is probably the most useful 
062     * setting for most applications.
063     * 
064     * <p>The doctype is not represented in the tree.
065     * 
066     * <p>The document mode is represented as user data <code>DocumentMode</code> 
067     * object with the key <code>nu.validator.document-mode</code> on the document 
068     * node. 
069     * 
070     * <p>The form pointer is also stored as user data with the key 
071     * <code>nu.validator.form-pointer</code>.
072     * 
073     * @version $Id: HtmlDocumentBuilder.java 153 2007-09-11 07:41:33Z hsivonen $
074     * @author hsivonen
075     */
076    public class HtmlDocumentBuilder extends DocumentBuilder {
077    
078        /**
079         * @return the JAXP DOM implementation
080         */
081        private static DOMImplementation jaxpDOMImplementation() {
082            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
083            factory.setNamespaceAware(true);
084            DocumentBuilder builder;
085            try {
086                builder = factory.newDocumentBuilder();
087            } catch (ParserConfigurationException e) {
088                throw new RuntimeException(e);
089            }
090            return builder.getDOMImplementation();
091        }
092    
093        private final Tokenizer tokenizer;
094    
095        private final DOMTreeBuilder domTreeBuilder;
096    
097        private final DOMImplementation implementation;
098    
099        private EntityResolver entityResolver;
100    
101        /**
102         * Instantiates the document builder with a specific DOM 
103         * implementation and XML violation policy.
104         * 
105         * @param implementation
106         *            the DOM implementation
107         *            @param xmlPolicy the policy
108         */
109        public HtmlDocumentBuilder(DOMImplementation implementation,
110                XmlViolationPolicy xmlPolicy) {
111            this.implementation = implementation;
112            this.domTreeBuilder = new DOMTreeBuilder(implementation);
113            this.tokenizer = new Tokenizer(domTreeBuilder);
114            this.tokenizer.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
115            setXmlPolicy(xmlPolicy);
116        }
117    
118        /**
119         * Instantiates the document builder with a specific DOM implementation 
120         * and fatal XML violation policy.
121         * 
122         * @param implementation
123         *            the DOM implementation
124         */
125        public HtmlDocumentBuilder(DOMImplementation implementation) {
126            this(implementation, XmlViolationPolicy.FATAL);
127        }
128    
129        /**
130         * Instantiates the document builder with the JAXP DOM implementation 
131         * and fatal XML violation policy.
132         */
133        public HtmlDocumentBuilder() {
134            this(XmlViolationPolicy.FATAL);
135        }
136    
137        /**
138         * Instantiates the document builder with the JAXP DOM implementation 
139         * and a specific XML violation policy.
140         *            @param xmlPolicy the policy
141         */
142        public HtmlDocumentBuilder(XmlViolationPolicy xmlPolicy) {
143            this(jaxpDOMImplementation(), xmlPolicy);
144        }
145    
146        /**
147         * Returns the DOM implementation
148         * @return the DOM implementation
149         * @see javax.xml.parsers.DocumentBuilder#getDOMImplementation()
150         */
151        @Override
152        public DOMImplementation getDOMImplementation() {
153            return implementation;
154        }
155    
156        /**
157         * Returns <code>true</code>.
158         * @return <code>true</code>
159         * @see javax.xml.parsers.DocumentBuilder#isNamespaceAware()
160         */
161        @Override
162        public boolean isNamespaceAware() {
163            return true;
164        }
165    
166        /**
167         * Returns <code>false</code>
168         * @return <code>false</code>
169         * @see javax.xml.parsers.DocumentBuilder#isValidating()
170         */
171        @Override
172        public boolean isValidating() {
173            return false;
174        }
175    
176        /**
177         * For API compatibility.
178         * @see javax.xml.parsers.DocumentBuilder#newDocument()
179         */
180        @Override
181        public Document newDocument() {
182            return implementation.createDocument(null, null, null);
183        }
184    
185        /**
186         * Parses a document from a SAX <code>InputSource</code>.
187         * @param is the source
188         * @return the doc
189         * @see javax.xml.parsers.DocumentBuilder#parse(org.xml.sax.InputSource)
190         */
191        @Override
192        public Document parse(InputSource is) throws SAXException, IOException {
193            domTreeBuilder.setFragmentContext(null);
194            tokenize(is);
195            return domTreeBuilder.getDocument();
196        }
197    
198        /**
199         * Parses a document fragment from a SAX <code>InputSource</code>.
200         * @param is the source
201         * @param context the context element name
202         * @return the doc
203         * @throws IOException
204         * @throws SAXException
205         */
206        public DocumentFragment parseFragment(InputSource is, String context)
207                throws IOException, SAXException {
208            domTreeBuilder.setFragmentContext(context);
209            tokenize(is);
210            return domTreeBuilder.getDocumentFragment();
211        }
212    
213        /**
214         * @param is
215         * @throws SAXException
216         * @throws IOException
217         * @throws MalformedURLException
218         */
219        private void tokenize(InputSource is) throws SAXException, IOException,
220                MalformedURLException {
221            if (is == null) {
222                throw new IllegalArgumentException("Null input.");
223            }
224            if (is.getByteStream() == null && is.getCharacterStream() == null) {
225                String systemId = is.getSystemId();
226                if (systemId == null) {
227                    throw new IllegalArgumentException(
228                            "No byte stream, no character stream nor URI.");
229                }
230                if (entityResolver != null) {
231                    is = entityResolver.resolveEntity(is.getPublicId(), systemId);
232                }
233                if (is.getByteStream() == null || is.getCharacterStream() == null) {
234                    is = new InputSource();
235                    is.setSystemId(systemId);
236                    is.setByteStream(new URL(systemId).openStream());
237                }
238            }
239            tokenizer.tokenize(is);
240        }
241    
242        /**
243         * Sets the entity resolver for URI-only inputs.
244         * @param resolver the resolver
245         * @see javax.xml.parsers.DocumentBuilder#setEntityResolver(org.xml.sax.EntityResolver)
246         */
247        @Override
248        public void setEntityResolver(EntityResolver resolver) {
249            this.entityResolver = resolver;
250        }
251    
252        /**
253         * @see javax.xml.parsers.DocumentBuilder#setErrorHandler(org.xml.sax.ErrorHandler)
254         */
255        @Override
256        public void setErrorHandler(ErrorHandler errorHandler) {
257            domTreeBuilder.setErrorHandler(errorHandler);
258            tokenizer.setErrorHandler(errorHandler);
259        }
260    
261        /**
262         * Sets whether comment nodes appear in the tree.
263         * @param ignoreComments <code>true</code> to ignore comments
264         * @see nu.validator.htmlparser.impl.TreeBuilder#setIgnoringComments(boolean)
265         */
266        public void setIgnoringComments(boolean ignoreComments) {
267            domTreeBuilder.setIgnoringComments(ignoreComments);
268        }
269    
270        /**
271         * Sets whether the parser considers scripting to be enabled for noscript treatment.
272         * @param scriptingEnabled <code>true</code> to enable
273         * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
274         */
275        public void setScriptingEnabled(boolean scriptingEnabled) {
276            domTreeBuilder.setScriptingEnabled(scriptingEnabled);
277        }
278    
279        /**
280         * Toggles the checking of the NFC normalization of source.
281         * @param enable <code>true</code> to check normalization
282         * @see nu.validator.htmlparser.impl.Tokenizer#setCheckingNormalization(boolean)
283         */
284        public void setCheckingNormalization(boolean enable) {
285            tokenizer.setCheckingNormalization(enable);
286        }
287    
288        /**
289         * Sets the policy for consecutive hyphens in comments.
290         * @param commentPolicy the policy
291         * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
292         */
293        public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
294            tokenizer.setCommentPolicy(commentPolicy);
295        }
296    
297        /**
298         * Sets the policy for non-XML characters except white space.
299         * @param contentNonXmlCharPolicy the policy
300         * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
301         */
302        public void setContentNonXmlCharPolicy(
303                XmlViolationPolicy contentNonXmlCharPolicy) {
304            tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
305        }
306    
307        /**
308         * Sets the policy for non-XML white space.
309         * @param contentSpacePolicy the policy
310         * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
311         */
312        public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
313            tokenizer.setContentSpacePolicy(contentSpacePolicy);
314        }
315    
316    
317        /**
318         * Whether the HTML 4 mode reports boolean attributes in a way that repeats
319         * the name in the value.
320         * @param html4ModeCompatibleWithXhtml1Schemata
321         */
322        public void setHtml4ModeCompatibleWithXhtml1Schemata(
323                boolean html4ModeCompatibleWithXhtml1Schemata) {
324            tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
325        }
326    
327        /**
328         * @param mappingLangToXmlLang
329         * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
330         */
331        public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
332            tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
333        }
334    
335        /**
336         * @param namePolicy
337         * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
338         */
339        public void setNamePolicy(XmlViolationPolicy namePolicy) {
340            tokenizer.setNamePolicy(namePolicy);
341        }
342    
343        /**
344         * This is a catch-all convenience method for setting name, content space,
345         * content non-XML char and comment policies in one go.
346         * 
347         * @param xmlPolicy
348         */
349        public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
350            setNamePolicy(xmlPolicy);
351            setContentSpacePolicy(xmlPolicy);
352            setContentNonXmlCharPolicy(xmlPolicy);
353            setCommentPolicy(xmlPolicy);
354            setBogusXmlnsPolicy(xmlPolicy);
355        }
356    
357        /**
358         * Sets the doctype expectation.
359         * 
360         * @param doctypeExpectation
361         *            the doctypeExpectation to set
362         * @see nu.validator.htmlparser.impl.TreeBuilder#setDoctypeExpectation(nu.validator.htmlparser.common.DoctypeExpectation)
363         */
364        public void setDoctypeExpectation(DoctypeExpectation doctypeExpectation) {
365            domTreeBuilder.setDoctypeExpectation(doctypeExpectation);
366        }
367    
368        /**
369         * Sets the document mode handler.
370         * 
371         * @param documentModeHandler
372         * @see nu.validator.htmlparser.impl.TreeBuilder#setDocumentModeHandler(nu.validator.htmlparser.common.DocumentModeHandler)
373         */
374        public void setDocumentModeHandler(DocumentModeHandler documentModeHandler) {
375            domTreeBuilder.setDocumentModeHandler(documentModeHandler);
376        }
377    
378        /**
379         * Sets the policy for forbidden <code>xmlns</code> attributes.
380         * @param bogusXmlnsPolicy the policy
381         * @see nu.validator.htmlparser.impl.Tokenizer#setBogusXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
382         */
383        public void setBogusXmlnsPolicy(XmlViolationPolicy bogusXmlnsPolicy) {
384            tokenizer.setBogusXmlnsPolicy(bogusXmlnsPolicy);
385        }
386        
387    }