001    /*
002     * Copyright (c) 2007 Henri Sivonen
003     * Copyright (c) 2007-2010 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.htmlparser.sax;
025    
026    import java.io.IOException;
027    import java.net.MalformedURLException;
028    import java.net.URL;
029    import java.util.LinkedList;
030    import java.util.List;
031    import java.util.HashMap;
032    
033    import nu.validator.htmlparser.common.CharacterHandler;
034    import nu.validator.htmlparser.common.DoctypeExpectation;
035    import nu.validator.htmlparser.common.DocumentModeHandler;
036    import nu.validator.htmlparser.common.Heuristics;
037    import nu.validator.htmlparser.common.TokenHandler;
038    import nu.validator.htmlparser.common.TransitionHandler;
039    import nu.validator.htmlparser.common.XmlViolationPolicy;
040    import nu.validator.htmlparser.impl.ErrorReportingTokenizer;
041    import nu.validator.htmlparser.impl.Tokenizer;
042    import nu.validator.htmlparser.impl.TreeBuilder;
043    import nu.validator.htmlparser.io.Driver;
044    import nu.validator.saxtree.Document;
045    import nu.validator.saxtree.DocumentFragment;
046    import nu.validator.saxtree.TreeParser;
047    
048    import org.xml.sax.ContentHandler;
049    import org.xml.sax.DTDHandler;
050    import org.xml.sax.EntityResolver;
051    import org.xml.sax.ErrorHandler;
052    import org.xml.sax.InputSource;
053    import org.xml.sax.Locator;
054    import org.xml.sax.SAXException;
055    import org.xml.sax.SAXNotRecognizedException;
056    import org.xml.sax.SAXNotSupportedException;
057    import org.xml.sax.XMLReader;
058    import org.xml.sax.ext.LexicalHandler;
059    import org.xml.sax.helpers.DefaultHandler;
060    
061    /**
062     * This class implements an HTML5 parser that exposes data through the SAX2 
063     * interface. 
064     * 
065     * <p>By default, when using the constructor without arguments, this parser
066     * treats XML 1.0 infoset violations as fatal. This corresponds to
067     * <code>FATAL</code> as the general XML violation policy. To make the parser
068     * support non-conforming HTML fully per the HTML 5 spec while on the other
069     * hand potentially violating the SAX2 API contract, set the general XML
070     * violation policy to <code>ALLOW</code>. It is possible to coerce XML
071     * 1.0-incompatible infosets into XML 1.0-compatible infosets by setting the
072     * general XML violation policy to <code>ALTER_INFOSET</code>.
073     * 
074     * <p>By default, this parser doesn't do true streaming but buffers everything 
075     * first. The parser can be made truly streaming by calling 
076     * <code>setStreamabilityViolationPolicy(XmlViolationPolicy.FATAL)</code>. This 
077     * has the consequence that errors that require non-streamable recovery are 
078     * treated as fatal.
079     * 
080     * <p>By default, the parser reports the doctype through
081     * <code>LexicalHandler</code>. In order to make the parse events emulate the
082     * parse events for a DTDless XML document, doctype reporting through
083     * <code>LexicalHandler</code> can be turned off by calling
084     * <code>setReportingDoctype(false)</code>.
085     * 
086     * @version $Id$
087     * @author hsivonen
088     */
089    public class HtmlParser implements XMLReader {
090    
091        private Driver driver = null;
092    
093        private TreeBuilder<?> treeBuilder = null;
094    
095        private SAXStreamer saxStreamer = null; // work around javac bug
096    
097        private SAXTreeBuilder saxTreeBuilder = null; // work around javac bug
098    
099        private ContentHandler contentHandler = null;
100    
101        private LexicalHandler lexicalHandler = null;
102    
103        private DTDHandler dtdHandler = null;
104    
105        private EntityResolver entityResolver = null;
106    
107        private ErrorHandler errorHandler = null;
108    
109        private DocumentModeHandler documentModeHandler = null;
110    
111        private DoctypeExpectation doctypeExpectation = DoctypeExpectation.HTML;
112    
113        private boolean checkingNormalization = false;
114    
115        private boolean scriptingEnabled = false;
116    
117        private final List<CharacterHandler> characterHandlers = new LinkedList<CharacterHandler>();
118        
119        private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.FATAL;
120    
121        private XmlViolationPolicy contentNonXmlCharPolicy = XmlViolationPolicy.FATAL;
122    
123        private XmlViolationPolicy commentPolicy = XmlViolationPolicy.FATAL;
124    
125        private XmlViolationPolicy namePolicy = XmlViolationPolicy.FATAL;
126    
127        private XmlViolationPolicy streamabilityViolationPolicy = XmlViolationPolicy.ALLOW;
128        
129        private boolean html4ModeCompatibleWithXhtml1Schemata = false;
130    
131        private boolean mappingLangToXmlLang = false;
132    
133        private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.FATAL;
134        
135        private boolean reportingDoctype = true;
136    
137        private ErrorHandler treeBuilderErrorHandler = null;
138    
139        private Heuristics heuristics = Heuristics.NONE;
140    
141        private HashMap<String, String> errorProfileMap = null;
142    
143        private TransitionHandler transitionHandler = null;
144        
145        /**
146         * Instantiates the parser with a fatal XML violation policy.
147         *
148         */
149        public HtmlParser() {
150            this(XmlViolationPolicy.FATAL);
151        }
152        
153        /**
154         * Instantiates the parser with a specific XML violation policy.
155         * @param xmlPolicy the policy
156         */
157        public HtmlParser(XmlViolationPolicy xmlPolicy) {
158            setXmlPolicy(xmlPolicy);
159        }    
160    
161        private Tokenizer newTokenizer(TokenHandler handler, boolean newAttributesEachTime) {
162            if (errorHandler == null && transitionHandler == null &&
163                contentNonXmlCharPolicy == XmlViolationPolicy.ALLOW) {
164                return new Tokenizer(handler, newAttributesEachTime);
165            }
166            ErrorReportingTokenizer tokenizer = 
167                new ErrorReportingTokenizer(handler, newAttributesEachTime);
168            tokenizer.setErrorProfile(errorProfileMap);
169            return tokenizer;
170       }
171        
172        /**
173         * This class wraps different tree builders depending on configuration. This 
174         * method does the work of hiding this from the user of the class.
175         */
176        private void lazyInit() {
177            if (driver == null) {
178                if (streamabilityViolationPolicy == XmlViolationPolicy.ALLOW) {
179                    this.saxTreeBuilder = new SAXTreeBuilder();
180                    this.treeBuilder = this.saxTreeBuilder;
181                    this.saxStreamer = null;
182                    this.driver = new Driver(newTokenizer(treeBuilder, true));
183                } else {
184                    this.saxStreamer = new SAXStreamer();
185                    this.treeBuilder = this.saxStreamer;
186                    this.saxTreeBuilder = null;
187                    this.driver = new Driver(newTokenizer(treeBuilder, false));
188                }
189                this.driver.setErrorHandler(errorHandler);
190                this.driver.setTransitionHandler(transitionHandler);
191                this.treeBuilder.setErrorHandler(treeBuilderErrorHandler);
192                this.driver.setCheckingNormalization(checkingNormalization);
193                this.driver.setCommentPolicy(commentPolicy);
194                this.driver.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
195                this.driver.setContentSpacePolicy(contentSpacePolicy);
196                this.driver.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
197                this.driver.setMappingLangToXmlLang(mappingLangToXmlLang);
198                this.driver.setXmlnsPolicy(xmlnsPolicy);
199                this.driver.setHeuristics(heuristics);
200                for (CharacterHandler characterHandler : characterHandlers) {
201                    this.driver.addCharacterHandler(characterHandler);
202                }
203                this.treeBuilder.setDoctypeExpectation(doctypeExpectation);
204                this.treeBuilder.setDocumentModeHandler(documentModeHandler);
205                this.treeBuilder.setIgnoringComments(lexicalHandler == null);
206                this.treeBuilder.setScriptingEnabled(scriptingEnabled);
207                this.treeBuilder.setReportingDoctype(reportingDoctype);
208                this.treeBuilder.setNamePolicy(namePolicy);
209                if (saxStreamer != null) {
210                    saxStreamer.setContentHandler(contentHandler == null ? new DefaultHandler()
211                            : contentHandler);
212                    saxStreamer.setLexicalHandler(lexicalHandler);
213                    driver.setAllowRewinding(false);
214                }
215            }
216        }
217    
218        /**
219         * @see org.xml.sax.XMLReader#getContentHandler()
220         */
221        public ContentHandler getContentHandler() {
222            return contentHandler;
223        }
224    
225        /**
226         * @see org.xml.sax.XMLReader#getDTDHandler()
227         */
228        public DTDHandler getDTDHandler() {
229            return dtdHandler;
230        }
231    
232        /**
233         * @see org.xml.sax.XMLReader#getEntityResolver()
234         */
235        public EntityResolver getEntityResolver() {
236            return entityResolver;
237        }
238    
239        /**
240         * @see org.xml.sax.XMLReader#getErrorHandler()
241         */
242        public ErrorHandler getErrorHandler() {
243            return errorHandler;
244        }
245    
246        /**
247         * Exposes the configuration of the emulated XML parser as well as
248         * boolean-valued configuration without using non-<code>XMLReader</code>
249         * getters directly.
250         * 
251         * <dl>
252         * <dt><code>http://xml.org/sax/features/external-general-entities</code></dt>
253         * <dd><code>false</code></dd>
254         * <dt><code>http://xml.org/sax/features/external-parameter-entities</code></dt>
255         * <dd><code>false</code></dd>
256         * <dt><code>http://xml.org/sax/features/is-standalone</code></dt>
257         * <dd><code>true</code></dd>
258         * <dt><code>http://xml.org/sax/features/lexical-handler/parameter-entities</code></dt>
259         * <dd><code>false</code></dd>
260         * <dt><code>http://xml.org/sax/features/namespaces</code></dt>
261         * <dd><code>true</code></dd>
262         * <dt><code>http://xml.org/sax/features/namespace-prefixes</code></dt>
263         * <dd><code>false</code></dd>
264         * <dt><code>http://xml.org/sax/features/resolve-dtd-uris</code></dt>
265         * <dd><code>true</code></dd>
266         * <dt><code>http://xml.org/sax/features/string-interning</code></dt>
267         * <dd><code>true</code></dd>
268         * <dt><code>http://xml.org/sax/features/unicode-normalization-checking</code></dt>
269         * <dd><code>isCheckingNormalization</code></dd>
270         * <dt><code>http://xml.org/sax/features/use-attributes2</code></dt>
271         * <dd><code>false</code></dd>
272         * <dt><code>http://xml.org/sax/features/use-locator2</code></dt>
273         * <dd><code>false</code></dd>
274         * <dt><code>http://xml.org/sax/features/use-entity-resolver2</code></dt>
275         * <dd><code>false</code></dd>
276         * <dt><code>http://xml.org/sax/features/validation</code></dt>
277         * <dd><code>false</code></dd>
278         * <dt><code>http://xml.org/sax/features/xmlns-uris</code></dt>
279         * <dd><code>false</code></dd>
280         * <dt><code>http://xml.org/sax/features/xml-1.1</code></dt>
281         * <dd><code>false</code></dd>
282         * <dt><code>http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata</code></dt>
283         * <dd><code>isHtml4ModeCompatibleWithXhtml1Schemata</code></dd>
284         * <dt><code>http://validator.nu/features/mapping-lang-to-xml-lang</code></dt>
285         * <dd><code>isMappingLangToXmlLang</code></dd>
286         * <dt><code>http://validator.nu/features/scripting-enabled</code></dt>
287         * <dd><code>isScriptingEnabled</code></dd>
288         * </dl>
289         * 
290         * @param name
291         *            feature URI string
292         * @return a value per the list above
293         * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
294         */
295        public boolean getFeature(String name) throws SAXNotRecognizedException,
296                SAXNotSupportedException {
297            if ("http://xml.org/sax/features/external-general-entities".equals(name)) {
298                return false;
299            } else if ("http://xml.org/sax/features/external-parameter-entities".equals(name)) {
300                return false;
301            } else if ("http://xml.org/sax/features/is-standalone".equals(name)) {
302                return true;
303            } else if ("http://xml.org/sax/features/lexical-handler/parameter-entities".equals(name)) {
304                return false;
305            } else if ("http://xml.org/sax/features/namespaces".equals(name)) {
306                return true;
307            } else if ("http://xml.org/sax/features/namespace-prefixes".equals(name)) {
308                return false;
309            } else if ("http://xml.org/sax/features/resolve-dtd-uris".equals(name)) {
310                return true; // default value--applicable scenario never happens
311            } else if ("http://xml.org/sax/features/string-interning".equals(name)) {
312                return true;
313            } else if ("http://xml.org/sax/features/unicode-normalization-checking".equals(name)) {
314                return isCheckingNormalization(); // the checks aren't really per
315                // XML 1.1
316            } else if ("http://xml.org/sax/features/use-attributes2".equals(name)) {
317                return false;
318            } else if ("http://xml.org/sax/features/use-locator2".equals(name)) {
319                return false;
320            } else if ("http://xml.org/sax/features/use-entity-resolver2".equals(name)) {
321                return false;
322            } else if ("http://xml.org/sax/features/validation".equals(name)) {
323                return false;
324            } else if ("http://xml.org/sax/features/xmlns-uris".equals(name)) {
325                return false;
326            } else if ("http://xml.org/sax/features/xml-1.1".equals(name)) {
327                return false;
328            } else if ("http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata".equals(name)) {
329                return isHtml4ModeCompatibleWithXhtml1Schemata();
330            } else if ("http://validator.nu/features/mapping-lang-to-xml-lang".equals(name)) {
331                return isMappingLangToXmlLang();
332            } else if ("http://validator.nu/features/scripting-enabled".equals(name)) {
333                return isScriptingEnabled();
334            } else {
335                throw new SAXNotRecognizedException();
336            }
337        }
338    
339        /**
340         * Allows <code>XMLReader</code>-level access to non-boolean valued
341         * getters.
342         * 
343         * <p>
344         * The properties are mapped as follows:
345         * 
346         * <dl>
347         * <dt><code>http://xml.org/sax/properties/document-xml-version</code></dt>
348         * <dd><code>"1.0"</code></dd>
349         * <dt><code>http://xml.org/sax/properties/lexical-handler</code></dt>
350         * <dd><code>getLexicalHandler</code></dd>
351         * <dt><code>http://validator.nu/properties/content-space-policy</code></dt>
352         * <dd><code>getContentSpacePolicy</code></dd>
353         * <dt><code>http://validator.nu/properties/content-non-xml-char-policy</code></dt>
354         * <dd><code>getContentNonXmlCharPolicy</code></dd>
355         * <dt><code>http://validator.nu/properties/comment-policy</code></dt>
356         * <dd><code>getCommentPolicy</code></dd>
357         * <dt><code>http://validator.nu/properties/xmlns-policy</code></dt>
358         * <dd><code>getXmlnsPolicy</code></dd>
359         * <dt><code>http://validator.nu/properties/name-policy</code></dt>
360         * <dd><code>getNamePolicy</code></dd>
361         * <dt><code>http://validator.nu/properties/streamability-violation-policy</code></dt>
362         * <dd><code>getStreamabilityViolationPolicy</code></dd>
363         * <dt><code>http://validator.nu/properties/document-mode-handler</code></dt>
364         * <dd><code>getDocumentModeHandler</code></dd>
365         * <dt><code>http://validator.nu/properties/doctype-expectation</code></dt>
366         * <dd><code>getDoctypeExpectation</code></dd>
367         * <dt><code>http://validator.nu/properties/heuristics</code></dt><dd><code>getHeuristics</code></dd>
368         * </dl>
369         * 
370         * @param name
371         *            property URI string
372         * @return a value per the list above
373         * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
374         */
375        public Object getProperty(String name) throws SAXNotRecognizedException,
376                SAXNotSupportedException {
377            if ("http://xml.org/sax/properties/declaration-handler".equals(name)) {
378                throw new SAXNotSupportedException(
379                        "This parser does not suppert DeclHandler.");
380            } else if ("http://xml.org/sax/properties/document-xml-version".equals(name)) {
381                return "1.0"; // Emulating an XML 1.1 parser is not supported.
382            } else if ("http://xml.org/sax/properties/dom-node".equals(name)) {
383                throw new SAXNotSupportedException(
384                        "This parser does not walk the DOM.");
385            } else if ("http://xml.org/sax/properties/lexical-handler".equals(name)) {
386                return getLexicalHandler();
387            } else if ("http://xml.org/sax/properties/xml-string".equals(name)) {
388                throw new SAXNotSupportedException(
389                        "This parser does not expose the source as a string.");
390            } else if ("http://validator.nu/properties/content-space-policy".equals(name)) {
391                return getContentSpacePolicy();
392            } else if ("http://validator.nu/properties/content-non-xml-char-policy".equals(name)) {
393                return getContentNonXmlCharPolicy();
394            } else if ("http://validator.nu/properties/comment-policy".equals(name)) {
395                return getCommentPolicy();
396            } else if ("http://validator.nu/properties/xmlns-policy".equals(name)) {
397                return getXmlnsPolicy();
398            } else if ("http://validator.nu/properties/name-policy".equals(name)) {
399                return getNamePolicy();
400            } else if ("http://validator.nu/properties/streamability-violation-policy".equals(name)) {
401                return getStreamabilityViolationPolicy();
402            } else if ("http://validator.nu/properties/document-mode-handler".equals(name)) {
403                return getDocumentModeHandler();
404            } else if ("http://validator.nu/properties/doctype-expectation".equals(name)) {
405                return getDoctypeExpectation();
406            } else if ("http://validator.nu/properties/xml-policy".equals(name)) {
407                throw new SAXNotSupportedException(
408                        "Cannot get a convenience setter.");
409            } else if ("http://validator.nu/properties/heuristics".equals(name)) {
410                return getHeuristics();
411            } else {
412                throw new SAXNotRecognizedException();
413            }
414        }
415    
416        /**
417         * @see org.xml.sax.XMLReader#parse(org.xml.sax.InputSource)
418         */
419        public void parse(InputSource input) throws IOException, SAXException {
420            lazyInit();
421            try {
422                treeBuilder.setFragmentContext(null);
423                tokenize(input);
424            } finally {
425                if (saxTreeBuilder != null) {
426                    Document document = saxTreeBuilder.getDocument();
427                    if (document != null) {
428                        new TreeParser(contentHandler, lexicalHandler).parse(document);
429                    }
430                }
431            }
432        }
433    
434        /**
435         * Parses a fragment.
436         * 
437         * @param input the input to parse
438         * @param context the name of the context element
439         * @throws IOException
440         * @throws SAXException
441         */
442        public void parseFragment(InputSource input, String context)
443                throws IOException, SAXException {
444            lazyInit();
445            try {
446                treeBuilder.setFragmentContext(context.intern());
447                tokenize(input);
448            } finally {
449                if (saxTreeBuilder != null) {
450                    DocumentFragment fragment = saxTreeBuilder.getDocumentFragment();
451                    new TreeParser(contentHandler, lexicalHandler).parse(fragment);
452                }
453            }
454        }
455        
456        /**
457         * @param is
458         * @throws SAXException
459         * @throws IOException
460         * @throws MalformedURLException
461         */
462        private void tokenize(InputSource is) throws SAXException, IOException, MalformedURLException {
463            if (is == null) {
464                throw new IllegalArgumentException("Null input.");            
465            }
466            if (is.getByteStream() == null && is.getCharacterStream() == null) {
467                String systemId = is.getSystemId();
468                if (systemId == null) {
469                    throw new IllegalArgumentException("No byte stream, no character stream nor URI.");
470                }
471                if (entityResolver != null) {
472                    is = entityResolver.resolveEntity(is.getPublicId(), systemId);
473                }
474                if (is.getByteStream() == null || is.getCharacterStream() == null) {
475                    is = new InputSource();
476                    is.setSystemId(systemId);
477                    is.setByteStream(new URL(systemId).openStream());
478                }
479            }
480            driver.tokenize(is);
481        }
482    
483        /**
484         * @see org.xml.sax.XMLReader#parse(java.lang.String)
485         */
486        public void parse(String systemId) throws IOException, SAXException {
487            parse(new InputSource(systemId));
488        }
489    
490        /**
491         * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler)
492         */
493        public void setContentHandler(ContentHandler handler) {
494            contentHandler = handler;
495            if (saxStreamer != null) {
496                saxStreamer.setContentHandler(contentHandler == null ? new DefaultHandler()
497                        : contentHandler);
498            }
499        }
500    
501        /**
502         * Sets the lexical handler.
503         * @param handler the handler.
504         */
505        public void setLexicalHandler(LexicalHandler handler) {
506            lexicalHandler = handler;
507            if (treeBuilder != null) {
508                treeBuilder.setIgnoringComments(handler == null);
509                if (saxStreamer != null) {
510                    saxStreamer.setLexicalHandler(handler);
511                }
512            }
513        }
514    
515        /**
516         * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
517         */
518        public void setDTDHandler(DTDHandler handler) {
519            dtdHandler = handler;
520        }
521    
522        /**
523         * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
524         */
525        public void setEntityResolver(EntityResolver resolver) {
526            entityResolver = resolver;
527        }
528    
529        /**
530         * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
531         */
532        public void setErrorHandler(ErrorHandler handler) {
533            errorHandler = handler;
534            treeBuilderErrorHandler = handler;
535            driver = null;
536        }
537    
538        public void setTransitionHandler(TransitionHandler handler) {
539            transitionHandler = handler;
540            driver = null;
541        }
542        
543        /**
544         * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
545         * @deprecated For Validator.nu internal use
546         */
547        public void setTreeBuilderErrorHandlerOverride(ErrorHandler handler) {
548            treeBuilderErrorHandler = handler;
549            if (driver != null) {
550                treeBuilder.setErrorHandler(handler);
551            }
552        }
553        
554        /**
555         * Sets a boolean feature without having to use non-<code>XMLReader</code>
556         * setters directly.
557         * 
558         * <p>
559         * The supported features are:
560         * 
561         * <dl>
562         * <dt><code>http://xml.org/sax/features/unicode-normalization-checking</code></dt>
563         * <dd><code>setCheckingNormalization</code></dd>
564         * <dt><code>http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata</code></dt>
565         * <dd><code>setHtml4ModeCompatibleWithXhtml1Schemata</code></dd>
566         * <dt><code>http://validator.nu/features/mapping-lang-to-xml-lang</code></dt>
567         * <dd><code>setMappingLangToXmlLang</code></dd>
568         * <dt><code>http://validator.nu/features/scripting-enabled</code></dt>
569         * <dd><code>setScriptingEnabled</code></dd>
570         * </dl>
571         * 
572         * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
573         */
574        public void setFeature(String name, boolean value)
575                throws SAXNotRecognizedException, SAXNotSupportedException {
576            if ("http://xml.org/sax/features/external-general-entities".equals(name)) {
577                if (value) {
578                    throw new SAXNotSupportedException("Cannot set " + name + ".");
579                }
580            } else if ("http://xml.org/sax/features/external-parameter-entities".equals(name)) {
581                if (value) {
582                    throw new SAXNotSupportedException("Cannot set " + name + ".");
583                }
584            } else if ("http://xml.org/sax/features/is-standalone".equals(name)) {
585                if (!value) {
586                    throw new SAXNotSupportedException("Cannot set " + name + ".");
587                }
588            } else if ("http://xml.org/sax/features/lexical-handler/parameter-entities".equals(name)) {
589                if (value) {
590                    throw new SAXNotSupportedException("Cannot set " + name + ".");
591                }
592            } else if ("http://xml.org/sax/features/namespaces".equals(name)) {
593                if (!value) {
594                    throw new SAXNotSupportedException("Cannot set " + name + ".");
595                }
596            } else if ("http://xml.org/sax/features/namespace-prefixes".equals(name)) {
597                if (value) {
598                    throw new SAXNotSupportedException("Cannot set " + name + ".");
599                }
600            } else if ("http://xml.org/sax/features/resolve-dtd-uris".equals(name)) {
601                if (!value) {
602                    throw new SAXNotSupportedException("Cannot set " + name + ".");
603                }
604            } else if ("http://xml.org/sax/features/string-interning".equals(name)) {
605                if (!value) {
606                    throw new SAXNotSupportedException("Cannot set " + name + ".");
607                }
608            } else if ("http://xml.org/sax/features/unicode-normalization-checking".equals(name)) {
609                setCheckingNormalization(value);
610            } else if ("http://xml.org/sax/features/use-attributes2".equals(name)) {
611                if (value) {
612                    throw new SAXNotSupportedException("Cannot set " + name + ".");
613                }
614            } else if ("http://xml.org/sax/features/use-locator2".equals(name)) {
615                if (value) {
616                    throw new SAXNotSupportedException("Cannot set " + name + ".");
617                }
618            } else if ("http://xml.org/sax/features/use-entity-resolver2".equals(name)) {
619                if (value) {
620                    throw new SAXNotSupportedException("Cannot set " + name + ".");
621                }
622            } else if ("http://xml.org/sax/features/validation".equals(name)) {
623                if (value) {
624                    throw new SAXNotSupportedException("Cannot set " + name + ".");
625                }
626            } else if ("http://xml.org/sax/features/xmlns-uris".equals(name)) {
627                if (value) {
628                    throw new SAXNotSupportedException("Cannot set " + name + ".");
629                }
630            } else if ("http://xml.org/sax/features/xml-1.1".equals(name)) {
631                if (value) {
632                    throw new SAXNotSupportedException("Cannot set " + name + ".");
633                }
634            } else if ("http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata".equals(name)) {
635                setHtml4ModeCompatibleWithXhtml1Schemata(value);
636            } else if ("http://validator.nu/features/mapping-lang-to-xml-lang".equals(name)) {
637                setMappingLangToXmlLang(value);
638            } else if ("http://validator.nu/features/scripting-enabled".equals(name)) {
639                setScriptingEnabled(value);
640            } else {
641                throw new SAXNotRecognizedException();
642            }
643        }
644    
645        /**
646         * Sets a non-boolean property without having to use non-<code>XMLReader</code>
647         * setters directly.
648         * 
649         * <dl>
650         * <dt><code>http://xml.org/sax/properties/lexical-handler</code></dt>
651         * <dd><code>setLexicalHandler</code></dd>
652         * <dt><code>http://validator.nu/properties/content-space-policy</code></dt>
653         * <dd><code>setContentSpacePolicy</code></dd>
654         * <dt><code>http://validator.nu/properties/content-non-xml-char-policy</code></dt>
655         * <dd><code>setContentNonXmlCharPolicy</code></dd>
656         * <dt><code>http://validator.nu/properties/comment-policy</code></dt>
657         * <dd><code>setCommentPolicy</code></dd>
658         * <dt><code>http://validator.nu/properties/xmlns-policy</code></dt>
659         * <dd><code>setXmlnsPolicy</code></dd>
660         * <dt><code>http://validator.nu/properties/name-policy</code></dt>
661         * <dd><code>setNamePolicy</code></dd>
662         * <dt><code>http://validator.nu/properties/streamability-violation-policy</code></dt>
663         * <dd><code>setStreamabilityViolationPolicy</code></dd>
664         * <dt><code>http://validator.nu/properties/document-mode-handler</code></dt>
665         * <dd><code>setDocumentModeHandler</code></dd>
666         * <dt><code>http://validator.nu/properties/doctype-expectation</code></dt>
667         * <dd><code>setDoctypeExpectation</code></dd>
668         * <dt><code>http://validator.nu/properties/xml-policy</code></dt>
669         * <dd><code>setXmlPolicy</code></dd>
670         * </dl>
671         * 
672         * @see org.xml.sax.XMLReader#setProperty(java.lang.String,
673         *      java.lang.Object)
674         */
675        public void setProperty(String name, Object value)
676                throws SAXNotRecognizedException, SAXNotSupportedException {
677            if ("http://xml.org/sax/properties/declaration-handler".equals(name)) {
678                throw new SAXNotSupportedException(
679                        "This parser does not suppert DeclHandler.");
680            } else if ("http://xml.org/sax/properties/document-xml-version".equals(name)) {
681                throw new SAXNotSupportedException(
682                        "Can't set document-xml-version.");
683            } else if ("http://xml.org/sax/properties/dom-node".equals(name)) {
684                throw new SAXNotSupportedException("Can't set dom-node.");
685            } else if ("http://xml.org/sax/properties/lexical-handler".equals(name)) {
686                setLexicalHandler((LexicalHandler) value);
687            } else if ("http://xml.org/sax/properties/xml-string".equals(name)) {
688                throw new SAXNotSupportedException("Can't set xml-string.");
689            } else if ("http://validator.nu/properties/content-space-policy".equals(name)) {
690                setContentSpacePolicy((XmlViolationPolicy) value);
691            } else if ("http://validator.nu/properties/content-non-xml-char-policy".equals(name)) {
692                setContentNonXmlCharPolicy((XmlViolationPolicy) value);
693            } else if ("http://validator.nu/properties/comment-policy".equals(name)) {
694                setCommentPolicy((XmlViolationPolicy) value);
695            } else if ("http://validator.nu/properties/xmlns-policy".equals(name)) {
696                setXmlnsPolicy((XmlViolationPolicy) value);
697            } else if ("http://validator.nu/properties/name-policy".equals(name)) {
698                setNamePolicy((XmlViolationPolicy) value);
699            } else if ("http://validator.nu/properties/streamability-violation-policy".equals(name)) {
700                setStreamabilityViolationPolicy((XmlViolationPolicy) value);
701            } else if ("http://validator.nu/properties/document-mode-handler".equals(name)) {
702                setDocumentModeHandler((DocumentModeHandler) value);
703            } else if ("http://validator.nu/properties/doctype-expectation".equals(name)) {
704                setDoctypeExpectation((DoctypeExpectation) value);
705            } else if ("http://validator.nu/properties/xml-policy".equals(name)) {
706                setXmlPolicy((XmlViolationPolicy) value);
707            } else if ("http://validator.nu/properties/heuristics".equals(name)) {
708                setHeuristics((Heuristics) value);
709            } else {
710                throw new SAXNotRecognizedException();
711            }
712        }
713    
714        /**
715         * Indicates whether NFC normalization of source is being checked.
716         * @return <code>true</code> if NFC normalization of source is being checked.
717         * @see nu.validator.htmlparser.impl.Tokenizer#isCheckingNormalization()
718         */
719        public boolean isCheckingNormalization() {
720            return checkingNormalization;
721        }
722    
723        /**
724         * Toggles the checking of the NFC normalization of source.
725         * @param enable <code>true</code> to check normalization
726         * @see nu.validator.htmlparser.impl.Tokenizer#setCheckingNormalization(boolean)
727         */
728        public void setCheckingNormalization(boolean enable) {
729            this.checkingNormalization = enable;
730            if (driver != null) {
731                driver.setCheckingNormalization(checkingNormalization);
732            }
733        }
734    
735        /**
736         * Sets the policy for consecutive hyphens in comments.
737         * @param commentPolicy the policy
738         * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
739         */
740        public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
741            this.commentPolicy = commentPolicy;
742            if (driver != null) {
743                driver.setCommentPolicy(commentPolicy);
744            }
745        }
746    
747        /**
748         * Sets the policy for non-XML characters except white space.
749         * @param contentNonXmlCharPolicy the policy
750         * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
751         */
752        public void setContentNonXmlCharPolicy(
753                XmlViolationPolicy contentNonXmlCharPolicy) {
754            this.contentNonXmlCharPolicy = contentNonXmlCharPolicy;
755            driver = null;
756        }
757    
758        /**
759         * Sets the policy for non-XML white space.
760         * @param contentSpacePolicy the policy
761         * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
762         */
763        public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
764            this.contentSpacePolicy = contentSpacePolicy;
765            if (driver != null) {
766                driver.setContentSpacePolicy(contentSpacePolicy);
767            }
768        }
769    
770        /**
771         * Whether the parser considers scripting to be enabled for noscript treatment.
772         * 
773         * @return <code>true</code> if enabled
774         * @see nu.validator.htmlparser.impl.TreeBuilder#isScriptingEnabled()
775         */
776        public boolean isScriptingEnabled() {
777            return scriptingEnabled;
778        }
779    
780        /**
781         * Sets whether the parser considers scripting to be enabled for noscript treatment.
782         * @param scriptingEnabled <code>true</code> to enable
783         * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
784         */
785        public void setScriptingEnabled(boolean scriptingEnabled) {
786            this.scriptingEnabled = scriptingEnabled;
787            if (treeBuilder != null) {
788                treeBuilder.setScriptingEnabled(scriptingEnabled);
789            }
790        }
791    
792        /**
793         * Returns the doctype expectation.
794         * 
795         * @return the doctypeExpectation
796         */
797        public DoctypeExpectation getDoctypeExpectation() {
798            return doctypeExpectation;
799        }
800    
801        /**
802         * Sets the doctype expectation.
803         * 
804         * @param doctypeExpectation
805         *            the doctypeExpectation to set
806         * @see nu.validator.htmlparser.impl.TreeBuilder#setDoctypeExpectation(nu.validator.htmlparser.common.DoctypeExpectation)
807         */
808        public void setDoctypeExpectation(DoctypeExpectation doctypeExpectation) {
809            this.doctypeExpectation = doctypeExpectation;
810            if (treeBuilder != null) {
811                treeBuilder.setDoctypeExpectation(doctypeExpectation);
812            }
813        }
814    
815        /**
816         * Returns the document mode handler.
817         * 
818         * @return the documentModeHandler
819         */
820        public DocumentModeHandler getDocumentModeHandler() {
821            return documentModeHandler;
822        }
823    
824        /**
825         * Sets the document mode handler.
826         * 
827         * @param documentModeHandler
828         *            the documentModeHandler to set
829         * @see nu.validator.htmlparser.impl.TreeBuilder#setDocumentModeHandler(nu.validator.htmlparser.common.DocumentModeHandler)
830         */
831        public void setDocumentModeHandler(DocumentModeHandler documentModeHandler) {
832            this.documentModeHandler = documentModeHandler;
833        }
834    
835        /**
836         * Returns the streamabilityViolationPolicy.
837         * 
838         * @return the streamabilityViolationPolicy
839         */
840        public XmlViolationPolicy getStreamabilityViolationPolicy() {
841            return streamabilityViolationPolicy;
842        }
843    
844        /**
845         * Sets the streamabilityViolationPolicy.
846         * 
847         * @param streamabilityViolationPolicy
848         *            the streamabilityViolationPolicy to set
849         */
850        public void setStreamabilityViolationPolicy(
851                XmlViolationPolicy streamabilityViolationPolicy) {
852            this.streamabilityViolationPolicy = streamabilityViolationPolicy;
853            driver = null;
854        }
855    
856        /**
857         * Whether the HTML 4 mode reports boolean attributes in a way that repeats
858         * the name in the value.
859         * @param html4ModeCompatibleWithXhtml1Schemata
860         */
861        public void setHtml4ModeCompatibleWithXhtml1Schemata(
862                boolean html4ModeCompatibleWithXhtml1Schemata) {
863            this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
864            if (driver != null) {
865                driver.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
866            }
867        }
868    
869        /**
870         * Returns the <code>Locator</code> during parse.
871         * @return the <code>Locator</code>
872         */
873        public Locator getDocumentLocator() {
874            return driver.getDocumentLocator();
875        }
876    
877        /**
878         * Whether the HTML 4 mode reports boolean attributes in a way that repeats
879         * the name in the value.
880         * 
881         * @return the html4ModeCompatibleWithXhtml1Schemata
882         */
883        public boolean isHtml4ModeCompatibleWithXhtml1Schemata() {
884            return html4ModeCompatibleWithXhtml1Schemata;
885        }
886    
887        /**
888         * Whether <code>lang</code> is mapped to <code>xml:lang</code>.
889         * @param mappingLangToXmlLang
890         * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
891         */
892        public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
893            this.mappingLangToXmlLang = mappingLangToXmlLang;
894            if (driver != null) {
895                driver.setMappingLangToXmlLang(mappingLangToXmlLang);
896            }
897        }
898    
899        /**
900         * Whether <code>lang</code> is mapped to <code>xml:lang</code>.
901         * 
902         * @return the mappingLangToXmlLang
903         */
904        public boolean isMappingLangToXmlLang() {
905            return mappingLangToXmlLang;
906        }
907    
908        /**
909         * Whether the <code>xmlns</code> attribute on the root element is 
910         * passed to through. (FATAL not allowed.)
911         * @param xmlnsPolicy
912         * @see nu.validator.htmlparser.impl.Tokenizer#setXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
913         */
914        public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
915            if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
916                throw new IllegalArgumentException("Can't use FATAL here.");
917            }
918            this.xmlnsPolicy = xmlnsPolicy;
919            if (driver != null) {
920                driver.setXmlnsPolicy(xmlnsPolicy);
921            }
922        }
923    
924        /**
925         * Returns the xmlnsPolicy.
926         * 
927         * @return the xmlnsPolicy
928         */
929        public XmlViolationPolicy getXmlnsPolicy() {
930            return xmlnsPolicy;
931        }
932    
933        /**
934         * Returns the lexicalHandler.
935         * 
936         * @return the lexicalHandler
937         */
938        public LexicalHandler getLexicalHandler() {
939            return lexicalHandler;
940        }
941    
942        /**
943         * Returns the commentPolicy.
944         * 
945         * @return the commentPolicy
946         */
947        public XmlViolationPolicy getCommentPolicy() {
948            return commentPolicy;
949        }
950    
951        /**
952         * Returns the contentNonXmlCharPolicy.
953         * 
954         * @return the contentNonXmlCharPolicy
955         */
956        public XmlViolationPolicy getContentNonXmlCharPolicy() {
957            return contentNonXmlCharPolicy;
958        }
959    
960        /**
961         * Returns the contentSpacePolicy.
962         * 
963         * @return the contentSpacePolicy
964         */
965        public XmlViolationPolicy getContentSpacePolicy() {
966            return contentSpacePolicy;
967        }
968    
969        /**
970         * @param reportingDoctype
971         * @see nu.validator.htmlparser.impl.TreeBuilder#setReportingDoctype(boolean)
972         */
973        public void setReportingDoctype(boolean reportingDoctype) {
974            this.reportingDoctype = reportingDoctype;
975            if (treeBuilder != null) {
976                treeBuilder.setReportingDoctype(reportingDoctype);
977            }
978        }
979    
980        /**
981         * Returns the reportingDoctype.
982         * 
983         * @return the reportingDoctype
984         */
985        public boolean isReportingDoctype() {
986            return reportingDoctype;
987        }
988    
    /**
     * Stores the error profile map on this parser.
     * 
     * @param errorProfileMap the error profile map to keep
     * @see nu.validator.htmlparser.impl.errorReportingTokenizer#setErrorProfile(set)
     */
    public void setErrorProfile(HashMap<String, String> errorProfileMap) {
        // Only the field is updated here; nothing is forwarded to the driver
        // or the tree builder. NOTE(review): confirm where the map is
        // consumed, and whether the @see target above (lowercase class name)
        // resolves.
        this.errorProfileMap = errorProfileMap;
    }
996    
997        /**
998         * The policy for non-NCName element and attribute names.
999         * @param namePolicy
1000         * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
1001         */
1002        public void setNamePolicy(XmlViolationPolicy namePolicy) {
1003            this.namePolicy = namePolicy;
1004            if (driver != null) {
1005                driver.setNamePolicy(namePolicy);
1006                treeBuilder.setNamePolicy(namePolicy);
1007            }
1008        }
1009        
1010        /**
1011         * Sets the encoding sniffing heuristics.
1012         * 
1013         * @param heuristics the heuristics to set
1014         * @see nu.validator.htmlparser.impl.Tokenizer#setHeuristics(nu.validator.htmlparser.common.Heuristics)
1015         */
1016        public void setHeuristics(Heuristics heuristics) {
1017            this.heuristics = heuristics;
1018            if (driver != null) {
1019                driver.setHeuristics(heuristics);
1020            }
1021        }
1022        
1023        public Heuristics getHeuristics() {
1024            return this.heuristics;
1025        }
1026    
1027        /**
1028         * This is a catch-all convenience method for setting name, xmlns, content space, 
1029         * content non-XML char and comment policies in one go. This does not affect the 
1030         * streamability policy or doctype reporting.
1031         * 
1032         * @param xmlPolicy
1033         */
1034        public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
1035            setNamePolicy(xmlPolicy);
1036            setXmlnsPolicy(xmlPolicy == XmlViolationPolicy.FATAL ? XmlViolationPolicy.ALTER_INFOSET : xmlPolicy);
1037            setContentSpacePolicy(xmlPolicy);
1038            setContentNonXmlCharPolicy(xmlPolicy);
1039            setCommentPolicy(xmlPolicy);
1040        }
1041    
1042        /**
1043         * The policy for non-NCName element and attribute names.
1044         * 
1045         * @return the namePolicy
1046         */
1047        public XmlViolationPolicy getNamePolicy() {
1048            return namePolicy;
1049        }
1050    
1051        /**
1052         * Does nothing.
1053         * @deprecated
1054         */
1055        public void setBogusXmlnsPolicy(
1056                XmlViolationPolicy bogusXmlnsPolicy) {
1057        }
1058    
1059        /**
1060         * Returns <code>XmlViolationPolicy.ALTER_INFOSET</code>.
1061         * @deprecated
1062         * @return <code>XmlViolationPolicy.ALTER_INFOSET</code>
1063         */
1064        public XmlViolationPolicy getBogusXmlnsPolicy() {
1065            return XmlViolationPolicy.ALTER_INFOSET;
1066        }
1067        
1068        public void addCharacterHandler(CharacterHandler characterHandler) {
1069            this.characterHandlers.add(characterHandler);
1070            if (driver != null) {
1071                driver.addCharacterHandler(characterHandler);
1072            }
1073        }
1074    }