001    /*
002     * Copyright (c) 2007 Henri Sivonen
003     * Copyright (c) 2007 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.htmlparser.sax;
025    
026    import java.io.IOException;
027    import java.net.MalformedURLException;
028    import java.net.URL;
029    import java.util.LinkedList;
030    import java.util.List;
031    
032    import nu.validator.htmlparser.common.DoctypeExpectation;
033    import nu.validator.htmlparser.common.DocumentModeHandler;
034    import nu.validator.htmlparser.common.XmlViolationPolicy;
035    import nu.validator.htmlparser.impl.CharacterHandler;
036    import nu.validator.htmlparser.impl.Tokenizer;
037    import nu.validator.htmlparser.impl.TreeBuilder;
038    import nu.validator.saxtree.Document;
039    import nu.validator.saxtree.DocumentFragment;
040    import nu.validator.saxtree.TreeParser;
041    
042    import org.xml.sax.ContentHandler;
043    import org.xml.sax.DTDHandler;
044    import org.xml.sax.EntityResolver;
045    import org.xml.sax.ErrorHandler;
046    import org.xml.sax.InputSource;
047    import org.xml.sax.Locator;
048    import org.xml.sax.SAXException;
049    import org.xml.sax.SAXNotRecognizedException;
050    import org.xml.sax.SAXNotSupportedException;
051    import org.xml.sax.XMLReader;
052    import org.xml.sax.ext.LexicalHandler;
053    import org.xml.sax.helpers.DefaultHandler;
054    
055    /**
056     * This class implements an HTML5 parser that exposes data through the SAX2 
057     * interface. 
058     * 
 * <p>By default, when using the constructor without arguments, 
 * this parser treats XML 1.0-incompatible infosets as fatal errors in 
061     * order to adhere to the SAX2 API contract strictly. This corresponds to 
062     * <code>FATAL</code> as the general XML violation policy. To make the parser 
063     * support non-conforming HTML fully per the HTML 5 spec while on the other 
064     * hand potentially violating the SAX2 API contract, set the general XML 
065     * violation policy to <code>ALLOW</code>. Handling all input without fatal 
066     * errors and without violating the SAX2 API contract is possible by setting 
067     * the general XML violation policy to <code>ALTER_INFOSET</code>. <em>This 
068     * makes the parser non-conforming</em> but is probably the most useful 
069     * setting for most applications.
070     * 
071     * <p>By default, this parser doesn't do true streaming but buffers everything 
072     * first. The parser can be made truly streaming by calling 
073     * <code>setStreamabilityViolationPolicy(XmlViolationPolicy.FATAL)</code>. This 
074     * has the consequence that errors that require non-streamable recovery are 
075     * treated as fatal.
076     * 
077     * <p>By default, in order to make the parse events emulate the parse events 
078     * for a DTDless XML document, the parser does not report the doctype through 
079     * <code>LexicalHandler</code>. Doctype reporting through 
080     * <code>LexicalHandler</code> can be turned on by calling 
081     * <code>setReportingDoctype(true)</code>.
082     * 
083     * @version $Id: HtmlParser.java 161 2007-10-02 09:10:00Z hsivonen $
084     * @author hsivonen
085     */
086    public class HtmlParser implements XMLReader {
087    
088        private Tokenizer tokenizer = null;
089    
090        private TreeBuilder<?> treeBuilder = null;
091    
092        private SAXStreamer saxStreamer = null; // work around javac bug
093    
094        private SAXTreeBuilder saxTreeBuilder = null; // work around javac bug
095    
096        private ContentHandler contentHandler = null;
097    
098        private LexicalHandler lexicalHandler = null;
099    
100        private DTDHandler dtdHandler = null;
101    
102        private EntityResolver entityResolver = null;
103    
104        private ErrorHandler errorHandler = null;
105    
106        private DocumentModeHandler documentModeHandler = null;
107    
108        private DoctypeExpectation doctypeExpectation = DoctypeExpectation.HTML;
109    
110        private boolean checkingNormalization = false;
111    
112        private boolean scriptingEnabled = false;
113    
114        private final List<CharacterHandler> characterHandlers = new LinkedList<CharacterHandler>();
115        
116        private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.FATAL;
117    
118        private XmlViolationPolicy contentNonXmlCharPolicy = XmlViolationPolicy.FATAL;
119    
120        private XmlViolationPolicy commentPolicy = XmlViolationPolicy.FATAL;
121    
122        private XmlViolationPolicy namePolicy = XmlViolationPolicy.FATAL;
123    
124        private XmlViolationPolicy streamabilityViolationPolicy = XmlViolationPolicy.ALLOW;
125        
126        private boolean html4ModeCompatibleWithXhtml1Schemata;
127    
128        private boolean mappingLangToXmlLang;
129    
130        private XmlViolationPolicy xmlnsPolicy;
131    
132        private XmlViolationPolicy bogusXmlnsPolicy;
133        
134        private boolean reportingDoctype = true;
135    
136        private ErrorHandler treeBuilderErrorHandler;
137    
138        /**
139         * Instantiates the parser with a fatal XML violation policy.
140         *
141         */
142        public HtmlParser() {
143            this(XmlViolationPolicy.FATAL);
144        }
145        
146        /**
147         * Instantiates the parser with a specific XML violation policy.
148         * @param xmlPolicy the policy
149         */
150        public HtmlParser(XmlViolationPolicy xmlPolicy) {
151            setXmlPolicy(xmlPolicy);
152        }    
153    
154        /**
155         * This class wraps differnt tree builders depending on configuration. This 
156         * method does the work of hiding this from the user of the class.
157         */
158        private void lazyInit() {
159            if (tokenizer == null) {
160                if (streamabilityViolationPolicy == XmlViolationPolicy.ALLOW) {
161                    this.saxTreeBuilder = new SAXTreeBuilder();
162                    this.treeBuilder = this.saxTreeBuilder;
163                    this.saxStreamer = null;
164                } else {
165                    this.saxStreamer = new SAXStreamer();
166                    this.treeBuilder = this.saxStreamer;
167                    this.saxTreeBuilder = null;
168                }
169                this.tokenizer = new Tokenizer(treeBuilder);
170                this.tokenizer.setErrorHandler(errorHandler);
171                this.treeBuilder.setErrorHandler(treeBuilderErrorHandler);
172                this.tokenizer.setCheckingNormalization(checkingNormalization);
173                this.tokenizer.setCommentPolicy(commentPolicy);
174                this.tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
175                this.tokenizer.setContentSpacePolicy(contentSpacePolicy);
176                this.tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
177                this.tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
178                this.tokenizer.setXmlnsPolicy(xmlnsPolicy);
179                for (CharacterHandler characterHandler : characterHandlers) {
180                    this.tokenizer.addCharacterHandler(characterHandler);
181                }
182                this.treeBuilder.setDoctypeExpectation(doctypeExpectation);
183                this.treeBuilder.setDocumentModeHandler(documentModeHandler);
184                this.treeBuilder.setIgnoringComments(lexicalHandler == null);
185                this.treeBuilder.setScriptingEnabled(scriptingEnabled);
186                this.treeBuilder.setReportingDoctype(reportingDoctype);
187                if (saxStreamer != null) {
188                    saxStreamer.setContentHandler(contentHandler == null ? new DefaultHandler()
189                            : contentHandler);
190                    saxStreamer.setLexicalHandler(lexicalHandler);
191                }
192            }
193        }
194    
195        /**
196         * @see org.xml.sax.XMLReader#getContentHandler()
197         */
198        public ContentHandler getContentHandler() {
199            return contentHandler;
200        }
201    
202        /**
203         * @see org.xml.sax.XMLReader#getDTDHandler()
204         */
205        public DTDHandler getDTDHandler() {
206            return dtdHandler;
207        }
208    
209        /**
210         * @see org.xml.sax.XMLReader#getEntityResolver()
211         */
212        public EntityResolver getEntityResolver() {
213            return entityResolver;
214        }
215    
216        /**
217         * @see org.xml.sax.XMLReader#getErrorHandler()
218         */
219        public ErrorHandler getErrorHandler() {
220            return errorHandler;
221        }
222    
223        /**
224         * Exposes the configuration of the emulated XML parser as well as
225         * boolean-valued configuration without using non-<code>XMLReader</code>
226         * getters directly.
227         * 
228         * <dl>
229         * <dt><code>http://xml.org/sax/features/external-general-entities</code></dt>
230         * <dd><code>false</code></dd>
231         * <dt><code>http://xml.org/sax/features/external-parameter-entities</code></dt>
232         * <dd><code>false</code></dd>
233         * <dt><code>http://xml.org/sax/features/is-standalone</code></dt>
234         * <dd><code>true</code></dd>
235         * <dt><code>http://xml.org/sax/features/lexical-handler/parameter-entities</code></dt>
236         * <dd><code>false</code></dd>
237         * <dt><code>http://xml.org/sax/features/namespaces</code></dt>
238         * <dd><code>true</code></dd>
239         * <dt><code>http://xml.org/sax/features/namespace-prefixes</code></dt>
240         * <dd><code>false</code></dd>
241         * <dt><code>http://xml.org/sax/features/resolve-dtd-uris</code></dt>
242         * <dd><code>true</code></dd>
243         * <dt><code>http://xml.org/sax/features/string-interning</code></dt>
244         * <dd><code>false</code></dd>
245         * <dt><code>http://xml.org/sax/features/unicode-normalization-checking</code></dt>
246         * <dd><code>isCheckingNormalization</code></dd>
247         * <dt><code>http://xml.org/sax/features/use-attributes2</code></dt>
248         * <dd><code>false</code></dd>
249         * <dt><code>http://xml.org/sax/features/use-locator2</code></dt>
250         * <dd><code>false</code></dd>
251         * <dt><code>http://xml.org/sax/features/use-entity-resolver2</code></dt>
252         * <dd><code>false</code></dd>
253         * <dt><code>http://xml.org/sax/features/validation</code></dt>
254         * <dd><code>false</code></dd>
255         * <dt><code>http://xml.org/sax/features/xmlns-uris</code></dt>
256         * <dd><code>false</code></dd>
257         * <dt><code>http://xml.org/sax/features/xml-1.1</code></dt>
258         * <dd><code>false</code></dd>
259         * <dt><code>http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata</code></dt>
260         * <dd><code>isHtml4ModeCompatibleWithXhtml1Schemata</code></dd>
261         * <dt><code>http://validator.nu/features/mapping-lang-to-xml-lang</code></dt>
262         * <dd><code>isMappingLangToXmlLang</code></dd>
263         * <dt><code>http://validator.nu/features/scripting-enabled</code></dt>
264         * <dd><code>isScriptingEnabled</code></dd>
265         * </dl>
266         * 
267         * @param name
268         *            feature URI string
269         * @return a value per the list above
270         * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
271         */
272        public boolean getFeature(String name) throws SAXNotRecognizedException,
273                SAXNotSupportedException {
274            if ("http://xml.org/sax/features/external-general-entities".equals(name)) {
275                return false;
276            } else if ("http://xml.org/sax/features/external-parameter-entities".equals(name)) {
277                return false;
278            } else if ("http://xml.org/sax/features/is-standalone".equals(name)) {
279                return true;
280            } else if ("http://xml.org/sax/features/lexical-handler/parameter-entities".equals(name)) {
281                return false;
282            } else if ("http://xml.org/sax/features/namespaces".equals(name)) {
283                return true;
284            } else if ("http://xml.org/sax/features/namespace-prefixes".equals(name)) {
285                return false;
286            } else if ("http://xml.org/sax/features/resolve-dtd-uris".equals(name)) {
287                return true; // default value--applicable scenario never happens
288            } else if ("http://xml.org/sax/features/string-interning".equals(name)) {
289                return false; // XXX revisit
290            } else if ("http://xml.org/sax/features/unicode-normalization-checking".equals(name)) {
291                return isCheckingNormalization(); // the checks aren't really per
292                // XML 1.1
293            } else if ("http://xml.org/sax/features/use-attributes2".equals(name)) {
294                return false;
295            } else if ("http://xml.org/sax/features/use-locator2".equals(name)) {
296                return false;
297            } else if ("http://xml.org/sax/features/use-entity-resolver2".equals(name)) {
298                return false;
299            } else if ("http://xml.org/sax/features/validation".equals(name)) {
300                return false;
301            } else if ("http://xml.org/sax/features/xmlns-uris".equals(name)) {
302                return false;
303            } else if ("http://xml.org/sax/features/xml-1.1".equals(name)) {
304                return false;
305            } else if ("http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata".equals(name)) {
306                return isHtml4ModeCompatibleWithXhtml1Schemata();
307            } else if ("http://validator.nu/features/mapping-lang-to-xml-lang".equals(name)) {
308                return isMappingLangToXmlLang();
309            } else if ("http://validator.nu/features/scripting-enabled".equals(name)) {
310                return isScriptingEnabled();
311            } else {
312                throw new SAXNotRecognizedException();
313            }
314        }
315    
316        /**
317         * Allows <code>XMLReader</code>-level access to non-boolean valued
318         * getters.
319         * 
320         * <p>
321         * The properties are mapped as follows:
322         * 
323         * <dl>
324         * <dt><code>http://xml.org/sax/properties/document-xml-version</code></dt>
325         * <dd><code>"1.0"</code></dd>
326         * <dt><code>http://xml.org/sax/properties/lexical-handler</code></dt>
327         * <dd><code>getLexicalHandler</code></dd>
328         * <dt><code>http://validator.nu/properties/content-space-policy</code></dt>
329         * <dd><code>getContentSpacePolicy</code></dd>
330         * <dt><code>http://validator.nu/properties/content-non-xml-char-policy</code></dt>
331         * <dd><code>getContentNonXmlCharPolicy</code></dd>
332         * <dt><code>http://validator.nu/properties/comment-policy</code></dt>
333         * <dd><code>getCommentPolicy</code></dd>
334         * <dt><code>http://validator.nu/properties/xmlns-policy</code></dt>
335         * <dd><code>getXmlnsPolicy</code></dd>
336         * <dt><code>http://validator.nu/properties/name-policy</code></dt>
337         * <dd><code>getNamePolicy</code></dd>
338         * <dt><code>http://validator.nu/properties/streamability-violation-policy</code></dt>
339         * <dd><code>getStreamabilityViolationPolicy</code></dd>
340         * <dt><code>http://validator.nu/properties/document-mode-handler</code></dt>
341         * <dd><code>getDocumentModeHandler</code></dd>
342         * <dt><code>http://validator.nu/properties/doctype-expectation</code></dt>
343         * <dd><code>getDoctypeExpectation</code></dd>
344         * <dt><code>http://xml.org/sax/features/unicode-normalization-checking</code></dt>
345         * </dl>
346         * 
347         * @param name
348         *            property URI string
349         * @return a value per the list above
350         * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
351         */
352        public Object getProperty(String name) throws SAXNotRecognizedException,
353                SAXNotSupportedException {
354            if ("http://xml.org/sax/properties/declaration-handler".equals(name)) {
355                throw new SAXNotSupportedException(
356                        "This parser does not suppert DeclHandler.");
357            } else if ("http://xml.org/sax/properties/document-xml-version".equals(name)) {
358                return "1.0"; // Emulating an XML 1.1 parser is not supported.
359            } else if ("http://xml.org/sax/properties/dom-node".equals(name)) {
360                throw new SAXNotSupportedException(
361                        "This parser does not walk the DOM.");
362            } else if ("http://xml.org/sax/properties/lexical-handler".equals(name)) {
363                return getLexicalHandler();
364            } else if ("http://xml.org/sax/properties/xml-string".equals(name)) {
365                throw new SAXNotSupportedException(
366                        "This parser does not expose the source as a string.");
367            } else if ("http://validator.nu/properties/content-space-policy".equals(name)) {
368                return getContentSpacePolicy();
369            } else if ("http://validator.nu/properties/content-non-xml-char-policy".equals(name)) {
370                return getContentNonXmlCharPolicy();
371            } else if ("http://validator.nu/properties/comment-policy".equals(name)) {
372                return getCommentPolicy();
373            } else if ("http://validator.nu/properties/xmlns-policy".equals(name)) {
374                return getXmlnsPolicy();
375            } else if ("http://validator.nu/properties/name-policy".equals(name)) {
376                return getNamePolicy();
377            } else if ("http://validator.nu/properties/streamability-violation-policy".equals(name)) {
378                return getStreamabilityViolationPolicy();
379            } else if ("http://validator.nu/properties/document-mode-handler".equals(name)) {
380                return getDocumentModeHandler();
381            } else if ("http://validator.nu/properties/doctype-expectation".equals(name)) {
382                return getDoctypeExpectation();
383            } else if ("http://validator.nu/properties/xml-policy".equals(name)) {
384                throw new SAXNotSupportedException(
385                        "Cannot get a convenience setter.");
386            } else {
387                throw new SAXNotRecognizedException();
388            }
389        }
390    
391        /**
392         * @see org.xml.sax.XMLReader#parse(org.xml.sax.InputSource)
393         */
394        public void parse(InputSource input) throws IOException, SAXException {
395            lazyInit();
396            try {
397                treeBuilder.setFragmentContext(null);
398                tokenize(input);
399            } finally {
400                if (saxTreeBuilder != null) {
401                    Document document = saxTreeBuilder.getDocument();
402                    if (document != null) {
403                        new TreeParser(contentHandler, lexicalHandler).parse(document);
404                    }
405                }
406            }
407        }
408    
409        /**
410         * Parser a fragment.
411         * 
412         * @param input the input to parse
413         * @param context the name of the context element
414         * @throws IOException
415         * @throws SAXException
416         */
417        public void parseFragment(InputSource input, String context)
418                throws IOException, SAXException {
419            lazyInit();
420            try {
421                treeBuilder.setFragmentContext(context);
422                tokenize(input);
423            } finally {
424                if (saxTreeBuilder != null) {
425                    DocumentFragment fragment = saxTreeBuilder.getDocumentFragment();
426                    new TreeParser(contentHandler, lexicalHandler).parse(fragment);
427                }
428            }
429        }
430        
431        /**
432         * @param is
433         * @throws SAXException
434         * @throws IOException
435         * @throws MalformedURLException
436         */
437        private void tokenize(InputSource is) throws SAXException, IOException, MalformedURLException {
438            if (is == null) {
439                throw new IllegalArgumentException("Null input.");            
440            }
441            if (is.getByteStream() == null && is.getCharacterStream() == null) {
442                String systemId = is.getSystemId();
443                if (systemId == null) {
444                    throw new IllegalArgumentException("No byte stream, no character stream nor URI.");
445                }
446                if (entityResolver != null) {
447                    is = entityResolver.resolveEntity(is.getPublicId(), systemId);
448                }
449                if (is.getByteStream() == null || is.getCharacterStream() == null) {
450                    is = new InputSource();
451                    is.setSystemId(systemId);
452                    is.setByteStream(new URL(systemId).openStream());
453                }
454            }
455            tokenizer.tokenize(is);
456        }
457    
458        /**
459         * @see org.xml.sax.XMLReader#parse(java.lang.String)
460         */
461        public void parse(String systemId) throws IOException, SAXException {
462            parse(new InputSource(systemId));
463        }
464    
465        /**
466         * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler)
467         */
468        public void setContentHandler(ContentHandler handler) {
469            contentHandler = handler;
470            if (saxStreamer != null) {
471                saxStreamer.setContentHandler(contentHandler == null ? new DefaultHandler()
472                        : contentHandler);
473            }
474        }
475    
476        /**
477         * Sets the lexical handler.
478         * @param handler the hander.
479         */
480        public void setLexicalHandler(LexicalHandler handler) {
481            lexicalHandler = handler;
482            if (treeBuilder != null) {
483                treeBuilder.setIgnoringComments(handler == null);
484                if (saxStreamer != null) {
485                    saxStreamer.setLexicalHandler(handler);
486                }
487            }
488        }
489    
490        /**
491         * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
492         */
493        public void setDTDHandler(DTDHandler handler) {
494            dtdHandler = handler;
495        }
496    
497        /**
498         * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
499         */
500        public void setEntityResolver(EntityResolver resolver) {
501            entityResolver = resolver;
502        }
503    
504        /**
505         * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
506         */
507        public void setErrorHandler(ErrorHandler handler) {
508            errorHandler = handler;
509            treeBuilderErrorHandler = handler;
510            if (tokenizer != null) {
511                tokenizer.setErrorHandler(handler);
512                treeBuilder.setErrorHandler(handler);
513            }
514        }
515    
516        /**
517         * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
518         * @deprecated For Validator.nu internal use
519         */
520        public void setTreeBuilderErrorHandlerOverride(ErrorHandler handler) {
521            treeBuilderErrorHandler = handler;
522            if (tokenizer != null) {
523                treeBuilder.setErrorHandler(handler);
524            }
525        }
526        
527        /**
528         * Sets a boolean feature without having to use non-<code>XMLReader</code>
529         * setters directly.
530         * 
531         * <p>
532         * The supported features are:
533         * 
534         * <dl>
535         * <dt><code>http://xml.org/sax/features/unicode-normalization-checking</code></dt>
536         * <dd><code>setCheckingNormalization</code></dd>
537         * <dt><code>http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata</code></dt>
538         * <dd><code>setHtml4ModeCompatibleWithXhtml1Schemata</code></dd>
539         * <dt><code>http://validator.nu/features/mapping-lang-to-xml-lang</code></dt>
540         * <dd><code>setMappingLangToXmlLang</code></dd>
541         * <dt><code>http://validator.nu/features/scripting-enabled</code></dt>
542         * <dd><code>setScriptingEnabled</code></dd>
543         * </dl>
544         * 
545         * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
546         */
547        public void setFeature(String name, boolean value)
548                throws SAXNotRecognizedException, SAXNotSupportedException {
549            if ("http://xml.org/sax/features/external-general-entities".equals(name)) {
550                throw new SAXNotSupportedException("Cannot set " + name + ".");
551            } else if ("http://xml.org/sax/features/external-parameter-entities".equals(name)) {
552                throw new SAXNotSupportedException("Cannot set " + name + ".");
553            } else if ("http://xml.org/sax/features/is-standalone".equals(name)) {
554                throw new SAXNotSupportedException("Cannot set " + name + ".");
555            } else if ("http://xml.org/sax/features/lexical-handler/parameter-entities".equals(name)) {
556                throw new SAXNotSupportedException("Cannot set " + name + ".");
557            } else if ("http://xml.org/sax/features/namespaces".equals(name)) {
558                throw new SAXNotSupportedException("Cannot set " + name + ".");
559            } else if ("http://xml.org/sax/features/namespace-prefixes".equals(name)) {
560                throw new SAXNotSupportedException("Cannot set " + name + ".");
561            } else if ("http://xml.org/sax/features/resolve-dtd-uris".equals(name)) {
562                throw new SAXNotSupportedException("Cannot set " + name + ".");
563            } else if ("http://xml.org/sax/features/string-interning".equals(name)) {
564                throw new SAXNotSupportedException("Cannot set " + name + ".");
565            } else if ("http://xml.org/sax/features/unicode-normalization-checking".equals(name)) {
566                setCheckingNormalization(value);
567            } else if ("http://xml.org/sax/features/use-attributes2".equals(name)) {
568                throw new SAXNotSupportedException("Cannot set " + name + ".");
569            } else if ("http://xml.org/sax/features/use-locator2".equals(name)) {
570                throw new SAXNotSupportedException("Cannot set " + name + ".");
571            } else if ("http://xml.org/sax/features/use-entity-resolver2".equals(name)) {
572                throw new SAXNotSupportedException("Cannot set " + name + ".");
573            } else if ("http://xml.org/sax/features/validation".equals(name)) {
574                throw new SAXNotSupportedException("Cannot set " + name + ".");
575            } else if ("http://xml.org/sax/features/xmlns-uris".equals(name)) {
576                throw new SAXNotSupportedException("Cannot set " + name + ".");
577            } else if ("http://xml.org/sax/features/xml-1.1".equals(name)) {
578                throw new SAXNotSupportedException("Cannot set " + name + ".");
579            } else if ("http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata".equals(name)) {
580                setHtml4ModeCompatibleWithXhtml1Schemata(value);
581            } else if ("http://validator.nu/features/mapping-lang-to-xml-lang".equals(name)) {
582                setMappingLangToXmlLang(value);
583            } else if ("http://validator.nu/features/scripting-enabled".equals(name)) {
584                setScriptingEnabled(value);
585            } else {
586                throw new SAXNotRecognizedException();
587            }
588        }
589    
590        /**
591         * Sets a non-boolean property without having to use non-<code>XMLReader</code>
592         * setters directly.
593         * 
594         * <dl>
595         * <dt><code>http://xml.org/sax/properties/lexical-handler</code></dt>
596         * <dd><code>setLexicalHandler</code></dd>
597         * <dt><code>http://validator.nu/properties/content-space-policy</code></dt>
598         * <dd><code>setContentSpacePolicy</code></dd>
599         * <dt><code>http://validator.nu/properties/content-non-xml-char-policy</code></dt>
600         * <dd><code>setContentNonXmlCharPolicy</code></dd>
601         * <dt><code>http://validator.nu/properties/comment-policy</code></dt>
602         * <dd><code>setCommentPolicy</code></dd>
603         * <dt><code>http://validator.nu/properties/xmlns-policy</code></dt>
604         * <dd><code>setXmlnsPolicy</code></dd>
605         * <dt><code>http://validator.nu/properties/name-policy</code></dt>
606         * <dd><code>setNamePolicy</code></dd>
607         * <dt><code>http://validator.nu/properties/streamability-violation-policy</code></dt>
608         * <dd><code>setStreamabilityViolationPolicy</code></dd>
609         * <dt><code>http://validator.nu/properties/document-mode-handler</code></dt>
610         * <dd><code>setDocumentModeHandler</code></dd>
611         * <dt><code>http://validator.nu/properties/doctype-expectation</code></dt>
612         * <dd><code>setDoctypeExpectation</code></dd>
613         * <dt><code>http://validator.nu/properties/xml-policy</code></dt>
614         * <dd><code>setXmlPolicy</code></dd>
615         * </dl>
616         * 
617         * @see org.xml.sax.XMLReader#setProperty(java.lang.String,
618         *      java.lang.Object)
619         */
620        public void setProperty(String name, Object value)
621                throws SAXNotRecognizedException, SAXNotSupportedException {
622            if ("http://xml.org/sax/properties/declaration-handler".equals(name)) {
623                throw new SAXNotSupportedException(
624                        "This parser does not suppert DeclHandler.");
625            } else if ("http://xml.org/sax/properties/document-xml-version".equals(name)) {
626                throw new SAXNotSupportedException(
627                        "Can't set document-xml-version.");
628            } else if ("http://xml.org/sax/properties/dom-node".equals(name)) {
629                throw new SAXNotSupportedException("Can't set dom-node.");
630            } else if ("http://xml.org/sax/properties/lexical-handler".equals(name)) {
631                setLexicalHandler((LexicalHandler) value);
632            } else if ("http://xml.org/sax/properties/xml-string".equals(name)) {
633                throw new SAXNotSupportedException("Can't set xml-string.");
634            } else if ("http://validator.nu/properties/content-space-policy".equals(name)) {
635                setContentSpacePolicy((XmlViolationPolicy) value);
636            } else if ("http://validator.nu/properties/content-non-xml-char-policy".equals(name)) {
637                setContentNonXmlCharPolicy((XmlViolationPolicy) value);
638            } else if ("http://validator.nu/properties/comment-policy".equals(name)) {
639                setCommentPolicy((XmlViolationPolicy) value);
640            } else if ("http://validator.nu/properties/xmlns-policy".equals(name)) {
641                setXmlnsPolicy((XmlViolationPolicy) value);
642            } else if ("http://validator.nu/properties/name-policy".equals(name)) {
643                setNamePolicy((XmlViolationPolicy) value);
644            } else if ("http://validator.nu/properties/streamability-violation-policy".equals(name)) {
645                setStreamabilityViolationPolicy((XmlViolationPolicy) value);
646            } else if ("http://validator.nu/properties/document-mode-handler".equals(name)) {
647                setDocumentModeHandler((DocumentModeHandler) value);
648            } else if ("http://validator.nu/properties/doctype-expectation".equals(name)) {
649                setDoctypeExpectation((DoctypeExpectation) value);
650            } else if ("http://validator.nu/properties/xml-policy".equals(name)) {
651                setXmlPolicy((XmlViolationPolicy) value);
652            } else {
653                throw new SAXNotRecognizedException();
654            }
655        }
656    
657        /**
658         * Indicates whether NFC normalization of source is being checked.
659         * @return <code>true</code> if NFC normalization of source is being checked.
660         * @see nu.validator.htmlparser.impl.Tokenizer#isCheckingNormalization()
661         */
662        public boolean isCheckingNormalization() {
663            return checkingNormalization;
664        }
665    
666        /**
667         * Toggles the checking of the NFC normalization of source.
668         * @param enable <code>true</code> to check normalization
669         * @see nu.validator.htmlparser.impl.Tokenizer#setCheckingNormalization(boolean)
670         */
671        public void setCheckingNormalization(boolean enable) {
672            this.checkingNormalization = enable;
673            if (tokenizer != null) {
674                tokenizer.setCheckingNormalization(checkingNormalization);
675            }
676        }
677    
678        /**
679         * Sets the policy for consecutive hyphens in comments.
680         * @param commentPolicy the policy
681         * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
682         */
683        public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
684            this.commentPolicy = commentPolicy;
685            if (tokenizer != null) {
686                tokenizer.setCommentPolicy(commentPolicy);
687            }
688        }
689    
690        /**
691         * Sets the policy for non-XML characters except white space.
692         * @param contentNonXmlCharPolicy the policy
693         * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
694         */
695        public void setContentNonXmlCharPolicy(
696                XmlViolationPolicy contentNonXmlCharPolicy) {
697            this.contentNonXmlCharPolicy = contentNonXmlCharPolicy;
698            if (tokenizer != null) {
699                tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
700            }
701        }
702    
703        /**
704         * Sets the policy for non-XML white space.
705         * @param contentSpacePolicy the policy
706         * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
707         */
708        public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
709            this.contentSpacePolicy = contentSpacePolicy;
710            if (tokenizer != null) {
711                tokenizer.setContentSpacePolicy(contentSpacePolicy);
712            }
713        }
714    
715        /**
716         * Whether the parser considers scripting to be enabled for noscript treatment.
717         * 
718         * @return <code>true</code> if enabled
719         * @see nu.validator.htmlparser.impl.TreeBuilder#isScriptingEnabled()
720         */
721        public boolean isScriptingEnabled() {
722            return scriptingEnabled;
723        }
724    
725        /**
726         * Sets whether the parser considers scripting to be enabled for noscript treatment.
727         * @param scriptingEnabled <code>true</code> to enable
728         * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
729         */
730        public void setScriptingEnabled(boolean scriptingEnabled) {
731            this.scriptingEnabled = scriptingEnabled;
732            if (treeBuilder != null) {
733                treeBuilder.setScriptingEnabled(scriptingEnabled);
734            }
735        }
736    
737        /**
738         * Returns the doctype expectation.
739         * 
740         * @return the doctypeExpectation
741         */
742        public DoctypeExpectation getDoctypeExpectation() {
743            return doctypeExpectation;
744        }
745    
746        /**
747         * Sets the doctype expectation.
748         * 
749         * @param doctypeExpectation
750         *            the doctypeExpectation to set
751         * @see nu.validator.htmlparser.impl.TreeBuilder#setDoctypeExpectation(nu.validator.htmlparser.common.DoctypeExpectation)
752         */
753        public void setDoctypeExpectation(DoctypeExpectation doctypeExpectation) {
754            this.doctypeExpectation = doctypeExpectation;
755            if (treeBuilder != null) {
756                treeBuilder.setDoctypeExpectation(doctypeExpectation);
757            }
758        }
759    
760        /**
761         * Returns the document mode handler.
762         * 
763         * @return the documentModeHandler
764         */
765        public DocumentModeHandler getDocumentModeHandler() {
766            return documentModeHandler;
767        }
768    
769        /**
770         * Sets the document mode handler.
771         * 
772         * @param documentModeHandler
773         *            the documentModeHandler to set
774         * @see nu.validator.htmlparser.impl.TreeBuilder#setDocumentModeHandler(nu.validator.htmlparser.common.DocumentModeHandler)
775         */
776        public void setDocumentModeHandler(DocumentModeHandler documentModeHandler) {
777            this.documentModeHandler = documentModeHandler;
778        }
779    
780        /**
781         * Returns the streamabilityViolationPolicy.
782         * 
783         * @return the streamabilityViolationPolicy
784         */
785        public XmlViolationPolicy getStreamabilityViolationPolicy() {
786            return streamabilityViolationPolicy;
787        }
788    
789        /**
790         * Sets the streamabilityViolationPolicy.
791         * 
792         * @param streamabilityViolationPolicy
793         *            the streamabilityViolationPolicy to set
794         */
795        public void setStreamabilityViolationPolicy(
796                XmlViolationPolicy streamabilityViolationPolicy) {
797            this.streamabilityViolationPolicy = streamabilityViolationPolicy;
798        }
799    
800        /**
801         * Whether the HTML 4 mode reports boolean attributes in a way that repeats
802         * the name in the value.
803         * @param html4ModeCompatibleWithXhtml1Schemata
804         */
805        public void setHtml4ModeCompatibleWithXhtml1Schemata(
806                boolean html4ModeCompatibleWithXhtml1Schemata) {
807            this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
808            if (tokenizer != null) {
809                tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
810            }
811        }
812    
813        /**
814         * Returns the <code>Locator</code> during parse.
815         * @return the <code>Locator</code>
816         */
817        public Locator getDocumentLocator() {
818            return tokenizer;
819        }
820    
821        /**
822         * Whether the HTML 4 mode reports boolean attributes in a way that repeats
823         * the name in the value.
824         * 
825         * @return the html4ModeCompatibleWithXhtml1Schemata
826         */
827        public boolean isHtml4ModeCompatibleWithXhtml1Schemata() {
828            return html4ModeCompatibleWithXhtml1Schemata;
829        }
830    
831        /**
832         * Whether <code>lang</code> is mapped to <code>xml:lang</code>.
833         * @param mappingLangToXmlLang
834         * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
835         */
836        public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
837            this.mappingLangToXmlLang = mappingLangToXmlLang;
838            if (tokenizer != null) {
839                tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
840            }
841        }
842    
843        /**
844         * Whether <code>lang</code> is mapped to <code>xml:lang</code>.
845         * 
846         * @return the mappingLangToXmlLang
847         */
848        public boolean isMappingLangToXmlLang() {
849            return mappingLangToXmlLang;
850        }
851    
852        /**
853         * Whether the <code>xmlns</code> attribute on the root element is 
854         * passed to through. (FATAL not allowed.)
855         * @param xmlnsPolicy
856         * @see nu.validator.htmlparser.impl.Tokenizer#setXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
857         */
858        public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
859            if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
860                throw new IllegalArgumentException("Can't use FATAL here.");
861            }
862            this.xmlnsPolicy = xmlnsPolicy;
863            if (tokenizer != null) {
864                tokenizer.setXmlnsPolicy(xmlnsPolicy);
865            }
866        }
867    
868        /**
869         * Returns the xmlnsPolicy.
870         * 
871         * @return the xmlnsPolicy
872         */
873        public XmlViolationPolicy getXmlnsPolicy() {
874            return xmlnsPolicy;
875        }
876    
877        /**
878         * Returns the lexicalHandler.
879         * 
880         * @return the lexicalHandler
881         */
882        public LexicalHandler getLexicalHandler() {
883            return lexicalHandler;
884        }
885    
886        /**
887         * Returns the commentPolicy.
888         * 
889         * @return the commentPolicy
890         */
891        public XmlViolationPolicy getCommentPolicy() {
892            return commentPolicy;
893        }
894    
895        /**
896         * Returns the contentNonXmlCharPolicy.
897         * 
898         * @return the contentNonXmlCharPolicy
899         */
900        public XmlViolationPolicy getContentNonXmlCharPolicy() {
901            return contentNonXmlCharPolicy;
902        }
903    
904        /**
905         * Returns the contentSpacePolicy.
906         * 
907         * @return the contentSpacePolicy
908         */
909        public XmlViolationPolicy getContentSpacePolicy() {
910            return contentSpacePolicy;
911        }
912    
913        /**
914         * @param reportingDoctype
915         * @see nu.validator.htmlparser.impl.TreeBuilder#setReportingDoctype(boolean)
916         */
917        public void setReportingDoctype(boolean reportingDoctype) {
918            this.reportingDoctype = reportingDoctype;
919            if (treeBuilder != null) {
920                treeBuilder.setReportingDoctype(reportingDoctype);
921            }
922        }
923    
924        /**
925         * Returns the reportingDoctype.
926         * 
927         * @return the reportingDoctype
928         */
929        public boolean isReportingDoctype() {
930            return reportingDoctype;
931        }
932    
933        /**
934         * The policy for non-NCName element and attribute names.
935         * @param namePolicy
936         * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
937         */
938        public void setNamePolicy(XmlViolationPolicy namePolicy) {
939            this.namePolicy = namePolicy;
940            if (tokenizer != null) {
941                tokenizer.setNamePolicy(namePolicy);
942            }
943        }
944    
945        /**
946         * This is a catch-all convenience method for setting name, xmlns, content space, 
947         * content non-XML char and comment policies in one go. This does not affect the 
948         * streamability policy or doctype reporting.
949         * 
950         * @param xmlPolicy
951         */
952        public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
953            setNamePolicy(xmlPolicy);
954            setXmlnsPolicy(xmlPolicy == XmlViolationPolicy.FATAL ? XmlViolationPolicy.ALTER_INFOSET : xmlPolicy);
955            setContentSpacePolicy(xmlPolicy);
956            setContentNonXmlCharPolicy(xmlPolicy);
957            setCommentPolicy(xmlPolicy);
958            setBogusXmlnsPolicy(xmlPolicy);
959        }
960    
961        /**
962         * The policy for non-NCName element and attribute names.
963         * 
964         * @return the namePolicy
965         */
966        public XmlViolationPolicy getNamePolicy() {
967            return namePolicy;
968        }
969    
970        /**
971         * Sets the policy for forbidden <code>xmlns</code> attributes.
972         * @param bogusXmlnsPolicy the policy
973         * @see nu.validator.htmlparser.impl.Tokenizer#setBogusXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
974         */
975        public void setBogusXmlnsPolicy(
976                XmlViolationPolicy bogusXmlnsPolicy) {
977            this.bogusXmlnsPolicy = bogusXmlnsPolicy;
978            if (tokenizer != null) {
979                tokenizer.setBogusXmlnsPolicy(bogusXmlnsPolicy);
980            }
981        }
982    
983        /**
984         * Returns the bogusXmlnsPolicy.
985         * 
986         * @return the bogusXmlnsPolicy
987         */
988        public XmlViolationPolicy getBogusXmlnsPolicy() {
989            return bogusXmlnsPolicy;
990        }
991        
992        public void addCharacterHandler(CharacterHandler characterHandler) {
993            this.characterHandlers.add(characterHandler);
994            if (tokenizer != null) {
995                tokenizer.addCharacterHandler(characterHandler);
996            }
997        }
998    }