001 /*
002 * Copyright (c) 2007 Henri Sivonen
003 * Copyright (c) 2007 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.htmlparser.sax;
025
026 import java.io.IOException;
027 import java.net.MalformedURLException;
028 import java.net.URL;
029 import java.util.LinkedList;
030 import java.util.List;
031
032 import nu.validator.htmlparser.common.DoctypeExpectation;
033 import nu.validator.htmlparser.common.DocumentModeHandler;
034 import nu.validator.htmlparser.common.XmlViolationPolicy;
035 import nu.validator.htmlparser.impl.CharacterHandler;
036 import nu.validator.htmlparser.impl.Tokenizer;
037 import nu.validator.htmlparser.impl.TreeBuilder;
038 import nu.validator.saxtree.Document;
039 import nu.validator.saxtree.DocumentFragment;
040 import nu.validator.saxtree.TreeParser;
041
042 import org.xml.sax.ContentHandler;
043 import org.xml.sax.DTDHandler;
044 import org.xml.sax.EntityResolver;
045 import org.xml.sax.ErrorHandler;
046 import org.xml.sax.InputSource;
047 import org.xml.sax.Locator;
048 import org.xml.sax.SAXException;
049 import org.xml.sax.SAXNotRecognizedException;
050 import org.xml.sax.SAXNotSupportedException;
051 import org.xml.sax.XMLReader;
052 import org.xml.sax.ext.LexicalHandler;
053 import org.xml.sax.helpers.DefaultHandler;
054
055 /**
056 * This class implements an HTML5 parser that exposes data through the SAX2
057 * interface.
058 *
059 * <p>By default, when using the constructor without arguments,
060 * this parser treats XML 1.0-incompatible infosets as fatal errors in
061 * order to adhere to the SAX2 API contract strictly. This corresponds to
062 * <code>FATAL</code> as the general XML violation policy. To make the parser
063 * support non-conforming HTML fully per the HTML 5 spec while on the other
064 * hand potentially violating the SAX2 API contract, set the general XML
065 * violation policy to <code>ALLOW</code>. Handling all input without fatal
066 * errors and without violating the SAX2 API contract is possible by setting
067 * the general XML violation policy to <code>ALTER_INFOSET</code>. <em>This
068 * makes the parser non-conforming</em> but is probably the most useful
069 * setting for most applications.
070 *
071 * <p>By default, this parser doesn't do true streaming but buffers everything
072 * first. The parser can be made truly streaming by calling
073 * <code>setStreamabilityViolationPolicy(XmlViolationPolicy.FATAL)</code>. This
074 * has the consequence that errors that require non-streamable recovery are
075 * treated as fatal.
076 *
077 * <p>By default, in order to make the parse events emulate the parse events
078 * for a DTDless XML document, the parser does not report the doctype through
079 * <code>LexicalHandler</code>. Doctype reporting through
080 * <code>LexicalHandler</code> can be turned on by calling
081 * <code>setReportingDoctype(true)</code>.
082 *
083 * @version $Id: HtmlParser.java 161 2007-10-02 09:10:00Z hsivonen $
084 * @author hsivonen
085 */
086 public class HtmlParser implements XMLReader {
087
088 private Tokenizer tokenizer = null;
089
090 private TreeBuilder<?> treeBuilder = null;
091
092 private SAXStreamer saxStreamer = null; // work around javac bug
093
094 private SAXTreeBuilder saxTreeBuilder = null; // work around javac bug
095
096 private ContentHandler contentHandler = null;
097
098 private LexicalHandler lexicalHandler = null;
099
100 private DTDHandler dtdHandler = null;
101
102 private EntityResolver entityResolver = null;
103
104 private ErrorHandler errorHandler = null;
105
106 private DocumentModeHandler documentModeHandler = null;
107
108 private DoctypeExpectation doctypeExpectation = DoctypeExpectation.HTML;
109
110 private boolean checkingNormalization = false;
111
112 private boolean scriptingEnabled = false;
113
114 private final List<CharacterHandler> characterHandlers = new LinkedList<CharacterHandler>();
115
116 private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.FATAL;
117
118 private XmlViolationPolicy contentNonXmlCharPolicy = XmlViolationPolicy.FATAL;
119
120 private XmlViolationPolicy commentPolicy = XmlViolationPolicy.FATAL;
121
122 private XmlViolationPolicy namePolicy = XmlViolationPolicy.FATAL;
123
124 private XmlViolationPolicy streamabilityViolationPolicy = XmlViolationPolicy.ALLOW;
125
126 private boolean html4ModeCompatibleWithXhtml1Schemata;
127
128 private boolean mappingLangToXmlLang;
129
130 private XmlViolationPolicy xmlnsPolicy;
131
132 private XmlViolationPolicy bogusXmlnsPolicy;
133
134 private boolean reportingDoctype = true;
135
136 private ErrorHandler treeBuilderErrorHandler;
137
/**
 * Creates a parser whose general XML violation policy is
 * <code>XmlViolationPolicy.FATAL</code>.
 *
 * <p>This is shorthand for invoking the one-argument constructor with
 * <code>XmlViolationPolicy.FATAL</code>.
 */
public HtmlParser() {
    this(XmlViolationPolicy.FATAL);
}
145
/**
 * Creates a parser configured with the given general XML violation policy.
 *
 * @param xmlPolicy the policy that is handed to <code>setXmlPolicy</code>
 */
public HtmlParser(XmlViolationPolicy xmlPolicy) {
    this.setXmlPolicy(xmlPolicy);
}
153
154 /**
155 * This class wraps differnt tree builders depending on configuration. This
156 * method does the work of hiding this from the user of the class.
157 */
158 private void lazyInit() {
159 if (tokenizer == null) {
160 if (streamabilityViolationPolicy == XmlViolationPolicy.ALLOW) {
161 this.saxTreeBuilder = new SAXTreeBuilder();
162 this.treeBuilder = this.saxTreeBuilder;
163 this.saxStreamer = null;
164 } else {
165 this.saxStreamer = new SAXStreamer();
166 this.treeBuilder = this.saxStreamer;
167 this.saxTreeBuilder = null;
168 }
169 this.tokenizer = new Tokenizer(treeBuilder);
170 this.tokenizer.setErrorHandler(errorHandler);
171 this.treeBuilder.setErrorHandler(treeBuilderErrorHandler);
172 this.tokenizer.setCheckingNormalization(checkingNormalization);
173 this.tokenizer.setCommentPolicy(commentPolicy);
174 this.tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
175 this.tokenizer.setContentSpacePolicy(contentSpacePolicy);
176 this.tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
177 this.tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
178 this.tokenizer.setXmlnsPolicy(xmlnsPolicy);
179 for (CharacterHandler characterHandler : characterHandlers) {
180 this.tokenizer.addCharacterHandler(characterHandler);
181 }
182 this.treeBuilder.setDoctypeExpectation(doctypeExpectation);
183 this.treeBuilder.setDocumentModeHandler(documentModeHandler);
184 this.treeBuilder.setIgnoringComments(lexicalHandler == null);
185 this.treeBuilder.setScriptingEnabled(scriptingEnabled);
186 this.treeBuilder.setReportingDoctype(reportingDoctype);
187 if (saxStreamer != null) {
188 saxStreamer.setContentHandler(contentHandler == null ? new DefaultHandler()
189 : contentHandler);
190 saxStreamer.setLexicalHandler(lexicalHandler);
191 }
192 }
193 }
194
195 /**
196 * @see org.xml.sax.XMLReader#getContentHandler()
197 */
198 public ContentHandler getContentHandler() {
199 return contentHandler;
200 }
201
202 /**
203 * @see org.xml.sax.XMLReader#getDTDHandler()
204 */
205 public DTDHandler getDTDHandler() {
206 return dtdHandler;
207 }
208
209 /**
210 * @see org.xml.sax.XMLReader#getEntityResolver()
211 */
212 public EntityResolver getEntityResolver() {
213 return entityResolver;
214 }
215
216 /**
217 * @see org.xml.sax.XMLReader#getErrorHandler()
218 */
219 public ErrorHandler getErrorHandler() {
220 return errorHandler;
221 }
222
223 /**
224 * Exposes the configuration of the emulated XML parser as well as
225 * boolean-valued configuration without using non-<code>XMLReader</code>
226 * getters directly.
227 *
228 * <dl>
229 * <dt><code>http://xml.org/sax/features/external-general-entities</code></dt>
230 * <dd><code>false</code></dd>
231 * <dt><code>http://xml.org/sax/features/external-parameter-entities</code></dt>
232 * <dd><code>false</code></dd>
233 * <dt><code>http://xml.org/sax/features/is-standalone</code></dt>
234 * <dd><code>true</code></dd>
235 * <dt><code>http://xml.org/sax/features/lexical-handler/parameter-entities</code></dt>
236 * <dd><code>false</code></dd>
237 * <dt><code>http://xml.org/sax/features/namespaces</code></dt>
238 * <dd><code>true</code></dd>
239 * <dt><code>http://xml.org/sax/features/namespace-prefixes</code></dt>
240 * <dd><code>false</code></dd>
241 * <dt><code>http://xml.org/sax/features/resolve-dtd-uris</code></dt>
242 * <dd><code>true</code></dd>
243 * <dt><code>http://xml.org/sax/features/string-interning</code></dt>
244 * <dd><code>false</code></dd>
245 * <dt><code>http://xml.org/sax/features/unicode-normalization-checking</code></dt>
246 * <dd><code>isCheckingNormalization</code></dd>
247 * <dt><code>http://xml.org/sax/features/use-attributes2</code></dt>
248 * <dd><code>false</code></dd>
249 * <dt><code>http://xml.org/sax/features/use-locator2</code></dt>
250 * <dd><code>false</code></dd>
251 * <dt><code>http://xml.org/sax/features/use-entity-resolver2</code></dt>
252 * <dd><code>false</code></dd>
253 * <dt><code>http://xml.org/sax/features/validation</code></dt>
254 * <dd><code>false</code></dd>
255 * <dt><code>http://xml.org/sax/features/xmlns-uris</code></dt>
256 * <dd><code>false</code></dd>
257 * <dt><code>http://xml.org/sax/features/xml-1.1</code></dt>
258 * <dd><code>false</code></dd>
259 * <dt><code>http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata</code></dt>
260 * <dd><code>isHtml4ModeCompatibleWithXhtml1Schemata</code></dd>
261 * <dt><code>http://validator.nu/features/mapping-lang-to-xml-lang</code></dt>
262 * <dd><code>isMappingLangToXmlLang</code></dd>
263 * <dt><code>http://validator.nu/features/scripting-enabled</code></dt>
264 * <dd><code>isScriptingEnabled</code></dd>
265 * </dl>
266 *
267 * @param name
268 * feature URI string
269 * @return a value per the list above
270 * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
271 */
public boolean getFeature(String name) throws SAXNotRecognizedException,
        SAXNotSupportedException {
    // Features backed by configurable state on this object.
    if ("http://xml.org/sax/features/unicode-normalization-checking".equals(name)) {
        // The checks aren't really per XML 1.1.
        return isCheckingNormalization();
    }
    if ("http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata".equals(name)) {
        return isHtml4ModeCompatibleWithXhtml1Schemata();
    }
    if ("http://validator.nu/features/mapping-lang-to-xml-lang".equals(name)) {
        return isMappingLangToXmlLang();
    }
    if ("http://validator.nu/features/scripting-enabled".equals(name)) {
        return isScriptingEnabled();
    }

    // Standard features that are always reported as true.
    // resolve-dtd-uris is the default value; the applicable scenario
    // never happens.
    if ("http://xml.org/sax/features/is-standalone".equals(name)
            || "http://xml.org/sax/features/namespaces".equals(name)
            || "http://xml.org/sax/features/resolve-dtd-uris".equals(name)) {
        return true;
    }

    // Standard features that are always reported as false.
    // string-interning: XXX revisit.
    if ("http://xml.org/sax/features/external-general-entities".equals(name)
            || "http://xml.org/sax/features/external-parameter-entities".equals(name)
            || "http://xml.org/sax/features/lexical-handler/parameter-entities".equals(name)
            || "http://xml.org/sax/features/namespace-prefixes".equals(name)
            || "http://xml.org/sax/features/string-interning".equals(name)
            || "http://xml.org/sax/features/use-attributes2".equals(name)
            || "http://xml.org/sax/features/use-locator2".equals(name)
            || "http://xml.org/sax/features/use-entity-resolver2".equals(name)
            || "http://xml.org/sax/features/validation".equals(name)
            || "http://xml.org/sax/features/xmlns-uris".equals(name)
            || "http://xml.org/sax/features/xml-1.1".equals(name)) {
        return false;
    }

    // Name the offending URI so the caller can tell which lookup failed.
    throw new SAXNotRecognizedException("Unrecognized feature: " + name);
}
315
316 /**
317 * Allows <code>XMLReader</code>-level access to non-boolean valued
318 * getters.
319 *
320 * <p>
321 * The properties are mapped as follows:
322 *
323 * <dl>
324 * <dt><code>http://xml.org/sax/properties/document-xml-version</code></dt>
325 * <dd><code>"1.0"</code></dd>
326 * <dt><code>http://xml.org/sax/properties/lexical-handler</code></dt>
327 * <dd><code>getLexicalHandler</code></dd>
328 * <dt><code>http://validator.nu/properties/content-space-policy</code></dt>
329 * <dd><code>getContentSpacePolicy</code></dd>
330 * <dt><code>http://validator.nu/properties/content-non-xml-char-policy</code></dt>
331 * <dd><code>getContentNonXmlCharPolicy</code></dd>
332 * <dt><code>http://validator.nu/properties/comment-policy</code></dt>
333 * <dd><code>getCommentPolicy</code></dd>
334 * <dt><code>http://validator.nu/properties/xmlns-policy</code></dt>
335 * <dd><code>getXmlnsPolicy</code></dd>
336 * <dt><code>http://validator.nu/properties/name-policy</code></dt>
337 * <dd><code>getNamePolicy</code></dd>
338 * <dt><code>http://validator.nu/properties/streamability-violation-policy</code></dt>
339 * <dd><code>getStreamabilityViolationPolicy</code></dd>
340 * <dt><code>http://validator.nu/properties/document-mode-handler</code></dt>
341 * <dd><code>getDocumentModeHandler</code></dd>
342 * <dt><code>http://validator.nu/properties/doctype-expectation</code></dt>
343 * <dd><code>getDoctypeExpectation</code></dd>
344 * <dt><code>http://xml.org/sax/features/unicode-normalization-checking</code></dt>
345 * </dl>
346 *
347 * @param name
348 * property URI string
349 * @return a value per the list above
350 * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
351 */
352 public Object getProperty(String name) throws SAXNotRecognizedException,
353 SAXNotSupportedException {
354 if ("http://xml.org/sax/properties/declaration-handler".equals(name)) {
355 throw new SAXNotSupportedException(
356 "This parser does not suppert DeclHandler.");
357 } else if ("http://xml.org/sax/properties/document-xml-version".equals(name)) {
358 return "1.0"; // Emulating an XML 1.1 parser is not supported.
359 } else if ("http://xml.org/sax/properties/dom-node".equals(name)) {
360 throw new SAXNotSupportedException(
361 "This parser does not walk the DOM.");
362 } else if ("http://xml.org/sax/properties/lexical-handler".equals(name)) {
363 return getLexicalHandler();
364 } else if ("http://xml.org/sax/properties/xml-string".equals(name)) {
365 throw new SAXNotSupportedException(
366 "This parser does not expose the source as a string.");
367 } else if ("http://validator.nu/properties/content-space-policy".equals(name)) {
368 return getContentSpacePolicy();
369 } else if ("http://validator.nu/properties/content-non-xml-char-policy".equals(name)) {
370 return getContentNonXmlCharPolicy();
371 } else if ("http://validator.nu/properties/comment-policy".equals(name)) {
372 return getCommentPolicy();
373 } else if ("http://validator.nu/properties/xmlns-policy".equals(name)) {
374 return getXmlnsPolicy();
375 } else if ("http://validator.nu/properties/name-policy".equals(name)) {
376 return getNamePolicy();
377 } else if ("http://validator.nu/properties/streamability-violation-policy".equals(name)) {
378 return getStreamabilityViolationPolicy();
379 } else if ("http://validator.nu/properties/document-mode-handler".equals(name)) {
380 return getDocumentModeHandler();
381 } else if ("http://validator.nu/properties/doctype-expectation".equals(name)) {
382 return getDoctypeExpectation();
383 } else if ("http://validator.nu/properties/xml-policy".equals(name)) {
384 throw new SAXNotSupportedException(
385 "Cannot get a convenience setter.");
386 } else {
387 throw new SAXNotRecognizedException();
388 }
389 }
390
391 /**
392 * @see org.xml.sax.XMLReader#parse(org.xml.sax.InputSource)
393 */
394 public void parse(InputSource input) throws IOException, SAXException {
395 lazyInit();
396 try {
397 treeBuilder.setFragmentContext(null);
398 tokenize(input);
399 } finally {
400 if (saxTreeBuilder != null) {
401 Document document = saxTreeBuilder.getDocument();
402 if (document != null) {
403 new TreeParser(contentHandler, lexicalHandler).parse(document);
404 }
405 }
406 }
407 }
408
409 /**
410 * Parser a fragment.
411 *
412 * @param input the input to parse
413 * @param context the name of the context element
414 * @throws IOException
415 * @throws SAXException
416 */
417 public void parseFragment(InputSource input, String context)
418 throws IOException, SAXException {
419 lazyInit();
420 try {
421 treeBuilder.setFragmentContext(context);
422 tokenize(input);
423 } finally {
424 if (saxTreeBuilder != null) {
425 DocumentFragment fragment = saxTreeBuilder.getDocumentFragment();
426 new TreeParser(contentHandler, lexicalHandler).parse(fragment);
427 }
428 }
429 }
430
431 /**
432 * @param is
433 * @throws SAXException
434 * @throws IOException
435 * @throws MalformedURLException
436 */
437 private void tokenize(InputSource is) throws SAXException, IOException, MalformedURLException {
438 if (is == null) {
439 throw new IllegalArgumentException("Null input.");
440 }
441 if (is.getByteStream() == null && is.getCharacterStream() == null) {
442 String systemId = is.getSystemId();
443 if (systemId == null) {
444 throw new IllegalArgumentException("No byte stream, no character stream nor URI.");
445 }
446 if (entityResolver != null) {
447 is = entityResolver.resolveEntity(is.getPublicId(), systemId);
448 }
449 if (is.getByteStream() == null || is.getCharacterStream() == null) {
450 is = new InputSource();
451 is.setSystemId(systemId);
452 is.setByteStream(new URL(systemId).openStream());
453 }
454 }
455 tokenizer.tokenize(is);
456 }
457
458 /**
459 * @see org.xml.sax.XMLReader#parse(java.lang.String)
460 */
461 public void parse(String systemId) throws IOException, SAXException {
462 parse(new InputSource(systemId));
463 }
464
465 /**
466 * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler)
467 */
468 public void setContentHandler(ContentHandler handler) {
469 contentHandler = handler;
470 if (saxStreamer != null) {
471 saxStreamer.setContentHandler(contentHandler == null ? new DefaultHandler()
472 : contentHandler);
473 }
474 }
475
476 /**
477 * Sets the lexical handler.
478 * @param handler the hander.
479 */
480 public void setLexicalHandler(LexicalHandler handler) {
481 lexicalHandler = handler;
482 if (treeBuilder != null) {
483 treeBuilder.setIgnoringComments(handler == null);
484 if (saxStreamer != null) {
485 saxStreamer.setLexicalHandler(handler);
486 }
487 }
488 }
489
490 /**
491 * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
492 */
493 public void setDTDHandler(DTDHandler handler) {
494 dtdHandler = handler;
495 }
496
497 /**
498 * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
499 */
500 public void setEntityResolver(EntityResolver resolver) {
501 entityResolver = resolver;
502 }
503
504 /**
505 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
506 */
507 public void setErrorHandler(ErrorHandler handler) {
508 errorHandler = handler;
509 treeBuilderErrorHandler = handler;
510 if (tokenizer != null) {
511 tokenizer.setErrorHandler(handler);
512 treeBuilder.setErrorHandler(handler);
513 }
514 }
515
516 /**
517 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
518 * @deprecated For Validator.nu internal use
519 */
520 public void setTreeBuilderErrorHandlerOverride(ErrorHandler handler) {
521 treeBuilderErrorHandler = handler;
522 if (tokenizer != null) {
523 treeBuilder.setErrorHandler(handler);
524 }
525 }
526
527 /**
528 * Sets a boolean feature without having to use non-<code>XMLReader</code>
529 * setters directly.
530 *
531 * <p>
532 * The supported features are:
533 *
534 * <dl>
535 * <dt><code>http://xml.org/sax/features/unicode-normalization-checking</code></dt>
536 * <dd><code>setCheckingNormalization</code></dd>
537 * <dt><code>http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata</code></dt>
538 * <dd><code>setHtml4ModeCompatibleWithXhtml1Schemata</code></dd>
539 * <dt><code>http://validator.nu/features/mapping-lang-to-xml-lang</code></dt>
540 * <dd><code>setMappingLangToXmlLang</code></dd>
541 * <dt><code>http://validator.nu/features/scripting-enabled</code></dt>
542 * <dd><code>setScriptingEnabled</code></dd>
543 * </dl>
544 *
545 * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
546 */
547 public void setFeature(String name, boolean value)
548 throws SAXNotRecognizedException, SAXNotSupportedException {
549 if ("http://xml.org/sax/features/external-general-entities".equals(name)) {
550 throw new SAXNotSupportedException("Cannot set " + name + ".");
551 } else if ("http://xml.org/sax/features/external-parameter-entities".equals(name)) {
552 throw new SAXNotSupportedException("Cannot set " + name + ".");
553 } else if ("http://xml.org/sax/features/is-standalone".equals(name)) {
554 throw new SAXNotSupportedException("Cannot set " + name + ".");
555 } else if ("http://xml.org/sax/features/lexical-handler/parameter-entities".equals(name)) {
556 throw new SAXNotSupportedException("Cannot set " + name + ".");
557 } else if ("http://xml.org/sax/features/namespaces".equals(name)) {
558 throw new SAXNotSupportedException("Cannot set " + name + ".");
559 } else if ("http://xml.org/sax/features/namespace-prefixes".equals(name)) {
560 throw new SAXNotSupportedException("Cannot set " + name + ".");
561 } else if ("http://xml.org/sax/features/resolve-dtd-uris".equals(name)) {
562 throw new SAXNotSupportedException("Cannot set " + name + ".");
563 } else if ("http://xml.org/sax/features/string-interning".equals(name)) {
564 throw new SAXNotSupportedException("Cannot set " + name + ".");
565 } else if ("http://xml.org/sax/features/unicode-normalization-checking".equals(name)) {
566 setCheckingNormalization(value);
567 } else if ("http://xml.org/sax/features/use-attributes2".equals(name)) {
568 throw new SAXNotSupportedException("Cannot set " + name + ".");
569 } else if ("http://xml.org/sax/features/use-locator2".equals(name)) {
570 throw new SAXNotSupportedException("Cannot set " + name + ".");
571 } else if ("http://xml.org/sax/features/use-entity-resolver2".equals(name)) {
572 throw new SAXNotSupportedException("Cannot set " + name + ".");
573 } else if ("http://xml.org/sax/features/validation".equals(name)) {
574 throw new SAXNotSupportedException("Cannot set " + name + ".");
575 } else if ("http://xml.org/sax/features/xmlns-uris".equals(name)) {
576 throw new SAXNotSupportedException("Cannot set " + name + ".");
577 } else if ("http://xml.org/sax/features/xml-1.1".equals(name)) {
578 throw new SAXNotSupportedException("Cannot set " + name + ".");
579 } else if ("http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata".equals(name)) {
580 setHtml4ModeCompatibleWithXhtml1Schemata(value);
581 } else if ("http://validator.nu/features/mapping-lang-to-xml-lang".equals(name)) {
582 setMappingLangToXmlLang(value);
583 } else if ("http://validator.nu/features/scripting-enabled".equals(name)) {
584 setScriptingEnabled(value);
585 } else {
586 throw new SAXNotRecognizedException();
587 }
588 }
589
590 /**
591 * Sets a non-boolean property without having to use non-<code>XMLReader</code>
592 * setters directly.
593 *
594 * <dl>
595 * <dt><code>http://xml.org/sax/properties/lexical-handler</code></dt>
596 * <dd><code>setLexicalHandler</code></dd>
597 * <dt><code>http://validator.nu/properties/content-space-policy</code></dt>
598 * <dd><code>setContentSpacePolicy</code></dd>
599 * <dt><code>http://validator.nu/properties/content-non-xml-char-policy</code></dt>
600 * <dd><code>setContentNonXmlCharPolicy</code></dd>
601 * <dt><code>http://validator.nu/properties/comment-policy</code></dt>
602 * <dd><code>setCommentPolicy</code></dd>
603 * <dt><code>http://validator.nu/properties/xmlns-policy</code></dt>
604 * <dd><code>setXmlnsPolicy</code></dd>
605 * <dt><code>http://validator.nu/properties/name-policy</code></dt>
606 * <dd><code>setNamePolicy</code></dd>
607 * <dt><code>http://validator.nu/properties/streamability-violation-policy</code></dt>
608 * <dd><code>setStreamabilityViolationPolicy</code></dd>
609 * <dt><code>http://validator.nu/properties/document-mode-handler</code></dt>
610 * <dd><code>setDocumentModeHandler</code></dd>
611 * <dt><code>http://validator.nu/properties/doctype-expectation</code></dt>
612 * <dd><code>setDoctypeExpectation</code></dd>
613 * <dt><code>http://validator.nu/properties/xml-policy</code></dt>
614 * <dd><code>setXmlPolicy</code></dd>
615 * </dl>
616 *
617 * @see org.xml.sax.XMLReader#setProperty(java.lang.String,
618 * java.lang.Object)
619 */
620 public void setProperty(String name, Object value)
621 throws SAXNotRecognizedException, SAXNotSupportedException {
622 if ("http://xml.org/sax/properties/declaration-handler".equals(name)) {
623 throw new SAXNotSupportedException(
624 "This parser does not suppert DeclHandler.");
625 } else if ("http://xml.org/sax/properties/document-xml-version".equals(name)) {
626 throw new SAXNotSupportedException(
627 "Can't set document-xml-version.");
628 } else if ("http://xml.org/sax/properties/dom-node".equals(name)) {
629 throw new SAXNotSupportedException("Can't set dom-node.");
630 } else if ("http://xml.org/sax/properties/lexical-handler".equals(name)) {
631 setLexicalHandler((LexicalHandler) value);
632 } else if ("http://xml.org/sax/properties/xml-string".equals(name)) {
633 throw new SAXNotSupportedException("Can't set xml-string.");
634 } else if ("http://validator.nu/properties/content-space-policy".equals(name)) {
635 setContentSpacePolicy((XmlViolationPolicy) value);
636 } else if ("http://validator.nu/properties/content-non-xml-char-policy".equals(name)) {
637 setContentNonXmlCharPolicy((XmlViolationPolicy) value);
638 } else if ("http://validator.nu/properties/comment-policy".equals(name)) {
639 setCommentPolicy((XmlViolationPolicy) value);
640 } else if ("http://validator.nu/properties/xmlns-policy".equals(name)) {
641 setXmlnsPolicy((XmlViolationPolicy) value);
642 } else if ("http://validator.nu/properties/name-policy".equals(name)) {
643 setNamePolicy((XmlViolationPolicy) value);
644 } else if ("http://validator.nu/properties/streamability-violation-policy".equals(name)) {
645 setStreamabilityViolationPolicy((XmlViolationPolicy) value);
646 } else if ("http://validator.nu/properties/document-mode-handler".equals(name)) {
647 setDocumentModeHandler((DocumentModeHandler) value);
648 } else if ("http://validator.nu/properties/doctype-expectation".equals(name)) {
649 setDoctypeExpectation((DoctypeExpectation) value);
650 } else if ("http://validator.nu/properties/xml-policy".equals(name)) {
651 setXmlPolicy((XmlViolationPolicy) value);
652 } else {
653 throw new SAXNotRecognizedException();
654 }
655 }
656
657 /**
658 * Indicates whether NFC normalization of source is being checked.
659 * @return <code>true</code> if NFC normalization of source is being checked.
660 * @see nu.validator.htmlparser.impl.Tokenizer#isCheckingNormalization()
661 */
662 public boolean isCheckingNormalization() {
663 return checkingNormalization;
664 }
665
666 /**
667 * Toggles the checking of the NFC normalization of source.
668 * @param enable <code>true</code> to check normalization
669 * @see nu.validator.htmlparser.impl.Tokenizer#setCheckingNormalization(boolean)
670 */
671 public void setCheckingNormalization(boolean enable) {
672 this.checkingNormalization = enable;
673 if (tokenizer != null) {
674 tokenizer.setCheckingNormalization(checkingNormalization);
675 }
676 }
677
678 /**
679 * Sets the policy for consecutive hyphens in comments.
680 * @param commentPolicy the policy
681 * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
682 */
683 public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
684 this.commentPolicy = commentPolicy;
685 if (tokenizer != null) {
686 tokenizer.setCommentPolicy(commentPolicy);
687 }
688 }
689
690 /**
691 * Sets the policy for non-XML characters except white space.
692 * @param contentNonXmlCharPolicy the policy
693 * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
694 */
695 public void setContentNonXmlCharPolicy(
696 XmlViolationPolicy contentNonXmlCharPolicy) {
697 this.contentNonXmlCharPolicy = contentNonXmlCharPolicy;
698 if (tokenizer != null) {
699 tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
700 }
701 }
702
703 /**
704 * Sets the policy for non-XML white space.
705 * @param contentSpacePolicy the policy
706 * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
707 */
708 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
709 this.contentSpacePolicy = contentSpacePolicy;
710 if (tokenizer != null) {
711 tokenizer.setContentSpacePolicy(contentSpacePolicy);
712 }
713 }
714
715 /**
716 * Whether the parser considers scripting to be enabled for noscript treatment.
717 *
718 * @return <code>true</code> if enabled
719 * @see nu.validator.htmlparser.impl.TreeBuilder#isScriptingEnabled()
720 */
721 public boolean isScriptingEnabled() {
722 return scriptingEnabled;
723 }
724
725 /**
726 * Sets whether the parser considers scripting to be enabled for noscript treatment.
727 * @param scriptingEnabled <code>true</code> to enable
728 * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
729 */
730 public void setScriptingEnabled(boolean scriptingEnabled) {
731 this.scriptingEnabled = scriptingEnabled;
732 if (treeBuilder != null) {
733 treeBuilder.setScriptingEnabled(scriptingEnabled);
734 }
735 }
736
737 /**
738 * Returns the doctype expectation.
739 *
740 * @return the doctypeExpectation
741 */
742 public DoctypeExpectation getDoctypeExpectation() {
743 return doctypeExpectation;
744 }
745
746 /**
747 * Sets the doctype expectation.
748 *
749 * @param doctypeExpectation
750 * the doctypeExpectation to set
751 * @see nu.validator.htmlparser.impl.TreeBuilder#setDoctypeExpectation(nu.validator.htmlparser.common.DoctypeExpectation)
752 */
753 public void setDoctypeExpectation(DoctypeExpectation doctypeExpectation) {
754 this.doctypeExpectation = doctypeExpectation;
755 if (treeBuilder != null) {
756 treeBuilder.setDoctypeExpectation(doctypeExpectation);
757 }
758 }
759
760 /**
761 * Returns the document mode handler.
762 *
763 * @return the documentModeHandler
764 */
765 public DocumentModeHandler getDocumentModeHandler() {
766 return documentModeHandler;
767 }
768
769 /**
770 * Sets the document mode handler.
771 *
772 * @param documentModeHandler
773 * the documentModeHandler to set
774 * @see nu.validator.htmlparser.impl.TreeBuilder#setDocumentModeHandler(nu.validator.htmlparser.common.DocumentModeHandler)
775 */
776 public void setDocumentModeHandler(DocumentModeHandler documentModeHandler) {
777 this.documentModeHandler = documentModeHandler;
778 }
779
780 /**
781 * Returns the streamabilityViolationPolicy.
782 *
783 * @return the streamabilityViolationPolicy
784 */
785 public XmlViolationPolicy getStreamabilityViolationPolicy() {
786 return streamabilityViolationPolicy;
787 }
788
789 /**
790 * Sets the streamabilityViolationPolicy.
791 *
792 * @param streamabilityViolationPolicy
793 * the streamabilityViolationPolicy to set
794 */
795 public void setStreamabilityViolationPolicy(
796 XmlViolationPolicy streamabilityViolationPolicy) {
797 this.streamabilityViolationPolicy = streamabilityViolationPolicy;
798 }
799
800 /**
801 * Whether the HTML 4 mode reports boolean attributes in a way that repeats
802 * the name in the value.
803 * @param html4ModeCompatibleWithXhtml1Schemata
804 */
805 public void setHtml4ModeCompatibleWithXhtml1Schemata(
806 boolean html4ModeCompatibleWithXhtml1Schemata) {
807 this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
808 if (tokenizer != null) {
809 tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
810 }
811 }
812
813 /**
814 * Returns the <code>Locator</code> during parse.
815 * @return the <code>Locator</code>
816 */
817 public Locator getDocumentLocator() {
818 return tokenizer;
819 }
820
821 /**
822 * Whether the HTML 4 mode reports boolean attributes in a way that repeats
823 * the name in the value.
824 *
825 * @return the html4ModeCompatibleWithXhtml1Schemata
826 */
827 public boolean isHtml4ModeCompatibleWithXhtml1Schemata() {
828 return html4ModeCompatibleWithXhtml1Schemata;
829 }
830
831 /**
832 * Whether <code>lang</code> is mapped to <code>xml:lang</code>.
833 * @param mappingLangToXmlLang
834 * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
835 */
836 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
837 this.mappingLangToXmlLang = mappingLangToXmlLang;
838 if (tokenizer != null) {
839 tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
840 }
841 }
842
843 /**
844 * Whether <code>lang</code> is mapped to <code>xml:lang</code>.
845 *
846 * @return the mappingLangToXmlLang
847 */
848 public boolean isMappingLangToXmlLang() {
849 return mappingLangToXmlLang;
850 }
851
852 /**
853 * Whether the <code>xmlns</code> attribute on the root element is
854 * passed to through. (FATAL not allowed.)
855 * @param xmlnsPolicy
856 * @see nu.validator.htmlparser.impl.Tokenizer#setXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
857 */
858 public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
859 if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
860 throw new IllegalArgumentException("Can't use FATAL here.");
861 }
862 this.xmlnsPolicy = xmlnsPolicy;
863 if (tokenizer != null) {
864 tokenizer.setXmlnsPolicy(xmlnsPolicy);
865 }
866 }
867
868 /**
869 * Returns the xmlnsPolicy.
870 *
871 * @return the xmlnsPolicy
872 */
873 public XmlViolationPolicy getXmlnsPolicy() {
874 return xmlnsPolicy;
875 }
876
877 /**
878 * Returns the lexicalHandler.
879 *
880 * @return the lexicalHandler
881 */
882 public LexicalHandler getLexicalHandler() {
883 return lexicalHandler;
884 }
885
886 /**
887 * Returns the commentPolicy.
888 *
889 * @return the commentPolicy
890 */
891 public XmlViolationPolicy getCommentPolicy() {
892 return commentPolicy;
893 }
894
895 /**
896 * Returns the contentNonXmlCharPolicy.
897 *
898 * @return the contentNonXmlCharPolicy
899 */
900 public XmlViolationPolicy getContentNonXmlCharPolicy() {
901 return contentNonXmlCharPolicy;
902 }
903
904 /**
905 * Returns the contentSpacePolicy.
906 *
907 * @return the contentSpacePolicy
908 */
909 public XmlViolationPolicy getContentSpacePolicy() {
910 return contentSpacePolicy;
911 }
912
913 /**
914 * @param reportingDoctype
915 * @see nu.validator.htmlparser.impl.TreeBuilder#setReportingDoctype(boolean)
916 */
917 public void setReportingDoctype(boolean reportingDoctype) {
918 this.reportingDoctype = reportingDoctype;
919 if (treeBuilder != null) {
920 treeBuilder.setReportingDoctype(reportingDoctype);
921 }
922 }
923
924 /**
925 * Returns the reportingDoctype.
926 *
927 * @return the reportingDoctype
928 */
929 public boolean isReportingDoctype() {
930 return reportingDoctype;
931 }
932
933 /**
934 * The policy for non-NCName element and attribute names.
935 * @param namePolicy
936 * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
937 */
938 public void setNamePolicy(XmlViolationPolicy namePolicy) {
939 this.namePolicy = namePolicy;
940 if (tokenizer != null) {
941 tokenizer.setNamePolicy(namePolicy);
942 }
943 }
944
945 /**
946 * This is a catch-all convenience method for setting name, xmlns, content space,
947 * content non-XML char and comment policies in one go. This does not affect the
948 * streamability policy or doctype reporting.
949 *
950 * @param xmlPolicy
951 */
952 public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
953 setNamePolicy(xmlPolicy);
954 setXmlnsPolicy(xmlPolicy == XmlViolationPolicy.FATAL ? XmlViolationPolicy.ALTER_INFOSET : xmlPolicy);
955 setContentSpacePolicy(xmlPolicy);
956 setContentNonXmlCharPolicy(xmlPolicy);
957 setCommentPolicy(xmlPolicy);
958 setBogusXmlnsPolicy(xmlPolicy);
959 }
960
961 /**
962 * The policy for non-NCName element and attribute names.
963 *
964 * @return the namePolicy
965 */
966 public XmlViolationPolicy getNamePolicy() {
967 return namePolicy;
968 }
969
970 /**
971 * Sets the policy for forbidden <code>xmlns</code> attributes.
972 * @param bogusXmlnsPolicy the policy
973 * @see nu.validator.htmlparser.impl.Tokenizer#setBogusXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
974 */
975 public void setBogusXmlnsPolicy(
976 XmlViolationPolicy bogusXmlnsPolicy) {
977 this.bogusXmlnsPolicy = bogusXmlnsPolicy;
978 if (tokenizer != null) {
979 tokenizer.setBogusXmlnsPolicy(bogusXmlnsPolicy);
980 }
981 }
982
983 /**
984 * Returns the bogusXmlnsPolicy.
985 *
986 * @return the bogusXmlnsPolicy
987 */
988 public XmlViolationPolicy getBogusXmlnsPolicy() {
989 return bogusXmlnsPolicy;
990 }
991
992 public void addCharacterHandler(CharacterHandler characterHandler) {
993 this.characterHandlers.add(characterHandler);
994 if (tokenizer != null) {
995 tokenizer.addCharacterHandler(characterHandler);
996 }
997 }
998 }