001 /*
002 * Copyright (c) 2007 Henri Sivonen
003 * Copyright (c) 2007-2010 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.htmlparser.sax;
025
026 import java.io.IOException;
027 import java.net.MalformedURLException;
028 import java.net.URL;
029 import java.util.LinkedList;
030 import java.util.List;
031 import java.util.HashMap;
032
033 import nu.validator.htmlparser.common.CharacterHandler;
034 import nu.validator.htmlparser.common.DoctypeExpectation;
035 import nu.validator.htmlparser.common.DocumentModeHandler;
036 import nu.validator.htmlparser.common.Heuristics;
037 import nu.validator.htmlparser.common.TokenHandler;
038 import nu.validator.htmlparser.common.TransitionHandler;
039 import nu.validator.htmlparser.common.XmlViolationPolicy;
040 import nu.validator.htmlparser.impl.ErrorReportingTokenizer;
041 import nu.validator.htmlparser.impl.Tokenizer;
042 import nu.validator.htmlparser.impl.TreeBuilder;
043 import nu.validator.htmlparser.io.Driver;
044 import nu.validator.saxtree.Document;
045 import nu.validator.saxtree.DocumentFragment;
046 import nu.validator.saxtree.TreeParser;
047
048 import org.xml.sax.ContentHandler;
049 import org.xml.sax.DTDHandler;
050 import org.xml.sax.EntityResolver;
051 import org.xml.sax.ErrorHandler;
052 import org.xml.sax.InputSource;
053 import org.xml.sax.Locator;
054 import org.xml.sax.SAXException;
055 import org.xml.sax.SAXNotRecognizedException;
056 import org.xml.sax.SAXNotSupportedException;
057 import org.xml.sax.XMLReader;
058 import org.xml.sax.ext.LexicalHandler;
059 import org.xml.sax.helpers.DefaultHandler;
060
061 /**
062 * This class implements an HTML5 parser that exposes data through the SAX2
063 * interface.
064 *
 * <p>By default, when using the constructor without arguments,
 * this parser coerces XML 1.0-incompatible infosets into XML 1.0-compatible
067 * infosets. This corresponds to <code>ALTER_INFOSET</code> as the general
068 * XML violation policy. To make the parser support non-conforming HTML fully
069 * per the HTML 5 spec while on the other hand potentially violating the SAX2
070 * API contract, set the general XML violation policy to <code>ALLOW</code>.
071 * It is possible to treat XML 1.0 infoset violations as fatal by setting
072 * the general XML violation policy to <code>FATAL</code>.
073 *
074 * <p>By default, this parser doesn't do true streaming but buffers everything
075 * first. The parser can be made truly streaming by calling
076 * <code>setStreamabilityViolationPolicy(XmlViolationPolicy.FATAL)</code>. This
077 * has the consequence that errors that require non-streamable recovery are
078 * treated as fatal.
079 *
080 * <p>By default, in order to make the parse events emulate the parse events
081 * for a DTDless XML document, the parser does not report the doctype through
082 * <code>LexicalHandler</code>. Doctype reporting through
083 * <code>LexicalHandler</code> can be turned on by calling
084 * <code>setReportingDoctype(true)</code>.
085 *
086 * @version $Id$
087 * @author hsivonen
088 */
089 public class HtmlParser implements XMLReader {
090
091 private Driver driver = null;
092
093 private TreeBuilder<?> treeBuilder = null;
094
095 private SAXStreamer saxStreamer = null; // work around javac bug
096
097 private SAXTreeBuilder saxTreeBuilder = null; // work around javac bug
098
099 private ContentHandler contentHandler = null;
100
101 private LexicalHandler lexicalHandler = null;
102
103 private DTDHandler dtdHandler = null;
104
105 private EntityResolver entityResolver = null;
106
107 private ErrorHandler errorHandler = null;
108
109 private DocumentModeHandler documentModeHandler = null;
110
111 private DoctypeExpectation doctypeExpectation = DoctypeExpectation.HTML;
112
113 private boolean checkingNormalization = false;
114
115 private boolean scriptingEnabled = false;
116
117 private final List<CharacterHandler> characterHandlers = new LinkedList<CharacterHandler>();
118
119 private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.FATAL;
120
121 private XmlViolationPolicy contentNonXmlCharPolicy = XmlViolationPolicy.FATAL;
122
123 private XmlViolationPolicy commentPolicy = XmlViolationPolicy.FATAL;
124
125 private XmlViolationPolicy namePolicy = XmlViolationPolicy.FATAL;
126
127 private XmlViolationPolicy streamabilityViolationPolicy = XmlViolationPolicy.ALLOW;
128
129 private boolean html4ModeCompatibleWithXhtml1Schemata = false;
130
131 private boolean mappingLangToXmlLang = false;
132
133 private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.FATAL;
134
135 private boolean reportingDoctype = true;
136
137 private ErrorHandler treeBuilderErrorHandler = null;
138
139 private Heuristics heuristics = Heuristics.NONE;
140
141 private HashMap<String, String> errorProfileMap = null;
142
143 private TransitionHandler transitionHandler = null;
144
/**
 * Creates a parser that uses <code>XmlViolationPolicy.FATAL</code> as its
 * general XML violation policy.
 */
public HtmlParser() {
    this(XmlViolationPolicy.FATAL);
}
152
/**
 * Creates a parser that uses the given general XML violation policy.
 *
 * @param xmlPolicy the policy, forwarded to <code>setXmlPolicy</code>
 */
public HtmlParser(XmlViolationPolicy xmlPolicy) {
    this.setXmlPolicy(xmlPolicy);
}
160
161 private Tokenizer newTokenizer(TokenHandler handler, boolean newAttributesEachTime) {
162 if (errorHandler == null && transitionHandler == null &&
163 contentNonXmlCharPolicy == XmlViolationPolicy.ALLOW) {
164 return new Tokenizer(handler, newAttributesEachTime);
165 }
166 ErrorReportingTokenizer tokenizer =
167 new ErrorReportingTokenizer(handler, newAttributesEachTime);
168 tokenizer.setErrorProfile(errorProfileMap);
169 return tokenizer;
170 }
171
172 /**
173 * This class wraps different tree builders depending on configuration. This
174 * method does the work of hiding this from the user of the class.
175 */
176 private void lazyInit() {
177 if (driver == null) {
178 if (streamabilityViolationPolicy == XmlViolationPolicy.ALLOW) {
179 this.saxTreeBuilder = new SAXTreeBuilder();
180 this.treeBuilder = this.saxTreeBuilder;
181 this.saxStreamer = null;
182 this.driver = new Driver(newTokenizer(treeBuilder, true));
183 } else {
184 this.saxStreamer = new SAXStreamer();
185 this.treeBuilder = this.saxStreamer;
186 this.saxTreeBuilder = null;
187 this.driver = new Driver(newTokenizer(treeBuilder, false));
188 }
189 this.driver.setErrorHandler(errorHandler);
190 this.driver.setTransitionHandler(transitionHandler);
191 this.treeBuilder.setErrorHandler(treeBuilderErrorHandler);
192 this.driver.setCheckingNormalization(checkingNormalization);
193 this.driver.setCommentPolicy(commentPolicy);
194 this.driver.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
195 this.driver.setContentSpacePolicy(contentSpacePolicy);
196 this.driver.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
197 this.driver.setMappingLangToXmlLang(mappingLangToXmlLang);
198 this.driver.setXmlnsPolicy(xmlnsPolicy);
199 this.driver.setHeuristics(heuristics);
200 for (CharacterHandler characterHandler : characterHandlers) {
201 this.driver.addCharacterHandler(characterHandler);
202 }
203 this.treeBuilder.setDoctypeExpectation(doctypeExpectation);
204 this.treeBuilder.setDocumentModeHandler(documentModeHandler);
205 this.treeBuilder.setIgnoringComments(lexicalHandler == null);
206 this.treeBuilder.setScriptingEnabled(scriptingEnabled);
207 this.treeBuilder.setReportingDoctype(reportingDoctype);
208 this.treeBuilder.setNamePolicy(namePolicy);
209 if (saxStreamer != null) {
210 saxStreamer.setContentHandler(contentHandler == null ? new DefaultHandler()
211 : contentHandler);
212 saxStreamer.setLexicalHandler(lexicalHandler);
213 driver.setAllowRewinding(false);
214 }
215 }
216 }
217
218 /**
219 * @see org.xml.sax.XMLReader#getContentHandler()
220 */
221 public ContentHandler getContentHandler() {
222 return contentHandler;
223 }
224
225 /**
226 * @see org.xml.sax.XMLReader#getDTDHandler()
227 */
228 public DTDHandler getDTDHandler() {
229 return dtdHandler;
230 }
231
232 /**
233 * @see org.xml.sax.XMLReader#getEntityResolver()
234 */
235 public EntityResolver getEntityResolver() {
236 return entityResolver;
237 }
238
239 /**
240 * @see org.xml.sax.XMLReader#getErrorHandler()
241 */
242 public ErrorHandler getErrorHandler() {
243 return errorHandler;
244 }
245
246 /**
247 * Exposes the configuration of the emulated XML parser as well as
248 * boolean-valued configuration without using non-<code>XMLReader</code>
249 * getters directly.
250 *
251 * <dl>
252 * <dt><code>http://xml.org/sax/features/external-general-entities</code></dt>
253 * <dd><code>false</code></dd>
254 * <dt><code>http://xml.org/sax/features/external-parameter-entities</code></dt>
255 * <dd><code>false</code></dd>
256 * <dt><code>http://xml.org/sax/features/is-standalone</code></dt>
257 * <dd><code>true</code></dd>
258 * <dt><code>http://xml.org/sax/features/lexical-handler/parameter-entities</code></dt>
259 * <dd><code>false</code></dd>
260 * <dt><code>http://xml.org/sax/features/namespaces</code></dt>
261 * <dd><code>true</code></dd>
262 * <dt><code>http://xml.org/sax/features/namespace-prefixes</code></dt>
263 * <dd><code>false</code></dd>
264 * <dt><code>http://xml.org/sax/features/resolve-dtd-uris</code></dt>
265 * <dd><code>true</code></dd>
266 * <dt><code>http://xml.org/sax/features/string-interning</code></dt>
267 * <dd><code>false</code></dd>
268 * <dt><code>http://xml.org/sax/features/unicode-normalization-checking</code></dt>
269 * <dd><code>isCheckingNormalization</code></dd>
270 * <dt><code>http://xml.org/sax/features/use-attributes2</code></dt>
271 * <dd><code>false</code></dd>
272 * <dt><code>http://xml.org/sax/features/use-locator2</code></dt>
273 * <dd><code>false</code></dd>
274 * <dt><code>http://xml.org/sax/features/use-entity-resolver2</code></dt>
275 * <dd><code>false</code></dd>
276 * <dt><code>http://xml.org/sax/features/validation</code></dt>
277 * <dd><code>false</code></dd>
278 * <dt><code>http://xml.org/sax/features/xmlns-uris</code></dt>
279 * <dd><code>false</code></dd>
280 * <dt><code>http://xml.org/sax/features/xml-1.1</code></dt>
281 * <dd><code>false</code></dd>
282 * <dt><code>http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata</code></dt>
283 * <dd><code>isHtml4ModeCompatibleWithXhtml1Schemata</code></dd>
284 * <dt><code>http://validator.nu/features/mapping-lang-to-xml-lang</code></dt>
285 * <dd><code>isMappingLangToXmlLang</code></dd>
286 * <dt><code>http://validator.nu/features/scripting-enabled</code></dt>
287 * <dd><code>isScriptingEnabled</code></dd>
288 * </dl>
289 *
290 * @param name
291 * feature URI string
292 * @return a value per the list above
293 * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
294 */
295 public boolean getFeature(String name) throws SAXNotRecognizedException,
296 SAXNotSupportedException {
297 if ("http://xml.org/sax/features/external-general-entities".equals(name)) {
298 return false;
299 } else if ("http://xml.org/sax/features/external-parameter-entities".equals(name)) {
300 return false;
301 } else if ("http://xml.org/sax/features/is-standalone".equals(name)) {
302 return true;
303 } else if ("http://xml.org/sax/features/lexical-handler/parameter-entities".equals(name)) {
304 return false;
305 } else if ("http://xml.org/sax/features/namespaces".equals(name)) {
306 return true;
307 } else if ("http://xml.org/sax/features/namespace-prefixes".equals(name)) {
308 return false;
309 } else if ("http://xml.org/sax/features/resolve-dtd-uris".equals(name)) {
310 return true; // default value--applicable scenario never happens
311 } else if ("http://xml.org/sax/features/string-interning".equals(name)) {
312 return true;
313 } else if ("http://xml.org/sax/features/unicode-normalization-checking".equals(name)) {
314 return isCheckingNormalization(); // the checks aren't really per
315 // XML 1.1
316 } else if ("http://xml.org/sax/features/use-attributes2".equals(name)) {
317 return false;
318 } else if ("http://xml.org/sax/features/use-locator2".equals(name)) {
319 return false;
320 } else if ("http://xml.org/sax/features/use-entity-resolver2".equals(name)) {
321 return false;
322 } else if ("http://xml.org/sax/features/validation".equals(name)) {
323 return false;
324 } else if ("http://xml.org/sax/features/xmlns-uris".equals(name)) {
325 return false;
326 } else if ("http://xml.org/sax/features/xml-1.1".equals(name)) {
327 return false;
328 } else if ("http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata".equals(name)) {
329 return isHtml4ModeCompatibleWithXhtml1Schemata();
330 } else if ("http://validator.nu/features/mapping-lang-to-xml-lang".equals(name)) {
331 return isMappingLangToXmlLang();
332 } else if ("http://validator.nu/features/scripting-enabled".equals(name)) {
333 return isScriptingEnabled();
334 } else {
335 throw new SAXNotRecognizedException();
336 }
337 }
338
339 /**
340 * Allows <code>XMLReader</code>-level access to non-boolean valued
341 * getters.
342 *
343 * <p>
344 * The properties are mapped as follows:
345 *
346 * <dl>
347 * <dt><code>http://xml.org/sax/properties/document-xml-version</code></dt>
348 * <dd><code>"1.0"</code></dd>
349 * <dt><code>http://xml.org/sax/properties/lexical-handler</code></dt>
350 * <dd><code>getLexicalHandler</code></dd>
351 * <dt><code>http://validator.nu/properties/content-space-policy</code></dt>
352 * <dd><code>getContentSpacePolicy</code></dd>
353 * <dt><code>http://validator.nu/properties/content-non-xml-char-policy</code></dt>
354 * <dd><code>getContentNonXmlCharPolicy</code></dd>
355 * <dt><code>http://validator.nu/properties/comment-policy</code></dt>
356 * <dd><code>getCommentPolicy</code></dd>
357 * <dt><code>http://validator.nu/properties/xmlns-policy</code></dt>
358 * <dd><code>getXmlnsPolicy</code></dd>
359 * <dt><code>http://validator.nu/properties/name-policy</code></dt>
360 * <dd><code>getNamePolicy</code></dd>
361 * <dt><code>http://validator.nu/properties/streamability-violation-policy</code></dt>
362 * <dd><code>getStreamabilityViolationPolicy</code></dd>
363 * <dt><code>http://validator.nu/properties/document-mode-handler</code></dt>
364 * <dd><code>getDocumentModeHandler</code></dd>
365 * <dt><code>http://validator.nu/properties/doctype-expectation</code></dt>
366 * <dd><code>getDoctypeExpectation</code></dd>
367 * <dt><code>http://xml.org/sax/features/unicode-normalization-checking</code></dt>
368 * </dl>
369 *
370 * @param name
371 * property URI string
372 * @return a value per the list above
373 * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
374 */
375 public Object getProperty(String name) throws SAXNotRecognizedException,
376 SAXNotSupportedException {
377 if ("http://xml.org/sax/properties/declaration-handler".equals(name)) {
378 throw new SAXNotSupportedException(
379 "This parser does not suppert DeclHandler.");
380 } else if ("http://xml.org/sax/properties/document-xml-version".equals(name)) {
381 return "1.0"; // Emulating an XML 1.1 parser is not supported.
382 } else if ("http://xml.org/sax/properties/dom-node".equals(name)) {
383 throw new SAXNotSupportedException(
384 "This parser does not walk the DOM.");
385 } else if ("http://xml.org/sax/properties/lexical-handler".equals(name)) {
386 return getLexicalHandler();
387 } else if ("http://xml.org/sax/properties/xml-string".equals(name)) {
388 throw new SAXNotSupportedException(
389 "This parser does not expose the source as a string.");
390 } else if ("http://validator.nu/properties/content-space-policy".equals(name)) {
391 return getContentSpacePolicy();
392 } else if ("http://validator.nu/properties/content-non-xml-char-policy".equals(name)) {
393 return getContentNonXmlCharPolicy();
394 } else if ("http://validator.nu/properties/comment-policy".equals(name)) {
395 return getCommentPolicy();
396 } else if ("http://validator.nu/properties/xmlns-policy".equals(name)) {
397 return getXmlnsPolicy();
398 } else if ("http://validator.nu/properties/name-policy".equals(name)) {
399 return getNamePolicy();
400 } else if ("http://validator.nu/properties/streamability-violation-policy".equals(name)) {
401 return getStreamabilityViolationPolicy();
402 } else if ("http://validator.nu/properties/document-mode-handler".equals(name)) {
403 return getDocumentModeHandler();
404 } else if ("http://validator.nu/properties/doctype-expectation".equals(name)) {
405 return getDoctypeExpectation();
406 } else if ("http://validator.nu/properties/xml-policy".equals(name)) {
407 throw new SAXNotSupportedException(
408 "Cannot get a convenience setter.");
409 } else if ("http://validator.nu/properties/heuristics".equals(name)) {
410 return getHeuristics();
411 } else {
412 throw new SAXNotRecognizedException();
413 }
414 }
415
416 /**
417 * @see org.xml.sax.XMLReader#parse(org.xml.sax.InputSource)
418 */
419 public void parse(InputSource input) throws IOException, SAXException {
420 lazyInit();
421 try {
422 treeBuilder.setFragmentContext(null);
423 tokenize(input);
424 } finally {
425 if (saxTreeBuilder != null) {
426 Document document = saxTreeBuilder.getDocument();
427 if (document != null) {
428 new TreeParser(contentHandler, lexicalHandler).parse(document);
429 }
430 }
431 }
432 }
433
434 /**
435 * Parses a fragment.
436 *
437 * @param input the input to parse
438 * @param context the name of the context element
439 * @throws IOException
440 * @throws SAXException
441 */
442 public void parseFragment(InputSource input, String context)
443 throws IOException, SAXException {
444 lazyInit();
445 try {
446 treeBuilder.setFragmentContext(context.intern());
447 tokenize(input);
448 } finally {
449 if (saxTreeBuilder != null) {
450 DocumentFragment fragment = saxTreeBuilder.getDocumentFragment();
451 new TreeParser(contentHandler, lexicalHandler).parse(fragment);
452 }
453 }
454 }
455
456 /**
457 * @param is
458 * @throws SAXException
459 * @throws IOException
460 * @throws MalformedURLException
461 */
462 private void tokenize(InputSource is) throws SAXException, IOException, MalformedURLException {
463 if (is == null) {
464 throw new IllegalArgumentException("Null input.");
465 }
466 if (is.getByteStream() == null && is.getCharacterStream() == null) {
467 String systemId = is.getSystemId();
468 if (systemId == null) {
469 throw new IllegalArgumentException("No byte stream, no character stream nor URI.");
470 }
471 if (entityResolver != null) {
472 is = entityResolver.resolveEntity(is.getPublicId(), systemId);
473 }
474 if (is.getByteStream() == null || is.getCharacterStream() == null) {
475 is = new InputSource();
476 is.setSystemId(systemId);
477 is.setByteStream(new URL(systemId).openStream());
478 }
479 }
480 driver.tokenize(is);
481 }
482
483 /**
484 * @see org.xml.sax.XMLReader#parse(java.lang.String)
485 */
486 public void parse(String systemId) throws IOException, SAXException {
487 parse(new InputSource(systemId));
488 }
489
490 /**
491 * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler)
492 */
493 public void setContentHandler(ContentHandler handler) {
494 contentHandler = handler;
495 if (saxStreamer != null) {
496 saxStreamer.setContentHandler(contentHandler == null ? new DefaultHandler()
497 : contentHandler);
498 }
499 }
500
501 /**
502 * Sets the lexical handler.
503 * @param handler the hander.
504 */
505 public void setLexicalHandler(LexicalHandler handler) {
506 lexicalHandler = handler;
507 if (treeBuilder != null) {
508 treeBuilder.setIgnoringComments(handler == null);
509 if (saxStreamer != null) {
510 saxStreamer.setLexicalHandler(handler);
511 }
512 }
513 }
514
515 /**
516 * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
517 */
518 public void setDTDHandler(DTDHandler handler) {
519 dtdHandler = handler;
520 }
521
522 /**
523 * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
524 */
525 public void setEntityResolver(EntityResolver resolver) {
526 entityResolver = resolver;
527 }
528
529 /**
530 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
531 */
532 public void setErrorHandler(ErrorHandler handler) {
533 errorHandler = handler;
534 treeBuilderErrorHandler = handler;
535 driver = null;
536 }
537
538 public void setTransitionHandler(TransitionHandler handler) {
539 transitionHandler = handler;
540 driver = null;
541 }
542
543 /**
544 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
545 * @deprecated For Validator.nu internal use
546 */
547 public void setTreeBuilderErrorHandlerOverride(ErrorHandler handler) {
548 treeBuilderErrorHandler = handler;
549 if (driver != null) {
550 treeBuilder.setErrorHandler(handler);
551 }
552 }
553
554 /**
555 * Sets a boolean feature without having to use non-<code>XMLReader</code>
556 * setters directly.
557 *
558 * <p>
559 * The supported features are:
560 *
561 * <dl>
562 * <dt><code>http://xml.org/sax/features/unicode-normalization-checking</code></dt>
563 * <dd><code>setCheckingNormalization</code></dd>
564 * <dt><code>http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata</code></dt>
565 * <dd><code>setHtml4ModeCompatibleWithXhtml1Schemata</code></dd>
566 * <dt><code>http://validator.nu/features/mapping-lang-to-xml-lang</code></dt>
567 * <dd><code>setMappingLangToXmlLang</code></dd>
568 * <dt><code>http://validator.nu/features/scripting-enabled</code></dt>
569 * <dd><code>setScriptingEnabled</code></dd>
570 * </dl>
571 *
572 * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
573 */
574 public void setFeature(String name, boolean value)
575 throws SAXNotRecognizedException, SAXNotSupportedException {
576 if ("http://xml.org/sax/features/external-general-entities".equals(name)) {
577 if (value) {
578 throw new SAXNotSupportedException("Cannot set " + name + ".");
579 }
580 } else if ("http://xml.org/sax/features/external-parameter-entities".equals(name)) {
581 if (value) {
582 throw new SAXNotSupportedException("Cannot set " + name + ".");
583 }
584 } else if ("http://xml.org/sax/features/is-standalone".equals(name)) {
585 if (!value) {
586 throw new SAXNotSupportedException("Cannot set " + name + ".");
587 }
588 } else if ("http://xml.org/sax/features/lexical-handler/parameter-entities".equals(name)) {
589 if (value) {
590 throw new SAXNotSupportedException("Cannot set " + name + ".");
591 }
592 } else if ("http://xml.org/sax/features/namespaces".equals(name)) {
593 if (!value) {
594 throw new SAXNotSupportedException("Cannot set " + name + ".");
595 }
596 } else if ("http://xml.org/sax/features/namespace-prefixes".equals(name)) {
597 if (value) {
598 throw new SAXNotSupportedException("Cannot set " + name + ".");
599 }
600 } else if ("http://xml.org/sax/features/resolve-dtd-uris".equals(name)) {
601 if (!value) {
602 throw new SAXNotSupportedException("Cannot set " + name + ".");
603 }
604 } else if ("http://xml.org/sax/features/string-interning".equals(name)) {
605 if (!value) {
606 throw new SAXNotSupportedException("Cannot set " + name + ".");
607 }
608 } else if ("http://xml.org/sax/features/unicode-normalization-checking".equals(name)) {
609 setCheckingNormalization(value);
610 } else if ("http://xml.org/sax/features/use-attributes2".equals(name)) {
611 if (value) {
612 throw new SAXNotSupportedException("Cannot set " + name + ".");
613 }
614 } else if ("http://xml.org/sax/features/use-locator2".equals(name)) {
615 if (value) {
616 throw new SAXNotSupportedException("Cannot set " + name + ".");
617 }
618 } else if ("http://xml.org/sax/features/use-entity-resolver2".equals(name)) {
619 if (value) {
620 throw new SAXNotSupportedException("Cannot set " + name + ".");
621 }
622 } else if ("http://xml.org/sax/features/validation".equals(name)) {
623 if (value) {
624 throw new SAXNotSupportedException("Cannot set " + name + ".");
625 }
626 } else if ("http://xml.org/sax/features/xmlns-uris".equals(name)) {
627 if (value) {
628 throw new SAXNotSupportedException("Cannot set " + name + ".");
629 }
630 } else if ("http://xml.org/sax/features/xml-1.1".equals(name)) {
631 if (value) {
632 throw new SAXNotSupportedException("Cannot set " + name + ".");
633 }
634 } else if ("http://validator.nu/features/html4-mode-compatible-with-xhtml1-schemata".equals(name)) {
635 setHtml4ModeCompatibleWithXhtml1Schemata(value);
636 } else if ("http://validator.nu/features/mapping-lang-to-xml-lang".equals(name)) {
637 setMappingLangToXmlLang(value);
638 } else if ("http://validator.nu/features/scripting-enabled".equals(name)) {
639 setScriptingEnabled(value);
640 } else {
641 throw new SAXNotRecognizedException();
642 }
643 }
644
645 /**
646 * Sets a non-boolean property without having to use non-<code>XMLReader</code>
647 * setters directly.
648 *
649 * <dl>
650 * <dt><code>http://xml.org/sax/properties/lexical-handler</code></dt>
651 * <dd><code>setLexicalHandler</code></dd>
652 * <dt><code>http://validator.nu/properties/content-space-policy</code></dt>
653 * <dd><code>setContentSpacePolicy</code></dd>
654 * <dt><code>http://validator.nu/properties/content-non-xml-char-policy</code></dt>
655 * <dd><code>setContentNonXmlCharPolicy</code></dd>
656 * <dt><code>http://validator.nu/properties/comment-policy</code></dt>
657 * <dd><code>setCommentPolicy</code></dd>
658 * <dt><code>http://validator.nu/properties/xmlns-policy</code></dt>
659 * <dd><code>setXmlnsPolicy</code></dd>
660 * <dt><code>http://validator.nu/properties/name-policy</code></dt>
661 * <dd><code>setNamePolicy</code></dd>
662 * <dt><code>http://validator.nu/properties/streamability-violation-policy</code></dt>
663 * <dd><code>setStreamabilityViolationPolicy</code></dd>
664 * <dt><code>http://validator.nu/properties/document-mode-handler</code></dt>
665 * <dd><code>setDocumentModeHandler</code></dd>
666 * <dt><code>http://validator.nu/properties/doctype-expectation</code></dt>
667 * <dd><code>setDoctypeExpectation</code></dd>
668 * <dt><code>http://validator.nu/properties/xml-policy</code></dt>
669 * <dd><code>setXmlPolicy</code></dd>
670 * </dl>
671 *
672 * @see org.xml.sax.XMLReader#setProperty(java.lang.String,
673 * java.lang.Object)
674 */
675 public void setProperty(String name, Object value)
676 throws SAXNotRecognizedException, SAXNotSupportedException {
677 if ("http://xml.org/sax/properties/declaration-handler".equals(name)) {
678 throw new SAXNotSupportedException(
679 "This parser does not suppert DeclHandler.");
680 } else if ("http://xml.org/sax/properties/document-xml-version".equals(name)) {
681 throw new SAXNotSupportedException(
682 "Can't set document-xml-version.");
683 } else if ("http://xml.org/sax/properties/dom-node".equals(name)) {
684 throw new SAXNotSupportedException("Can't set dom-node.");
685 } else if ("http://xml.org/sax/properties/lexical-handler".equals(name)) {
686 setLexicalHandler((LexicalHandler) value);
687 } else if ("http://xml.org/sax/properties/xml-string".equals(name)) {
688 throw new SAXNotSupportedException("Can't set xml-string.");
689 } else if ("http://validator.nu/properties/content-space-policy".equals(name)) {
690 setContentSpacePolicy((XmlViolationPolicy) value);
691 } else if ("http://validator.nu/properties/content-non-xml-char-policy".equals(name)) {
692 setContentNonXmlCharPolicy((XmlViolationPolicy) value);
693 } else if ("http://validator.nu/properties/comment-policy".equals(name)) {
694 setCommentPolicy((XmlViolationPolicy) value);
695 } else if ("http://validator.nu/properties/xmlns-policy".equals(name)) {
696 setXmlnsPolicy((XmlViolationPolicy) value);
697 } else if ("http://validator.nu/properties/name-policy".equals(name)) {
698 setNamePolicy((XmlViolationPolicy) value);
699 } else if ("http://validator.nu/properties/streamability-violation-policy".equals(name)) {
700 setStreamabilityViolationPolicy((XmlViolationPolicy) value);
701 } else if ("http://validator.nu/properties/document-mode-handler".equals(name)) {
702 setDocumentModeHandler((DocumentModeHandler) value);
703 } else if ("http://validator.nu/properties/doctype-expectation".equals(name)) {
704 setDoctypeExpectation((DoctypeExpectation) value);
705 } else if ("http://validator.nu/properties/xml-policy".equals(name)) {
706 setXmlPolicy((XmlViolationPolicy) value);
707 } else if ("http://validator.nu/properties/heuristics".equals(name)) {
708 setHeuristics((Heuristics) value);
709 } else {
710 throw new SAXNotRecognizedException();
711 }
712 }
713
714 /**
715 * Indicates whether NFC normalization of source is being checked.
716 * @return <code>true</code> if NFC normalization of source is being checked.
717 * @see nu.validator.htmlparser.impl.Tokenizer#isCheckingNormalization()
718 */
719 public boolean isCheckingNormalization() {
720 return checkingNormalization;
721 }
722
723 /**
724 * Toggles the checking of the NFC normalization of source.
725 * @param enable <code>true</code> to check normalization
726 * @see nu.validator.htmlparser.impl.Tokenizer#setCheckingNormalization(boolean)
727 */
728 public void setCheckingNormalization(boolean enable) {
729 this.checkingNormalization = enable;
730 if (driver != null) {
731 driver.setCheckingNormalization(checkingNormalization);
732 }
733 }
734
735 /**
736 * Sets the policy for consecutive hyphens in comments.
737 * @param commentPolicy the policy
738 * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
739 */
740 public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
741 this.commentPolicy = commentPolicy;
742 if (driver != null) {
743 driver.setCommentPolicy(commentPolicy);
744 }
745 }
746
747 /**
748 * Sets the policy for non-XML characters except white space.
749 * @param contentNonXmlCharPolicy the policy
750 * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
751 */
752 public void setContentNonXmlCharPolicy(
753 XmlViolationPolicy contentNonXmlCharPolicy) {
754 this.contentNonXmlCharPolicy = contentNonXmlCharPolicy;
755 driver = null;
756 }
757
758 /**
759 * Sets the policy for non-XML white space.
760 * @param contentSpacePolicy the policy
761 * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
762 */
763 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
764 this.contentSpacePolicy = contentSpacePolicy;
765 if (driver != null) {
766 driver.setContentSpacePolicy(contentSpacePolicy);
767 }
768 }
769
770 /**
771 * Whether the parser considers scripting to be enabled for noscript treatment.
772 *
773 * @return <code>true</code> if enabled
774 * @see nu.validator.htmlparser.impl.TreeBuilder#isScriptingEnabled()
775 */
776 public boolean isScriptingEnabled() {
777 return scriptingEnabled;
778 }
779
780 /**
781 * Sets whether the parser considers scripting to be enabled for noscript treatment.
782 * @param scriptingEnabled <code>true</code> to enable
783 * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
784 */
785 public void setScriptingEnabled(boolean scriptingEnabled) {
786 this.scriptingEnabled = scriptingEnabled;
787 if (treeBuilder != null) {
788 treeBuilder.setScriptingEnabled(scriptingEnabled);
789 }
790 }
791
792 /**
793 * Returns the doctype expectation.
794 *
795 * @return the doctypeExpectation
796 */
797 public DoctypeExpectation getDoctypeExpectation() {
798 return doctypeExpectation;
799 }
800
801 /**
802 * Sets the doctype expectation.
803 *
804 * @param doctypeExpectation
805 * the doctypeExpectation to set
806 * @see nu.validator.htmlparser.impl.TreeBuilder#setDoctypeExpectation(nu.validator.htmlparser.common.DoctypeExpectation)
807 */
808 public void setDoctypeExpectation(DoctypeExpectation doctypeExpectation) {
809 this.doctypeExpectation = doctypeExpectation;
810 if (treeBuilder != null) {
811 treeBuilder.setDoctypeExpectation(doctypeExpectation);
812 }
813 }
814
815 /**
816 * Returns the document mode handler.
817 *
818 * @return the documentModeHandler
819 */
820 public DocumentModeHandler getDocumentModeHandler() {
821 return documentModeHandler;
822 }
823
824 /**
825 * Sets the document mode handler.
826 *
827 * @param documentModeHandler
828 * the documentModeHandler to set
829 * @see nu.validator.htmlparser.impl.TreeBuilder#setDocumentModeHandler(nu.validator.htmlparser.common.DocumentModeHandler)
830 */
831 public void setDocumentModeHandler(DocumentModeHandler documentModeHandler) {
832 this.documentModeHandler = documentModeHandler;
833 }
834
835 /**
836 * Returns the streamabilityViolationPolicy.
837 *
838 * @return the streamabilityViolationPolicy
839 */
840 public XmlViolationPolicy getStreamabilityViolationPolicy() {
841 return streamabilityViolationPolicy;
842 }
843
844 /**
845 * Sets the streamabilityViolationPolicy.
846 *
847 * @param streamabilityViolationPolicy
848 * the streamabilityViolationPolicy to set
849 */
850 public void setStreamabilityViolationPolicy(
851 XmlViolationPolicy streamabilityViolationPolicy) {
852 this.streamabilityViolationPolicy = streamabilityViolationPolicy;
853 driver = null;
854 }
855
856 /**
857 * Whether the HTML 4 mode reports boolean attributes in a way that repeats
858 * the name in the value.
859 * @param html4ModeCompatibleWithXhtml1Schemata
860 */
861 public void setHtml4ModeCompatibleWithXhtml1Schemata(
862 boolean html4ModeCompatibleWithXhtml1Schemata) {
863 this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
864 if (driver != null) {
865 driver.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
866 }
867 }
868
869 /**
870 * Returns the <code>Locator</code> during parse.
871 * @return the <code>Locator</code>
872 */
873 public Locator getDocumentLocator() {
874 return driver.getDocumentLocator();
875 }
876
877 /**
878 * Whether the HTML 4 mode reports boolean attributes in a way that repeats
879 * the name in the value.
880 *
881 * @return the html4ModeCompatibleWithXhtml1Schemata
882 */
883 public boolean isHtml4ModeCompatibleWithXhtml1Schemata() {
884 return html4ModeCompatibleWithXhtml1Schemata;
885 }
886
887 /**
888 * Whether <code>lang</code> is mapped to <code>xml:lang</code>.
889 * @param mappingLangToXmlLang
890 * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
891 */
892 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
893 this.mappingLangToXmlLang = mappingLangToXmlLang;
894 if (driver != null) {
895 driver.setMappingLangToXmlLang(mappingLangToXmlLang);
896 }
897 }
898
899 /**
900 * Whether <code>lang</code> is mapped to <code>xml:lang</code>.
901 *
902 * @return the mappingLangToXmlLang
903 */
904 public boolean isMappingLangToXmlLang() {
905 return mappingLangToXmlLang;
906 }
907
908 /**
909 * Whether the <code>xmlns</code> attribute on the root element is
910 * passed to through. (FATAL not allowed.)
911 * @param xmlnsPolicy
912 * @see nu.validator.htmlparser.impl.Tokenizer#setXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
913 */
914 public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
915 if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
916 throw new IllegalArgumentException("Can't use FATAL here.");
917 }
918 this.xmlnsPolicy = xmlnsPolicy;
919 if (driver != null) {
920 driver.setXmlnsPolicy(xmlnsPolicy);
921 }
922 }
923
924 /**
925 * Returns the xmlnsPolicy.
926 *
927 * @return the xmlnsPolicy
928 */
929 public XmlViolationPolicy getXmlnsPolicy() {
930 return xmlnsPolicy;
931 }
932
933 /**
934 * Returns the lexicalHandler.
935 *
936 * @return the lexicalHandler
937 */
938 public LexicalHandler getLexicalHandler() {
939 return lexicalHandler;
940 }
941
942 /**
943 * Returns the commentPolicy.
944 *
945 * @return the commentPolicy
946 */
947 public XmlViolationPolicy getCommentPolicy() {
948 return commentPolicy;
949 }
950
951 /**
952 * Returns the contentNonXmlCharPolicy.
953 *
954 * @return the contentNonXmlCharPolicy
955 */
956 public XmlViolationPolicy getContentNonXmlCharPolicy() {
957 return contentNonXmlCharPolicy;
958 }
959
960 /**
961 * Returns the contentSpacePolicy.
962 *
963 * @return the contentSpacePolicy
964 */
965 public XmlViolationPolicy getContentSpacePolicy() {
966 return contentSpacePolicy;
967 }
968
969 /**
970 * @param reportingDoctype
971 * @see nu.validator.htmlparser.impl.TreeBuilder#setReportingDoctype(boolean)
972 */
973 public void setReportingDoctype(boolean reportingDoctype) {
974 this.reportingDoctype = reportingDoctype;
975 if (treeBuilder != null) {
976 treeBuilder.setReportingDoctype(reportingDoctype);
977 }
978 }
979
980 /**
981 * Returns the reportingDoctype.
982 *
983 * @return the reportingDoctype
984 */
985 public boolean isReportingDoctype() {
986 return reportingDoctype;
987 }
988
989 /**
990 * @param errorProfile
991 * @see nu.validator.htmlparser.impl.errorReportingTokenizer#setErrorProfile(set)
992 */
993 public void setErrorProfile(HashMap<String, String> errorProfileMap) {
994 this.errorProfileMap = errorProfileMap;
995 }
996
997 /**
998 * The policy for non-NCName element and attribute names.
999 * @param namePolicy
1000 * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
1001 */
1002 public void setNamePolicy(XmlViolationPolicy namePolicy) {
1003 this.namePolicy = namePolicy;
1004 if (driver != null) {
1005 driver.setNamePolicy(namePolicy);
1006 treeBuilder.setNamePolicy(namePolicy);
1007 }
1008 }
1009
1010 /**
1011 * Sets the encoding sniffing heuristics.
1012 *
1013 * @param heuristics the heuristics to set
1014 * @see nu.validator.htmlparser.impl.Tokenizer#setHeuristics(nu.validator.htmlparser.common.Heuristics)
1015 */
1016 public void setHeuristics(Heuristics heuristics) {
1017 this.heuristics = heuristics;
1018 if (driver != null) {
1019 driver.setHeuristics(heuristics);
1020 }
1021 }
1022
1023 public Heuristics getHeuristics() {
1024 return this.heuristics;
1025 }
1026
1027 /**
1028 * This is a catch-all convenience method for setting name, xmlns, content space,
1029 * content non-XML char and comment policies in one go. This does not affect the
1030 * streamability policy or doctype reporting.
1031 *
1032 * @param xmlPolicy
1033 */
1034 public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
1035 setNamePolicy(xmlPolicy);
1036 setXmlnsPolicy(xmlPolicy == XmlViolationPolicy.FATAL ? XmlViolationPolicy.ALTER_INFOSET : xmlPolicy);
1037 setContentSpacePolicy(xmlPolicy);
1038 setContentNonXmlCharPolicy(xmlPolicy);
1039 setCommentPolicy(xmlPolicy);
1040 }
1041
1042 /**
1043 * The policy for non-NCName element and attribute names.
1044 *
1045 * @return the namePolicy
1046 */
1047 public XmlViolationPolicy getNamePolicy() {
1048 return namePolicy;
1049 }
1050
1051 /**
1052 * Does nothing.
1053 * @deprecated
1054 */
1055 public void setBogusXmlnsPolicy(
1056 XmlViolationPolicy bogusXmlnsPolicy) {
1057 }
1058
1059 /**
1060 * Returns <code>XmlViolationPolicy.ALTER_INFOSET</code>.
1061 * @deprecated
1062 * @return <code>XmlViolationPolicy.ALTER_INFOSET</code>
1063 */
1064 public XmlViolationPolicy getBogusXmlnsPolicy() {
1065 return XmlViolationPolicy.ALTER_INFOSET;
1066 }
1067
1068 public void addCharacterHandler(CharacterHandler characterHandler) {
1069 this.characterHandlers.add(characterHandler);
1070 if (driver != null) {
1071 driver.addCharacterHandler(characterHandler);
1072 }
1073 }
1074 }