001    /*
002     * Copyright (c) 2005, 2006, 2007 Henri Sivonen
003     * Copyright (c) 2007-2008 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.htmlparser.io;
025    
026    import java.io.IOException;
027    import java.io.InputStream;
028    import java.io.Reader;
029    import java.nio.charset.UnsupportedCharsetException;
030    
031    import nu.validator.htmlparser.common.CharacterHandler;
032    import nu.validator.htmlparser.common.EncodingDeclarationHandler;
033    import nu.validator.htmlparser.common.Heuristics;
034    import nu.validator.htmlparser.common.TokenHandler;
035    import nu.validator.htmlparser.common.TransitionHandler;
036    import nu.validator.htmlparser.common.XmlViolationPolicy;
037    import nu.validator.htmlparser.extra.NormalizationChecker;
038    import nu.validator.htmlparser.impl.ErrorReportingTokenizer;
039    import nu.validator.htmlparser.impl.Tokenizer;
040    import nu.validator.htmlparser.impl.TreeBuilder;
041    import nu.validator.htmlparser.impl.UTF16Buffer;
042    import nu.validator.htmlparser.rewindable.RewindableInputStream;
043    
044    import org.xml.sax.ErrorHandler;
045    import org.xml.sax.InputSource;
046    import org.xml.sax.Locator;
047    import org.xml.sax.SAXException;
048    import org.xml.sax.SAXParseException;
049    
050    public class Driver implements EncodingDeclarationHandler {
051    
052        /**
053         * The input UTF-16 code unit stream. If a byte stream was given, this
054         * object is an instance of <code>HtmlInputStreamReader</code>.
055         */
056        private Reader reader;
057    
058        /**
059         * The reference to the rewindable byte stream. <code>null</code> if p
060         * rohibited or no longer needed.
061         */
062        private RewindableInputStream rewindableInputStream;
063    
064        private boolean swallowBom;
065    
066        private Encoding characterEncoding;
067    
068        private boolean allowRewinding = true;
069    
070        private Heuristics heuristics = Heuristics.NONE;
071        
072        private final Tokenizer tokenizer;
073        
074        private Confidence confidence;
075    
076        /**
077         * Used for NFC checking if non-<code>null</code>, source code capture,
078         * etc.
079         */
080        private CharacterHandler[] characterHandlers = new CharacterHandler[0];
081    
082        public Driver(Tokenizer tokenizer) {
083            this.tokenizer = tokenizer;
084            tokenizer.setEncodingDeclarationHandler(this);
085        }
086        
087        /**
088         * Returns the allowRewinding.
089         * 
090         * @return the allowRewinding
091         */
092        public boolean isAllowRewinding() {
093            return allowRewinding;
094        }
095    
096        /**
097         * Sets the allowRewinding.
098         * 
099         * @param allowRewinding
100         *            the allowRewinding to set
101         */
102        public void setAllowRewinding(boolean allowRewinding) {
103            this.allowRewinding = allowRewinding;
104        }
105    
106        /**
107         * Turns NFC checking on or off.
108         * 
109         * @param enable
110         *            <code>true</code> if checking on
111         */
112        public void setCheckingNormalization(boolean enable) {
113            if (enable) {
114                if (isCheckingNormalization()) {
115                    return;
116                } else {
117                    NormalizationChecker normalizationChecker = new NormalizationChecker(tokenizer);
118                    normalizationChecker.setErrorHandler(tokenizer.getErrorHandler());
119    
120                }
121            } else {
122                if (isCheckingNormalization()) {
123                    CharacterHandler[] newHandlers = new CharacterHandler[characterHandlers.length - 1];
124                    boolean skipped = false;
125                    int j = 0;
126                    for (int i = 0; i < characterHandlers.length; i++) {
127                        CharacterHandler ch = characterHandlers[i];
128                        if (!(!skipped && (ch instanceof NormalizationChecker))) {
129                            newHandlers[j] = ch;
130                            j++;
131                        }
132                    }
133                    characterHandlers = newHandlers;
134                } else {
135                    return;
136                }
137            }
138        }
139    
140        public void addCharacterHandler(CharacterHandler characterHandler) {
141            if (characterHandler == null) {
142                throw new IllegalArgumentException("Null argument.");
143            }
144            CharacterHandler[] newHandlers = new CharacterHandler[characterHandlers.length + 1];
145            System.arraycopy(characterHandlers, 0, newHandlers, 0,
146                    characterHandlers.length);
147            newHandlers[characterHandlers.length] = characterHandler;
148            characterHandlers = newHandlers;
149        }
150    
151        /**
152         * Query if checking normalization.
153         * 
154         * @return <code>true</code> if checking on
155         */
156        public boolean isCheckingNormalization() {
157            for (int i = 0; i < characterHandlers.length; i++) {
158                CharacterHandler ch = characterHandlers[i];
159                if (ch instanceof NormalizationChecker) {
160                    return true;
161                }
162            }
163            return false;
164        }
165    
166        /**
167         * Runs the tokenization. This is the main entry point.
168         * 
169         * @param is
170         *            the input source
171         * @throws SAXException
172         *             on fatal error (if configured to treat XML violations as
173         *             fatal) or if the token handler threw
174         * @throws IOException
175         *             if the stream threw
176         */
177        public void tokenize(InputSource is) throws SAXException, IOException {
178            if (is == null) {
179                throw new IllegalArgumentException("InputSource was null.");
180            }
181            tokenizer.start();
182            confidence = Confidence.TENTATIVE;
183            swallowBom = true;
184            rewindableInputStream = null;
185            tokenizer.initLocation(is.getPublicId(), is.getSystemId());
186            this.reader = is.getCharacterStream();
187            this.characterEncoding = encodingFromExternalDeclaration(is.getEncoding());
188            if (this.reader == null) {
189                InputStream inputStream = is.getByteStream();
190                if (inputStream == null) {
191                    throw new SAXException("Both streams in InputSource were null.");
192                }
193                if (this.characterEncoding == null) {
194                    if (allowRewinding) {
195                        inputStream = rewindableInputStream = new RewindableInputStream(
196                                inputStream);
197                    }
198                    this.reader = new HtmlInputStreamReader(inputStream,
199                            tokenizer.getErrorHandler(), tokenizer, this, heuristics);
200                } else {
201                    becomeConfident();
202                    this.reader = new HtmlInputStreamReader(inputStream,
203                            tokenizer.getErrorHandler(), tokenizer, this, this.characterEncoding);
204                }
205            } else {
206                becomeConfident();
207            }
208            Throwable t = null;
209            try {
210                for (;;) {
211                    try {
212                        for (int i = 0; i < characterHandlers.length; i++) {
213                            CharacterHandler ch = characterHandlers[i];
214                            ch.start();
215                        }
216                        runStates();
217                        if (confidence == Confidence.TENTATIVE
218                                && !tokenizer.isAlreadyComplainedAboutNonAscii()) {
219                            warnWithoutLocation("The character encoding of the document was not declared.");
220                        }
221                        break;
222                    } catch (ReparseException e) {
223                        if (rewindableInputStream == null) {
224                            tokenizer.fatal("Changing encoding at this point would need non-streamable behavior.");
225                        } else {
226                            rewindableInputStream.rewind();
227                            becomeConfident();
228                            this.reader = new HtmlInputStreamReader(
229                                    rewindableInputStream, tokenizer.getErrorHandler(), tokenizer,
230                                    this, this.characterEncoding);
231                        }
232                        continue;
233                    }
234                }
235            } catch (Throwable tr) {
236                t = tr;
237            } finally {
238                try {
239                    tokenizer.end();
240                    characterEncoding = null;
241                    for (int i = 0; i < characterHandlers.length; i++) {
242                        CharacterHandler ch = characterHandlers[i];
243                        ch.end();
244                    }
245                    reader.close();
246                    reader = null;
247                    rewindableInputStream = null;
248                } catch (Throwable tr) {
249                    if (t == null) {
250                        t = tr;
251                    } // else drop the later throwable
252                }
253                if (t != null) {
254                    if (t instanceof IOException) {
255                        throw (IOException) t;
256                    } else if (t instanceof SAXException) {
257                        throw (SAXException) t;
258                    } else if (t instanceof RuntimeException) {
259                        throw (RuntimeException) t;
260                    } else if (t instanceof Error) {
261                        throw (Error) t;
262                    } else {
263                        // impossible
264                        throw new RuntimeException(t);
265                    }
266                }
267            }
268        }
269    
270        void dontSwallowBom() {
271            swallowBom = false;
272        }
273    
274        private void runStates() throws SAXException, IOException {
275            char[] buffer = new char[2048];
276            UTF16Buffer bufr = new UTF16Buffer(buffer, 0, 0);
277            boolean lastWasCR = false;
278            int len = -1;
279            if ((len = reader.read(buffer)) != -1) {
280                assert len > 0;
281                int streamOffset = 0;
282                int offset = 0;
283                int length = len;
284                if (swallowBom) {
285                    if (buffer[0] == '\uFEFF') {
286                        streamOffset = -1;
287                        offset = 1;
288                        length--;
289                    }
290                }
291                if (length > 0) {
292                    for (int i = 0; i < characterHandlers.length; i++) {
293                        CharacterHandler ch = characterHandlers[i];
294                        ch.characters(buffer, offset, length);
295                    }
296                    tokenizer.setTransitionBaseOffset(streamOffset);
297                    bufr.setStart(offset);
298                    bufr.setEnd(offset + length);
299                    while (bufr.hasMore()) {
300                        bufr.adjust(lastWasCR);
301                        lastWasCR = false;
302                        if (bufr.hasMore()) {
303                            lastWasCR = tokenizer.tokenizeBuffer(bufr);                    
304                        }
305                    }
306                }
307                streamOffset = length;
308                while ((len = reader.read(buffer)) != -1) {
309                    assert len > 0;
310                    for (int i = 0; i < characterHandlers.length; i++) {
311                        CharacterHandler ch = characterHandlers[i];
312                        ch.characters(buffer, 0, len);
313                    }
314                    tokenizer.setTransitionBaseOffset(streamOffset);
315                    bufr.setStart(0);
316                    bufr.setEnd(len);
317                    while (bufr.hasMore()) {
318                        bufr.adjust(lastWasCR);
319                        lastWasCR = false;
320                        if (bufr.hasMore()) {
321                            lastWasCR = tokenizer.tokenizeBuffer(bufr);                    
322                        }
323                    }
324                    streamOffset += len;
325                }
326            }
327            tokenizer.eof();
328        }
329    
330        public void setEncoding(Encoding encoding, Confidence confidence) {
331            this.characterEncoding = encoding;
332            if (confidence == Confidence.CERTAIN) {
333                becomeConfident();
334            }
335        }
336    
337        public boolean internalEncodingDeclaration(String internalCharset)
338                throws SAXException {
339            try {
340                internalCharset = Encoding.toAsciiLowerCase(internalCharset);
341                Encoding cs;
342                if ("utf-16".equals(internalCharset)
343                        || "utf-16be".equals(internalCharset)
344                        || "utf-16le".equals(internalCharset)) {
345                    tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
346                            + internalCharset
347                            + "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201Cutf-8\u201D.");
348                    cs = Encoding.UTF8;
349                    internalCharset = "utf-8";
350                } else {
351                    cs = Encoding.forName(internalCharset);
352                }
353                Encoding actual = cs.getActualHtmlEncoding();
354                if (actual == null) {
355                    actual = cs;
356                }
357                if (!actual.isAsciiSuperset()) {
358                    tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
359                            + internalCharset
360                            + "\u201D which is not an ASCII superset. Not changing the encoding.");
361                    return false;
362                }
363                if (characterEncoding == null) {
364                    // Reader case
365                    return true;
366                }
367                if (characterEncoding == actual) {
368                    becomeConfident();
369                    return true;
370                }
371                if (confidence == Confidence.CERTAIN && actual != characterEncoding) {
372                    tokenizer.errTreeBuilder("Internal encoding declaration \u201C"
373                            + internalCharset
374                            + "\u201D disagrees with the actual encoding of the document (\u201C"
375                            + characterEncoding.getCanonName() + "\u201D).");
376                } else {
377                    Encoding newEnc = whineAboutEncodingAndReturnActual(
378                            internalCharset, cs);
379                    tokenizer.errTreeBuilder("Changing character encoding \u201C"
380                            + internalCharset + "\u201D and reparsing.");
381                    characterEncoding = newEnc;
382                    throw new ReparseException();
383                }
384                return true;
385            } catch (UnsupportedCharsetException e) {
386                tokenizer.errTreeBuilder("Internal encoding declaration named an unsupported chararacter encoding \u201C"
387                        + internalCharset + "\u201D.");
388                return false;
389            }
390        }
391    
392        /**
393         * 
394         */
395        private void becomeConfident() {
396            if (rewindableInputStream != null) {
397                rewindableInputStream.willNotRewind();
398            }
399            confidence = Confidence.CERTAIN;
400            tokenizer.becomeConfident();
401        }
402    
403        /**
404         * Sets the encoding sniffing heuristics.
405         * 
406         * @param heuristics
407         *            the heuristics to set
408         */
409        public void setHeuristics(Heuristics heuristics) {
410            this.heuristics = heuristics;
411        }
412    
413        /**
414         * Reports a warning without line/col
415         * 
416         * @param message
417         *            the message
418         * @throws SAXException
419         */
420        protected void warnWithoutLocation(String message) throws SAXException {
421            ErrorHandler errorHandler = tokenizer.getErrorHandler();
422            if (errorHandler == null) {
423                return;
424            }
425            SAXParseException spe = new SAXParseException(message, null,
426                    tokenizer.getSystemId(), -1, -1);
427            errorHandler.warning(spe);
428        }
429    
430        /**
431         * Initializes a decoder from external decl.
432         */
433        protected Encoding encodingFromExternalDeclaration(String encoding)
434                throws SAXException {
435            if (encoding == null) {
436                return null;
437            }
438            encoding = Encoding.toAsciiLowerCase(encoding);
439            try {
440                Encoding cs = Encoding.forName(encoding);
441                if ("utf-16".equals(cs.getCanonName())
442                        || "utf-32".equals(cs.getCanonName())) {
443                    swallowBom = false;
444                }
445                return whineAboutEncodingAndReturnActual(encoding, cs);
446            } catch (UnsupportedCharsetException e) {
447                tokenizer.err("Unsupported character encoding name: \u201C" + encoding
448                        + "\u201D. Will sniff.");
449                swallowBom = true;
450            }
451            return null; // keep the compiler happy
452        }
453    
454        /**
455         * @param encoding
456         * @param cs
457         * @return
458         * @throws SAXException
459         */
460        protected Encoding whineAboutEncodingAndReturnActual(String encoding,
461                Encoding cs) throws SAXException {
462            String canonName = cs.getCanonName();
463            if (!cs.isRegistered()) {
464                if (encoding.startsWith("x-")) {
465                    tokenizer.err("The encoding \u201C"
466                            + encoding
467                            + "\u201D is not an IANA-registered encoding. (Charmod C022)");
468                } else {
469                    tokenizer.err("The encoding \u201C"
470                            + encoding
471                            + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)");
472                }
473            } else if (!canonName.equals(encoding)) {
474                tokenizer.err("The encoding \u201C"
475                        + encoding
476                        + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
477                        + canonName + "\u201D. (Charmod C024)");
478            }
479            if (cs.isShouldNot()) {
480                tokenizer.warn("Authors should not use the character encoding \u201C"
481                        + encoding
482                        + "\u201D. It is recommended to use \u201CUTF-8\u201D.");
483            } else if (cs.isLikelyEbcdic()) {
484                tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D.");
485            } else if (cs.isObscure()) {
486                tokenizer.warn("The character encoding \u201C"
487                        + encoding
488                        + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
489            }
490            Encoding actual = cs.getActualHtmlEncoding();
491            if (actual == null) {
492                return cs;
493            } else {
494                tokenizer.warn("Using \u201C" + actual.getCanonName()
495                        + "\u201D instead of the declared encoding \u201C"
496                        + encoding + "\u201D.");
497                return actual;
498            }
499        }
500    
501        private class ReparseException extends SAXException {
502    
503        }
504    
505        void notifyAboutMetaBoundary() {
506            tokenizer.notifyAboutMetaBoundary();
507        }
508    
509        /**
510         * @param commentPolicy
511         * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
512         */
513        public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
514            tokenizer.setCommentPolicy(commentPolicy);
515        }
516    
517        /**
518         * @param contentNonXmlCharPolicy
519         * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
520         */
521        public void setContentNonXmlCharPolicy(
522                XmlViolationPolicy contentNonXmlCharPolicy) {
523            tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
524        }
525    
526        /**
527         * @param contentSpacePolicy
528         * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
529         */
530        public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
531            tokenizer.setContentSpacePolicy(contentSpacePolicy);
532        }
533    
534        /**
535         * @param eh
536         * @see nu.validator.htmlparser.impl.Tokenizer#setErrorHandler(org.xml.sax.ErrorHandler)
537         */
538        public void setErrorHandler(ErrorHandler eh) {
539            tokenizer.setErrorHandler(eh);
540            for (int i = 0; i < characterHandlers.length; i++) {
541                CharacterHandler ch = characterHandlers[i];
542                if (ch instanceof NormalizationChecker) {
543                    NormalizationChecker nc = (NormalizationChecker) ch;
544                    nc.setErrorHandler(eh);
545                }
546            }
547        }
548        
549        public void setTransitionHandler(TransitionHandler transitionHandler) {
550            if (tokenizer instanceof ErrorReportingTokenizer) {
551                ErrorReportingTokenizer ert = (ErrorReportingTokenizer) tokenizer;
552                ert.setTransitionHandler(transitionHandler);
553            } else if (transitionHandler != null) {
554                throw new IllegalStateException("Attempt to set a transition handler on a plain tokenizer.");
555            }
556        }
557    
558        /**
559         * @param html4ModeCompatibleWithXhtml1Schemata
560         * @see nu.validator.htmlparser.impl.Tokenizer#setHtml4ModeCompatibleWithXhtml1Schemata(boolean)
561         */
562        public void setHtml4ModeCompatibleWithXhtml1Schemata(
563                boolean html4ModeCompatibleWithXhtml1Schemata) {
564            tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
565        }
566    
567        /**
568         * @param mappingLangToXmlLang
569         * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
570         */
571        public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
572            tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
573        }
574    
575        /**
576         * @param namePolicy
577         * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
578         */
579        public void setNamePolicy(XmlViolationPolicy namePolicy) {
580            tokenizer.setNamePolicy(namePolicy);
581        }
582    
583        /**
584         * @param xmlnsPolicy
585         * @see nu.validator.htmlparser.impl.Tokenizer#setXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
586         */
587        public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
588            tokenizer.setXmlnsPolicy(xmlnsPolicy);
589        }
590    
591        public String getCharacterEncoding() throws SAXException {
592            return characterEncoding.getCanonName();
593        }
594    
595        public Locator getDocumentLocator() {
596            return tokenizer;
597        }
598    }