001    /*
002     * Copyright (c) 2009 Mozilla Foundation
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package nu.validator.htmlparser.io;
024    
025    import java.io.IOException;
026    import java.nio.charset.UnsupportedCharsetException;
027    
028    import nu.validator.htmlparser.common.ByteReadable;
029    import nu.validator.htmlparser.impl.MetaScanner;
030    
031    import org.xml.sax.ErrorHandler;
032    import org.xml.sax.Locator;
033    import org.xml.sax.SAXException;
034    import org.xml.sax.SAXParseException;
035    
036    public class MetaSniffer extends MetaScanner implements Locator {
037        
038        private Encoding characterEncoding = null;
039    
040        private final ErrorHandler errorHandler;
041        
042        private final Locator locator;
043        
044        private int line = 1;
045        
046        private int col = 0;
047        
048        private boolean prevWasCR = false;
049    
050        public MetaSniffer(ErrorHandler eh, Locator locator) {
051            this.errorHandler = eh;
052            this.locator = locator;
053            this.characterEncoding = null;
054        }
055        
056        /**
057         * -1 means end.
058         * @return
059         * @throws IOException
060         */
061        protected int read() throws IOException {
062            int b = readable.readByte();
063            // [NOCPP[
064            switch (b) {
065                case '\n':
066                    if (!prevWasCR) {
067                        line++;
068                        col = 0;
069                    }
070                    prevWasCR = false;
071                    break;
072                case '\r':
073                    line++;
074                    col = 0;
075                    prevWasCR = true;
076                    break;
077                default:
078                    col++;
079                    prevWasCR = false;
080                    break;
081            }
082            // ]NOCPP]
083            return b;
084        }
085    
086        /**
087         * Main loop.
088         * 
089         * @return
090         * 
091         * @throws SAXException
092         * @throws IOException
093         * @throws
094         */
095        public Encoding sniff(ByteReadable readable) throws SAXException, IOException {
096            this.readable = readable;
097            stateLoop(stateSave);
098            return characterEncoding;
099        }
100        
101    
102        /**
103         * @param string
104         * @throws SAXException
105         */
106        private void err(String message) throws SAXException {
107            if (errorHandler != null) {
108              SAXParseException spe = new SAXParseException(message, this);
109              errorHandler.error(spe);
110            }
111        }
112    
113        /**
114         * @param string
115         * @throws SAXException
116         */
117        private void warn(String message) throws SAXException {
118            if (errorHandler != null) {
119              SAXParseException spe = new SAXParseException(message, this);
120              errorHandler.warning(spe);
121            }
122        }
123        
124        public int getColumnNumber() {
125            return col;
126        }
127    
128        public int getLineNumber() {
129            return line;
130        }
131    
132        public String getPublicId() {
133            if (locator != null) {
134                return locator.getPublicId();
135            }
136            return null;
137        }
138    
139        public String getSystemId() {
140            if (locator != null) {
141                return locator.getSystemId();
142            }
143            return null;
144        }
145        
146        protected boolean tryCharset(String encoding) throws SAXException {
147            encoding = Encoding.toAsciiLowerCase(encoding);
148            try {
149                // XXX spec says only UTF-16
150                if ("utf-16".equals(encoding) || "utf-16be".equals(encoding) || "utf-16le".equals(encoding) || "utf-32".equals(encoding) || "utf-32be".equals(encoding) || "utf-32le".equals(encoding)) {
151                    this.characterEncoding = Encoding.UTF8;
152                    err("The internal character encoding declaration specified \u201C" + encoding + "\u201D which is not a rough superset of ASCII. Using \u201CUTF-8\u201D instead.");
153                    return true;
154                } else {
155                    Encoding cs = Encoding.forName(encoding);
156                    String canonName = cs.getCanonName();
157                    if (!cs.isAsciiSuperset()) {
158                        err("The encoding \u201C"
159                                    + encoding
160                                    + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm.");
161                        return false;
162                    }
163                    if (!cs.isRegistered()) {
164                        if (encoding.startsWith("x-")) {
165                            err("The encoding \u201C"
166                                    + encoding
167                                    + "\u201D is not an IANA-registered encoding. (Charmod C022)");                    
168                        } else {
169                            err("The encoding \u201C"
170                                    + encoding
171                                    + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)");
172                        }
173                    } else if (!cs.getCanonName().equals(encoding)) {
174                        err("The encoding \u201C" + encoding
175                                + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
176                                + canonName + "\u201D. (Charmod C024)");
177                    }
178                    if (cs.isShouldNot()) {
179                        warn("Authors should not use the character encoding \u201C"
180                                + encoding
181                                + "\u201D. It is recommended to use \u201CUTF-8\u201D.");                
182                    } else if (cs.isObscure()) {
183                        warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
184                    }
185                    Encoding actual = cs.getActualHtmlEncoding();
186                    if (actual == null) {
187                        this.characterEncoding = cs;
188                    } else {
189                        warn("Using \u201C" + actual.getCanonName() + "\u201D instead of the declared encoding \u201C" + encoding + "\u201D.");
190                        this.characterEncoding = actual;
191                    }
192                    return true;
193                }
194            } catch (UnsupportedCharsetException e) {
195                err("Unsupported character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing.");
196            }
197            return false;
198        }
199    }