001    /*
002     * Copyright (c) 2007 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package nu.validator.htmlparser.impl;
024    
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.xml.sax.ErrorHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
038    
039    public final class MetaSniffer implements Locator {
040    
041        private class StopSniffingException extends Exception {
042    
043        }
044    
045        private static final Pattern CONTENT = Pattern.compile("^[^;]*;[\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*[cC][hH][aA][rR][sS][eE][tT][\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*=[\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*(?:(?:([^'\"\\x09\\x0A\\x0B\\x0C\\x0D\\x20][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*)(?:[\\x09\\x0A\\x0B\\x0C\\x0D\\x20].*)?)|(?:\"([^\"]*)\".*)|(?:'([^']*)'.*))$", Pattern.DOTALL);
046        
047        private enum MetaState {
048            NO, M, E, T, A
049        }
050    
051        private final ByteReadable source;
052        
053        private final ErrorHandler errorHandler;
054        
055        private CharsetDecoder charsetDecoder = null;
056        
057        private StringBuilder attributeName = new StringBuilder();
058    
059        private StringBuilder attributeValue = new StringBuilder();
060    
061        private MetaState metaState = MetaState.NO;
062    
063        private int unread = -1;
064    
065        private int line = 1;
066        
067        private int col = 0;
068        
069        private boolean prevWasCR = false;
070    
071        private final Locator locator;
072        
073        /**
074         * @param source
075         * @param errorHandler
076         * @param publicId
077         * @param systemId
078         */
079        public MetaSniffer(ByteReadable source, ErrorHandler eh, Locator locator) {
080            this.source = source;
081            this.errorHandler = eh;
082            this.locator = locator;
083        }
084    
085        // Making this method return an int instead of a char was
086        // probably a mistake :-(
087        private int read() throws IOException, StopSniffingException {
088            if (unread == -1) {
089                int b = source.readByte();
090                switch (b) {
091                    case -1: // end
092                        throw new StopSniffingException();
093                    case 0x0A: // LF
094                        if (!prevWasCR) {
095                            line++;
096                            col = 0;
097                        }
098                        prevWasCR = false;
099                        break;
100                    case 0x0D: // CR
101                        line++;
102                        col = 0;
103                        prevWasCR = true;
104                        break;
105                    default:
106                        col++;
107                        prevWasCR = false;
108                        break;
109                }
110                return b;
111            } else {
112                int b = unread;
113                unread = -1;
114                return b;
115            }
116        }
117    
118        private void unread(int b) {
119            this.unread = b;
120        }
121    
122        /**
123         * Main loop.
124         * 
125         * @return
126         * 
127         * @throws SAXException
128         * @throws IOException
129         * @throws
130         */
131        public CharsetDecoder sniff() throws SAXException, IOException {
132            try {
133                for (;;) {
134                    if (read() == 0x3C) { // <
135                        markup();
136                    }
137                }
138            } catch (StopSniffingException e) {
139                return charsetDecoder;
140            }
141        }
142    
143        /**
144         * <
145         * 
146         * @throws SAXException
147         * @throws StopSniffingException 
148         * @throws IOException 
149         */
150        private void markup() throws SAXException, StopSniffingException, IOException {
151            int b = read();
152            if (b == 0x21) { // !
153                markupDecl();
154            } else if (b == 0x2F) { // /
155                endTag();
156            } else if (b == 0x3F) { // ?
157                consumeUntilAndIncludingGt();
158            } else if (b == 0x4D || b == 0x6D) { // m or M
159                metaState = MetaState.M;
160                tag();
161            } else if ((b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A)) { // ASCII
162                                                                                // letter
163                metaState = MetaState.NO;
164                tag();
165            }
166        }
167    
168        /**
169         * < , x
170         * 
171         * @throws SAXException
172         * @throws StopSniffingException 
173         * @throws IOException 
174         */
175        private void tag() throws SAXException, StopSniffingException, IOException {
176            int b;
177            loop: for (;;) {
178                b = read();
179                switch (b) {
180                    case 0x09: // tab
181                    case 0x0A: // LF
182                    case 0x0B: // VT
183                    case 0x0C: // FF
184                    case 0x0D: // CR
185                    case 0x20: // space
186                    case 0x3E: // >
187                    case 0x3C: // <
188                        break loop;
189                    case 0x45: // E
190                    case 0x65: // e
191                        if (metaState == MetaState.M) {
192                            metaState = MetaState.E;
193                        } else {
194                            metaState = MetaState.NO;
195                        }
196                        continue loop;
197                    case 0x54: // T
198                    case 0x74: // t
199                        if (metaState == MetaState.E) {
200                            metaState = MetaState.T;
201                        } else {
202                            metaState = MetaState.NO;
203                        }
204                        continue loop;
205                    case 0x41: // A
206                    case 0x61: // a
207                        if (metaState == MetaState.T) {
208                            metaState = MetaState.A;
209                        } else {
210                            metaState = MetaState.NO;
211                        }
212                        continue loop;
213                    default:
214                        metaState = MetaState.NO;
215                        continue loop;
216                }
217            }
218            unread(b);
219            if (b != 0x3C) {
220                while (attribute())
221                    ;
222            }
223        }
224    
225        /**
226         * The "get an attribute" subalgorithm.
227         * 
228         * @return <code>false</code> when to stop
229         * @throws SAXException
230         * @throws StopSniffingException 
231         * @throws IOException 
232         */
233        private boolean attribute() throws SAXException, StopSniffingException, IOException {
234            int b;
235            loop: for (;;) {
236                b = read();
237                switch (b) {
238                    case 0x09: // tab
239                    case 0x0A: // LF
240                    case 0x0B: // VT
241                    case 0x0C: // FF
242                    case 0x0D: // CR
243                    case 0x20: // space
244                    case 0x2F: // /
245                        continue loop;
246                    default:
247                        break loop;
248                }
249            }
250            if (b == 0x3C) { // <
251                unread(b);
252                return false;
253            }
254            if (b == 0x3E) { // >
255                return false;
256            }
257            attributeName.setLength(0);
258            attributeValue.setLength(0);
259            unread(b); // this is a bit ugly
260            name: for (;;) {
261                b = read();
262                switch (b) {
263                    case 0x3D: // =
264                        // not actually advancing here yet
265                        break name;
266                    case 0x09: // tab
267                    case 0x0A: // LF
268                    case 0x0B: // VT
269                    case 0x0C: // FF
270                    case 0x0D: // CR
271                    case 0x20: // space
272                        spaces: for (;;) {
273                            b = read();
274                            switch (b) {
275                                case 0x09: // tab
276                                case 0x0A: // LF
277                                case 0x0B: // VT
278                                case 0x0C: // FF
279                                case 0x0D: // CR
280                                case 0x20: // space
281                                    continue spaces;
282                                default:
283                                    break name;
284                            }
285                        }
286                    case 0x2f: // /
287                        return true;
288                    case 0x3C: // <
289                        unread(b);
290                        return false;
291                    case 0x3E: // >
292                        return false;
293                    default:
294                        if (metaState == MetaState.A) {
295                            // could use a highly-efficient state machine
296                            // here instead of a buffer...
297                            if (b >= 0x41 && b <= 0x5A) {
298                                attributeName.append((char) (b + 0x20));
299                            } else {
300                                attributeName.append((char) b);
301                            }
302                        }
303                        continue name;
304                }
305            }
306            if (b != 0x3D) {
307                // "If the byte at position is not 0x3D (ASCII '='), stop looking
308                // for
309                // an attribute. Move position back to the previous byte."
310                unread(b);
311                return true;
312            }
313            value: for (;;) {
314                b = read();
315                switch (b) {
316                    case 0x09: // tab
317                    case 0x0A: // LF
318                    case 0x0B: // VT
319                    case 0x0C: // FF
320                    case 0x0D: // CR
321                    case 0x20: // space
322                        continue value;
323                    default:
324                        break value;
325                }
326            }
327            switch (b) {
328                case 0x22: // "
329                    quotedAttribute(0x22);
330                    return true;
331                case 0x27: // '
332                    quotedAttribute(0x27);
333                    return true;
334                case 0x3C: // <
335                    unread(b);
336                    return false;
337                case 0x3E: // >
338                    return false;
339                default:
340                    unread(b);
341                    return unquotedAttribute();
342            }
343        }
344    
345        private boolean unquotedAttribute() throws SAXException, StopSniffingException, IOException {
346            int b;
347            for (;;) {
348                b = read();
349                switch (b) {
350                    case 0x09: // tab
351                    case 0x0A: // LF
352                    case 0x0B: // VT
353                    case 0x0C: // FF
354                    case 0x0D: // CR
355                    case 0x20: // space
356                        checkAttribute();
357                        return true;
358                    case 0x3E: // >
359                        checkAttribute();
360                        return false;
361                    case 0x3C: // <
362                        checkAttribute();
363                        unread(b);
364                        return false;
365                    default:
366                        // omitting uppercasing
367                        if (metaState == MetaState.A) {
368                            attributeValue.append((char) b);
369                        }
370                        break;
371                }
372            }
373        }
374    
375        private void checkAttribute() throws SAXException, StopSniffingException {
376            if (metaState == MetaState.A) {
377                String name = attributeName.toString();
378                if ("charset".equals(name)) {
379                    // XXX revisit trim() to trime only space characters
380                    tryCharset(attributeValue.toString().trim());
381                } else if ("content".equals(name)) {
382                    Matcher m = CONTENT.matcher(attributeValue);
383                    if (m.matches()) {
384                        String value = null;
385                        for (int i = 1; i < 4; i++) {
386                            value = m.group(i);
387                            if (value != null) {
388                                tryCharset(value);
389                                break;
390                            }
391                        }
392                    }
393                }
394            }
395        }
396    
397        private void tryCharset(String encoding) throws SAXException, StopSniffingException {
398            encoding = encoding.toUpperCase();
399            try {
400                // XXX deviating from the spec as per mjs on IRC.
401                if ("UTF-16".equals(encoding) || "UTF-16BE".equals(encoding) || "UTF-16LE".equals(encoding) || "UTF-32".equals(encoding) || "UTF-32BE".equals(encoding) || "UTF-32LE".equals(encoding)) {
402                    this.charsetDecoder = Charset.forName("UTF-8").newDecoder();
403                    err("The internal character encoding declaration specified \u201C" + encoding + "\u201D which is not a rough superset of ASCII. Using \u201CUTF-8\u201D instead.");
404                    throw new StopSniffingException();
405                } else {
406                    Charset cs = Charset.forName(encoding);
407                    String canonName = cs.name();
408                    if (!EncodingInfo.isAsciiSuperset(canonName)) {
409                        err("The encoding \u201C"
410                                    + encoding
411                                    + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm.");
412                        return;
413                    }
414                    if (canonName.startsWith("X-") || canonName.startsWith("x-")
415                            || canonName.startsWith("Mac")) {
416                        if (encoding.startsWith("X-")) {
417                            err("The encoding \u201C" + encoding
418                                    + "\u201D is not an IANA-registered encoding. (Charmod C022)");
419                        } else {
420                            err("The encoding \u201C" + encoding
421                                    + "\u201D is not an IANA-registered encoding and did\u2019t start with \u201CX-\u201D. (Charmod C023)");
422                        }
423                    } else if (!canonName.equalsIgnoreCase(encoding)) {
424                        err("The encoding \u201C" + encoding
425                                + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
426                                + canonName + "\u201D. (Charmod C024)");
427                    }
428                    if (EncodingInfo.isObscure(canonName)) {
429                        warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
430                    }
431                    this.charsetDecoder = cs.newDecoder();
432                    throw new StopSniffingException();
433                }
434            } catch (IllegalCharsetNameException e) {
435                err("Illegal character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing.");
436            } catch (UnsupportedCharsetException e) {
437                err("Unsupported character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing.");
438            }
439        }
440    
441        /**
442         * @param string
443         * @throws SAXException
444         */
445        private void err(String message) throws SAXException {
446            if (errorHandler != null) {
447              SAXParseException spe = new SAXParseException(message, this);
448              errorHandler.error(spe);
449            }
450        }
451    
452        /**
453         * @param string
454         * @throws SAXException
455         */
456        private void warn(String message) throws SAXException {
457            if (errorHandler != null) {
458              SAXParseException spe = new SAXParseException(message, this);
459              errorHandler.warning(spe);
460            }
461        }
462        
463        private void quotedAttribute(int delim) throws SAXException, StopSniffingException, IOException {
464            int b;
465            for (;;) {
466                b = read();
467                if (b == delim) {
468                    checkAttribute();
469                    return;
470                } else {
471                    if (metaState == MetaState.A) {
472                        attributeValue.append((char) b);
473                    }
474                }
475            }
476        }
477    
478        private void consumeUntilAndIncludingGt() throws IOException, StopSniffingException {
479            for (;;) {
480                if (read() == 0x3E) { // >
481                    return;
482                }
483            }
484        }
485    
486        /**
487         * Seen < , /
488         * 
489         * @throws SAXException
490         * @throws StopSniffingException 
491         * @throws IOException 
492         */
493        private void endTag() throws SAXException, StopSniffingException, IOException {
494            int b = read();
495            if ((b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A)) { // ASCII
496                // letter
497                metaState = MetaState.NO;
498                tag();
499            } else {
500                consumeUntilAndIncludingGt();
501            }
502        }
503    
504        /**
505         * Seen < , !
506         * @throws IOException 
507         * @throws StopSniffingException 
508         */
509        private void markupDecl() throws IOException, StopSniffingException {
510            if (read() == 0x2D) { // -
511                comment();
512            } else {
513                consumeUntilAndIncludingGt();
514            }
515        }
516    
517        /**
518         * Seen < , ! , -
519         * @throws IOException 
520         * @throws StopSniffingException 
521         */
522        private void comment() throws IOException, StopSniffingException {
523            if (read() == 0x2D) { // -
524                int hyphensSeen = 2;
525                for (;;) {
526                    int b = read();
527                    if (b == 0x2D) { // -
528                        hyphensSeen++;
529                    } else if (b == 0x3E) { // >
530                        if (hyphensSeen >= 2) {
531                            return;
532                        } else {
533                            hyphensSeen = 0;
534                        }
535                    } else {
536                        hyphensSeen = 0;
537                    }
538                }
539            } else {
540                consumeUntilAndIncludingGt();
541            }
542        }
543    
544        public int getColumnNumber() {
545            return col;
546        }
547    
548        public int getLineNumber() {
549            return line;
550        }
551    
552        public String getPublicId() {
553            if (locator != null) {
554                return locator.getPublicId();
555            }
556            return null;
557        }
558    
559        public String getSystemId() {
560            if (locator != null) {
561                return locator.getSystemId();
562            }
563            return null;
564        }
565    
566    }