001    /*
002     * Copyright (c) 2007 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package nu.validator.htmlparser.io;
024    
025    import java.io.IOException;
026    
027    import nu.validator.htmlparser.common.ByteReadable;
028    
029    /**
030     * The BOM sniffing part of the HTML5 encoding sniffing algorithm.
031     * 
032     * @version $Id$
033     * @author hsivonen
034     */
035    public final class BomSniffer {
036        
037        private final ByteReadable source;
038    
039        /**
040         * @param source
041         */
042        public BomSniffer(final ByteReadable source) {
043            this.source = source;
044        }
045        
046        Encoding sniff() throws IOException {
047            int b = source.readByte();
048            if (b == 0xEF) { // UTF-8
049                b = source.readByte();
050                if (b == 0xBB) {
051                    b = source.readByte();
052                    if (b == 0xBF) {
053                        return Encoding.UTF8;
054                    } else {
055                        return null;
056                    }
057                } else {
058                    return null;
059                }
060            } else if (b == 0xFF) { // little-endian
061                b = source.readByte();
062                if (b == 0xFE) {
063                    return Encoding.UTF16LE;
064                } else {
065                    return null;
066                }
067            } else if (b == 0xFE) { // big-endian UTF-16
068                b = source.readByte();
069                if (b == 0xFF) {
070                    return Encoding.UTF16BE;
071                } else {
072                    return null;
073                }
074            } else {
075                return null;            
076            }
077        }
078        
079    }