001    /*
002     * Copyright (c) 2008 Mozilla Foundation
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package nu.validator.htmlparser.extra;
024    
025    import java.io.IOException;
026    import java.io.InputStream;
027    
028    import nu.validator.htmlparser.common.ByteReadable;
029    import nu.validator.htmlparser.io.Encoding;
030    
031    import com.ibm.icu.text.CharsetDetector;
032    import com.ibm.icu.text.CharsetMatch;
033    
034    public class IcuDetectorSniffer extends InputStream {
035    
036        private final ByteReadable source;
037    
038        /**
039         * @param source
040         */
041        public IcuDetectorSniffer(final ByteReadable source) {
042            this.source = source;
043        }
044        
045        @Override
046        public int read() throws IOException {
047            return source.readByte();
048        }
049        
050        public Encoding sniff() throws IOException {
051            try {
052                CharsetDetector detector = new CharsetDetector();
053                detector.setText(this);
054                CharsetMatch match = detector.detect();
055                Encoding enc = Encoding.forName(match.getName());
056                Encoding actual = enc.getActualHtmlEncoding();
057                if (actual != null) {
058                    enc = actual;
059                }
060                if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) {
061                    return enc;
062                } else {
063                    return null;
064                }
065            } catch (Exception e) {
066                return null;
067            }
068        }
069        
070        public static void main(String[] args) {
071            String[] detectable = CharsetDetector.getAllDetectableCharsets();
072            for (int i = 0; i < detectable.length; i++) {
073                String charset = detectable[i];
074                System.out.println(charset);
075            }
076        }
077    }