001    /*
002     * Copyright (c) 2008 Mozilla Foundation
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package nu.validator.htmlparser.extra;
024    
025    import java.io.IOException;
026    import java.nio.charset.UnsupportedCharsetException;
027    
028    import nu.validator.htmlparser.io.Encoding;
029    
030    import org.mozilla.intl.chardet.nsDetector;
031    import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
032    import org.mozilla.intl.chardet.nsPSMDetector;
033    
034    import com.ibm.icu.text.CharsetDetector;
035    
036    public class ChardetSniffer implements nsICharsetDetectionObserver {
037    
038        private final byte[] source;
039    
040        private final int length;
041        
042        private Encoding returnValue = null;
043        
044        /**
045         * @param source
046         */
047        public ChardetSniffer(final byte[] source, final int length) {
048            this.source = source;
049            this.length = length;
050        }
051        
052        public Encoding sniff() throws IOException {
053            nsDetector detector = new nsDetector(nsPSMDetector.ALL);
054            detector.Init(this);
055            detector.DoIt(source, length, false);
056            detector.DataEnd();
057            if (returnValue != null && returnValue != Encoding.WINDOWS1252 && returnValue.isAsciiSuperset()) {
058                return returnValue;
059            } else {
060                return null;
061            }
062        }
063        
064        public static void main(String[] args) {
065            String[] detectable = CharsetDetector.getAllDetectableCharsets();
066            for (int i = 0; i < detectable.length; i++) {
067                String charset = detectable[i];
068                System.out.println(charset);
069            }
070        }
071    
072        public void Notify(String charsetName) {
073            try {
074                Encoding enc = Encoding.forName(charsetName);
075                Encoding actual = enc.getActualHtmlEncoding();
076                if (actual != null) {
077                    enc = actual;
078                }
079                returnValue = enc;
080            } catch (UnsupportedCharsetException e) {
081                returnValue = null;
082            }
083        }
084    }