001 /*
002 * Copyright (c) 2008 Mozilla Foundation
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package nu.validator.htmlparser.extra;
024
025 import java.io.IOException;
026 import java.nio.charset.UnsupportedCharsetException;
027
028 import nu.validator.htmlparser.io.Encoding;
029
030 import org.mozilla.intl.chardet.nsDetector;
031 import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
032 import org.mozilla.intl.chardet.nsPSMDetector;
033
034 import com.ibm.icu.text.CharsetDetector;
035
036 public class ChardetSniffer implements nsICharsetDetectionObserver {
037
038 private final byte[] source;
039
040 private final int length;
041
042 private Encoding returnValue = null;
043
044 /**
045 * @param source
046 */
047 public ChardetSniffer(final byte[] source, final int length) {
048 this.source = source;
049 this.length = length;
050 }
051
052 public Encoding sniff() throws IOException {
053 nsDetector detector = new nsDetector(nsPSMDetector.ALL);
054 detector.Init(this);
055 detector.DoIt(source, length, false);
056 detector.DataEnd();
057 if (returnValue != null && returnValue != Encoding.WINDOWS1252 && returnValue.isAsciiSuperset()) {
058 return returnValue;
059 } else {
060 return null;
061 }
062 }
063
064 public static void main(String[] args) {
065 String[] detectable = CharsetDetector.getAllDetectableCharsets();
066 for (int i = 0; i < detectable.length; i++) {
067 String charset = detectable[i];
068 System.out.println(charset);
069 }
070 }
071
072 public void Notify(String charsetName) {
073 try {
074 Encoding enc = Encoding.forName(charsetName);
075 Encoding actual = enc.getActualHtmlEncoding();
076 if (actual != null) {
077 enc = actual;
078 }
079 returnValue = enc;
080 } catch (UnsupportedCharsetException e) {
081 returnValue = null;
082 }
083 }
084 }