/*
 * Copyright (c) 2008 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.htmlparser.extra;

import java.io.IOException;
import java.nio.charset.UnsupportedCharsetException;

import nu.validator.htmlparser.io.Encoding;

import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.intl.chardet.nsPSMDetector;

import com.ibm.icu.text.CharsetDetector;

/**
 * Guesses the character encoding of a byte buffer by running the
 * <code>org.mozilla.intl.chardet</code> detector over it.
 *
 * The instance registers itself as the detector's observer and records the
 * charset name the detector reports through {@link #Notify(String)}.
 */
public class ChardetSniffer implements nsICharsetDetectionObserver {

    /** Bytes to examine; the caller's array is stored by reference. */
    private final byte[] source;

    /** Number of bytes of <code>source</code> handed to the detector. */
    private final int length;

    /** Encoding recorded by the most recent {@link #Notify(String)} call. */
    private Encoding returnValue = null;

    /**
     * Creates a sniffer over the given buffer.
     *
     * @param source
     *            the bytes to examine; the array is not copied
     * @param length
     *            the byte count passed to the detector along with
     *            <code>source</code>
     */
    public ChardetSniffer(final byte[] source, final int length) {
        this.source = source;
        this.length = length;
    }

    /**
     * Runs the detector over the buffer and returns the encoding it reported.
     *
     * @return the reported encoding, or <code>null</code> when nothing was
     *         reported, when the result is {@link Encoding#WINDOWS1252}, or
     *         when the result is not an ASCII superset
     * @throws IOException
     *             declared in the signature; retained so that existing
     *             callers keep compiling unchanged
     */
    public Encoding sniff() throws IOException {
        // Clear anything left by an earlier run or a direct Notify call so
        // the result reflects only this detection pass.
        returnValue = null;

        nsDetector detector = new nsDetector(nsPSMDetector.ALL);
        detector.Init(this);
        detector.DoIt(source, length, false);
        detector.DataEnd();

        if (returnValue == null) {
            return null;
        }
        // Encoding constants are compared by identity, as in the rest of
        // this check; a windows-1252 guess is treated as "no answer".
        if (returnValue == Encoding.WINDOWS1252
                || !returnValue.isAsciiSuperset()) {
            return null;
        }
        return returnValue;
    }

    /**
     * Prints every charset name returned by ICU's
     * {@link CharsetDetector#getAllDetectableCharsets()}, one per line.
     *
     * Note that this lists ICU's detectable charsets, not those of the
     * detector used by {@link #sniff()}.
     *
     * @param args
     *            ignored
     */
    public static void main(String[] args) {
        String[] detectable = CharsetDetector.getAllDetectableCharsets();
        for (int i = 0; i < detectable.length; i++) {
            System.out.println(detectable[i]);
        }
    }

    /**
     * Observer callback: records the encoding for the reported charset name.
     *
     * If the resolved encoding supplies a non-null
     * <code>getActualHtmlEncoding()</code>, that encoding is recorded
     * instead. A name that {@link Encoding#forName(String)} rejects with
     * {@link UnsupportedCharsetException} clears the recorded result.
     *
     * @param charsetName
     *            the charset name reported by the detector
     */
    public void Notify(String charsetName) {
        try {
            Encoding enc = Encoding.forName(charsetName);
            Encoding actual = enc.getActualHtmlEncoding();
            if (actual != null) {
                enc = actual;
            }
            returnValue = enc;
        } catch (UnsupportedCharsetException e) {
            // Unknown name: treat as "nothing detected" rather than failing.
            returnValue = null;
        }
    }
}