001 /* 002 * Copyright (c) 2008 Mozilla Foundation 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package nu.validator.htmlparser.extra; 024 025 import java.io.IOException; 026 import java.io.InputStream; 027 028 import nu.validator.htmlparser.common.ByteReadable; 029 import nu.validator.htmlparser.io.Encoding; 030 031 import com.ibm.icu.text.CharsetDetector; 032 import com.ibm.icu.text.CharsetMatch; 033 034 public class IcuDetectorSniffer extends InputStream { 035 036 private final ByteReadable source; 037 038 /** 039 * @param source 040 */ 041 public IcuDetectorSniffer(final ByteReadable source) { 042 this.source = source; 043 } 044 045 @Override 046 public int read() throws IOException { 047 return source.readByte(); 048 } 049 050 public Encoding sniff() throws IOException { 051 try { 052 CharsetDetector detector = new CharsetDetector(); 053 detector.setText(this); 054 CharsetMatch match = detector.detect(); 055 Encoding enc = Encoding.forName(match.getName()); 056 Encoding actual = enc.getActualHtmlEncoding(); 057 if (actual != null) { 058 enc = actual; 059 } 060 if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) { 061 return enc; 062 } else { 063 return null; 064 } 065 } catch (Exception e) { 066 return null; 067 } 068 } 069 070 public static void main(String[] args) { 071 String[] detectable = CharsetDetector.getAllDetectableCharsets(); 072 for (int i = 0; i < detectable.length; i++) { 073 String charset = detectable[i]; 074 System.out.println(charset); 075 } 076 } 077 }