001 /* 002 * Copyright (c) 2006 Henri Sivonen 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package nu.validator.htmlparser.impl; 024 025 import java.io.ByteArrayInputStream; 026 import java.io.IOException; 027 import java.io.InputStreamReader; 028 import java.io.Reader; 029 import java.nio.charset.Charset; 030 import java.nio.charset.CharsetDecoder; 031 import java.nio.charset.CodingErrorAction; 032 import java.util.Arrays; 033 import java.util.Iterator; 034 import java.util.Map; 035 import java.util.SortedMap; 036 import java.util.SortedSet; 037 import java.util.TreeSet; 038 039 public class EncodingInfo { 040 041 private static String[] NOT_OBSCURE = {"Big5", 042 "Big5-HKSCS", 043 "EUC-JP", 044 "EUC-KR", 045 "GB18030", 046 "GBK", 047 "ISO-2022-JP", 048 "ISO-2022-KR", 049 "ISO-8859-1", 050 "ISO-8859-13", 051 "ISO-8859-15", 052 "ISO-8859-2", 053 "ISO-8859-3", 054 "ISO-8859-4", 055 "ISO-8859-5", 056 "ISO-8859-6", 057 "ISO-8859-7", 058 "ISO-8859-8", 059 "ISO-8859-9", 060 "KOI8-R", 061 "Shift_JIS", 062 "TIS-620", 063 "US-ASCII", 064 "UTF-16", 065 "UTF-16BE", 066 "UTF-16LE", 067 "UTF-8", 068 "windows-1250", 069 "windows-1251", 070 "windows-1252", 071 "windows-1253", 072 "windows-1254", 073 "windows-1255", 074 "windows-1256", 075 "windows-1257", 076 "windows-1258"}; 077 078 private static String[] asciiSuperset; 079 080 private static String[] notAsciiSuperset; 081 082 static { 083 byte[] testBuf = new byte[0x63]; 084 for (int i = 0; i < 0x60; i++) { 085 testBuf[i] = (byte) (i + 0x20); 086 } 087 testBuf[0x60] = (byte) '\n'; 088 testBuf[0x61] = (byte) '\r'; 089 testBuf[0x62] = (byte) '\t'; 090 091 SortedSet<String> asciiSupersetSet = new TreeSet<String>(); 092 SortedSet<String> notAsciiSupersetSet = new TreeSet<String>(); 093 094 SortedMap charsets = Charset.availableCharsets(); 095 for (Iterator iter = charsets.entrySet().iterator(); iter.hasNext();) { 096 Map.Entry entry = (Map.Entry) iter.next(); 097 Charset cs = (Charset) entry.getValue(); 098 if (asciiMapsToBasicLatin(testBuf, cs)) { 099 asciiSupersetSet.add(cs.name().intern()); 100 } else { 101 notAsciiSupersetSet.add(cs.name().intern()); 102 } 103 } 104 105 asciiSuperset = (String[]) asciiSupersetSet.toArray(new String[0]); 106 notAsciiSuperset = (String[]) notAsciiSupersetSet.toArray(new String[0]); 107 } 108 109 public static boolean isAsciiSuperset(String preferredIanaName) { 110 return (Arrays.binarySearch(asciiSuperset, preferredIanaName) > -1); 111 } 112 113 public static boolean isNotAsciiSuperset(String preferredIanaName) { 114 return (Arrays.binarySearch(notAsciiSuperset, preferredIanaName) > -1); 115 } 116 117 public static boolean isObscure(String preferredIanaName) { 118 return !(Arrays.binarySearch(NOT_OBSCURE, preferredIanaName) > -1); 119 } 120 121 /** 122 * @param testBuf 123 * @param cs 124 */ 125 private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) { 126 CharsetDecoder dec = cs.newDecoder(); 127 dec.onMalformedInput(CodingErrorAction.REPORT); 128 dec.onUnmappableCharacter(CodingErrorAction.REPORT); 129 Reader r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec); 130 try { 131 for (int i = 0; i < 0x60; i++) { 132 if ((i + 0x20) != r.read()) { 133 return false; 134 } 135 } 136 if ('\n' != r.read()) { 137 return false; 138 } 139 if ('\r' != r.read()) { 140 return false; 141 } 142 if ('\t' != r.read()) { 143 return false; 144 } 145 } catch (IOException e) { 146 return false; 147 } catch (Exception e) { 148 return false; 149 } 150 151 return true; 152 } 153 154 public static void main(String[] args) { 155 System.out.println("ASCII maps to Basic Latin:"); 156 for (int i = 0; i < asciiSuperset.length; i++) { 157 System.out.println(asciiSuperset[i]); 158 } 159 System.out.println(); 160 System.out.println("ASCII does not map to Basic Latin:"); 161 for (int i = 0; i < notAsciiSuperset.length; i++) { 162 System.out.println(notAsciiSuperset[i]); 163 } 164 } 165 }