001 package nu.validator.io;
002
003 import java.io.ByteArrayInputStream;
004 import java.io.IOException;
005 import java.io.InputStreamReader;
006 import java.io.Reader;
007 import java.nio.charset.Charset;
008 import java.nio.charset.CharsetDecoder;
009 import java.nio.charset.CodingErrorAction;
010 import java.util.Arrays;
011 import java.util.Iterator;
012 import java.util.Map;
013 import java.util.SortedMap;
014 import java.util.SortedSet;
015 import java.util.TreeSet;
016
/**
 * Classifies the character encodings available in the running JVM.
 *
 * <p>At class-initialization time every charset returned by
 * {@link Charset#availableCharsets()} is probed with a fixed byte sequence
 * (bytes 0x20 through 0x7F followed by LF, CR and TAB). Charsets whose
 * decoder turns each of those bytes into the code point of the same value
 * are recorded as "ASCII supersets"; all other charsets are recorded as
 * "not ASCII supersets". The class also keeps a fixed list of encoding names
 * that are considered not obscure.
 *
 * <p>All lookups are exact, case-sensitive matches against the name reported
 * by {@link Charset#name()}.
 */
public class EncodingInfo {

    /**
     * Encoding names that are not considered obscure.
     *
     * <p>This array is searched with {@link Arrays#binarySearch(Object[], Object)},
     * so it MUST remain sorted in {@link String} natural order (note that
     * upper-case letters sort before lower-case ones and {@code "ISO-8859-13"}
     * sorts before {@code "ISO-8859-2"}).
     */
    private static final String[] NOT_OBSCURE = {
        "Big5",
        "Big5-HKSCS",
        "EUC-JP",
        "EUC-KR",
        "GB18030",
        "GBK",
        "ISO-2022-JP",
        "ISO-2022-KR",
        "ISO-8859-1",
        "ISO-8859-13",
        "ISO-8859-15",
        "ISO-8859-2",
        "ISO-8859-3",
        "ISO-8859-4",
        "ISO-8859-5",
        "ISO-8859-6",
        "ISO-8859-7",
        "ISO-8859-8",
        "ISO-8859-9",
        "KOI8-R",
        "Shift_JIS",
        "TIS-620",
        "US-ASCII",
        "UTF-16",
        "UTF-16BE",
        "UTF-16LE",
        "UTF-8",
        "windows-1250",
        "windows-1251",
        "windows-1252",
        "windows-1253",
        "windows-1254",
        "windows-1255",
        "windows-1256",
        "windows-1257",
        "windows-1258"
    };

    /** Sorted names of the available charsets that passed the ASCII probe. */
    private static final String[] asciiSuperset;

    /** Sorted names of the available charsets that failed the ASCII probe. */
    private static final String[] notAsciiSuperset;

    static {
        // Probe input: bytes 0x20..0x7F in order, then LF, CR, TAB.
        byte[] testBuf = new byte[0x63];
        for (int i = 0; i < 0x60; i++) {
            testBuf[i] = (byte) (i + 0x20);
        }
        testBuf[0x60] = (byte) '\n';
        testBuf[0x61] = (byte) '\r';
        testBuf[0x62] = (byte) '\t';

        // TreeSet keeps the names in natural order, which is exactly the
        // order Arrays.binarySearch needs for the arrays built below.
        SortedSet<String> asciiSupersetSet = new TreeSet<String>();
        SortedSet<String> notAsciiSupersetSet = new TreeSet<String>();

        SortedMap<String, Charset> charsets = Charset.availableCharsets();
        for (Charset cs : charsets.values()) {
            if (asciiMapsToBasicLatin(testBuf, cs)) {
                asciiSupersetSet.add(cs.name().intern());
            } else {
                notAsciiSupersetSet.add(cs.name().intern());
            }
        }

        asciiSuperset = asciiSupersetSet.toArray(new String[0]);
        notAsciiSuperset = notAsciiSupersetSet.toArray(new String[0]);
    }

    /**
     * Tells whether the named encoding is available in this JVM and passed
     * the ASCII probe.
     *
     * @param preferredIanaName the encoding name to look up; compared
     *        case-sensitively against {@link Charset#name()} values; must
     *        not be {@code null}
     * @return {@code true} if the name belongs to an available charset that
     *         decoded the probe bytes to identical code points
     */
    public static boolean isAsciiSuperset(String preferredIanaName) {
        return Arrays.binarySearch(asciiSuperset, preferredIanaName) >= 0;
    }

    /**
     * Tells whether the named encoding is available in this JVM and failed
     * the ASCII probe. An unknown name yields {@code false} here as well as
     * in {@link #isAsciiSuperset(String)}.
     *
     * @param preferredIanaName the encoding name to look up; compared
     *        case-sensitively against {@link Charset#name()} values; must
     *        not be {@code null}
     * @return {@code true} if the name belongs to an available charset that
     *         did not decode the probe bytes to identical code points
     */
    public static boolean isNotAsciiSuperset(String preferredIanaName) {
        return Arrays.binarySearch(notAsciiSuperset, preferredIanaName) >= 0;
    }

    /**
     * Tells whether the named encoding is absent from the built-in list of
     * non-obscure encodings.
     *
     * @param preferredIanaName the encoding name to look up; compared
     *        case-sensitively; must not be {@code null}
     * @return {@code true} if the name is not on the non-obscure list
     */
    public static boolean isObscure(String preferredIanaName) {
        return Arrays.binarySearch(NOT_OBSCURE, preferredIanaName) < 0;
    }

    /**
     * Checks whether a charset decodes the probe buffer to code points equal
     * to the byte values.
     *
     * @param testBuf the probe bytes: 0x20 through 0x7F followed by LF, CR
     *        and TAB (0x63 bytes in total)
     * @param cs the charset whose decoder is tested
     * @return {@code true} if the first 0x63 characters decoded from
     *         {@code testBuf} match the expected sequence; {@code false} if
     *         any character differs or if creating or running the decoder
     *         throws
     */
    private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
        Reader r = null;
        try {
            // Decoder creation is inside the try on purpose: this method runs
            // from the static initializer, so an exception escaping from a
            // misbehaving charset provider would make the whole class
            // unusable instead of just disqualifying that one charset.
            CharsetDecoder dec = cs.newDecoder();
            // REPORT makes undecodable input raise an exception instead of
            // being replaced silently.
            dec.onMalformedInput(CodingErrorAction.REPORT);
            dec.onUnmappableCharacter(CodingErrorAction.REPORT);
            r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec);

            for (int i = 0; i < 0x60; i++) {
                if ((i + 0x20) != r.read()) {
                    return false;
                }
            }
            return '\n' == r.read() && '\r' == r.read() && '\t' == r.read();
        } catch (Exception e) {
            // Deliberately broad: besides the IOException raised for
            // malformed/unmappable input, any runtime failure inside a
            // decoder simply means "not an ASCII superset".
            return false;
        } finally {
            if (r != null) {
                try {
                    r.close();
                } catch (IOException ignored) {
                    // The reader wraps an in-memory buffer; nothing to recover.
                }
            }
        }
    }

    /**
     * Prints the two charset classifications to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("ASCII maps to Basic Latin:");
        for (String name : asciiSuperset) {
            System.out.println(name);
        }
        System.out.println();
        System.out.println("ASCII does not map to Basic Latin:");
        for (String name : notAsciiSuperset) {
            System.out.println(name);
        }
    }
}