001 /*
002 * Copyright (c) 2006 Henri Sivonen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package nu.validator.htmlparser.impl;
024
025 import java.io.ByteArrayInputStream;
026 import java.io.IOException;
027 import java.io.InputStreamReader;
028 import java.io.Reader;
029 import java.nio.charset.Charset;
030 import java.nio.charset.CharsetDecoder;
031 import java.nio.charset.CodingErrorAction;
032 import java.util.Arrays;
033 import java.util.Iterator;
034 import java.util.Map;
035 import java.util.SortedMap;
036 import java.util.SortedSet;
037 import java.util.TreeSet;
038
/**
 * Classifies character encodings by name.
 *
 * <p>When the class is initialized, every charset returned by
 * {@link Charset#availableCharsets()} is probed once to see whether it decodes
 * the bytes 0x20 through 0x7F plus LF, CR and TAB to the code points of the
 * same numeric value. The resulting two name lists back
 * {@link #isAsciiSuperset(String)} and {@link #isNotAsciiSuperset(String)}.
 *
 * <p>All lookups are exact, case-sensitive matches against the names reported
 * by {@link Charset#name()}; aliases are not resolved.
 */
public class EncodingInfo {

    /**
     * Encoding names that {@link #isObscure(String)} reports as not obscure.
     *
     * <p>This array is searched with {@link Arrays#binarySearch(Object[], Object)}
     * and therefore MUST be kept sorted in natural {@code String} order
     * (note that upper-case letters sort before lower-case ones).
     */
    private static final String[] NOT_OBSCURE = {
        "Big5",
        "Big5-HKSCS",
        "EUC-JP",
        "EUC-KR",
        "GB18030",
        "GBK",
        "ISO-2022-JP",
        "ISO-2022-KR",
        "ISO-8859-1",
        "ISO-8859-13",
        "ISO-8859-15",
        "ISO-8859-2",
        "ISO-8859-3",
        "ISO-8859-4",
        "ISO-8859-5",
        "ISO-8859-6",
        "ISO-8859-7",
        "ISO-8859-8",
        "ISO-8859-9",
        "KOI8-R",
        "Shift_JIS",
        "TIS-620",
        "US-ASCII",
        "UTF-16",
        "UTF-16BE",
        "UTF-16LE",
        "UTF-8",
        "windows-1250",
        "windows-1251",
        "windows-1252",
        "windows-1253",
        "windows-1254",
        "windows-1255",
        "windows-1256",
        "windows-1257",
        "windows-1258"
    };

    /** Sorted names of the available charsets that passed the ASCII probe. */
    private static final String[] asciiSuperset;

    /** Sorted names of the available charsets that failed the ASCII probe. */
    private static final String[] notAsciiSuperset;

    static {
        // Probe input: bytes 0x20..0x7F followed by LF, CR and TAB.
        byte[] testBuf = new byte[0x63];
        for (int i = 0; i < 0x60; i++) {
            testBuf[i] = (byte) (i + 0x20);
        }
        testBuf[0x60] = (byte) '\n';
        testBuf[0x61] = (byte) '\r';
        testBuf[0x62] = (byte) '\t';

        // TreeSet keeps the names sorted, which the binary searches below rely on.
        SortedSet<String> asciiSupersetSet = new TreeSet<String>();
        SortedSet<String> notAsciiSupersetSet = new TreeSet<String>();

        for (Charset cs : Charset.availableCharsets().values()) {
            if (asciiMapsToBasicLatin(testBuf, cs)) {
                asciiSupersetSet.add(cs.name());
            } else {
                notAsciiSupersetSet.add(cs.name());
            }
        }

        asciiSuperset = asciiSupersetSet.toArray(new String[0]);
        notAsciiSuperset = notAsciiSupersetSet.toArray(new String[0]);
    }

    /**
     * Tells whether the named charset was available at class initialization
     * and passed the ASCII probe.
     *
     * @param preferredIanaName the charset name to look up (exact,
     *        case-sensitive match against {@link Charset#name()}); must not
     *        be {@code null}
     * @return {@code true} if the name is in the list of charsets that passed
     *         the probe; {@code false} otherwise, including for unknown names
     */
    public static boolean isAsciiSuperset(String preferredIanaName) {
        return Arrays.binarySearch(asciiSuperset, preferredIanaName) >= 0;
    }

    /**
     * Tells whether the named charset was available at class initialization
     * and failed the ASCII probe.
     *
     * <p>This is not the negation of {@link #isAsciiSuperset(String)}: a name
     * that matches no available charset yields {@code false} from both.
     *
     * @param preferredIanaName the charset name to look up (exact,
     *        case-sensitive match against {@link Charset#name()}); must not
     *        be {@code null}
     * @return {@code true} if the name is in the list of charsets that failed
     *         the probe; {@code false} otherwise, including for unknown names
     */
    public static boolean isNotAsciiSuperset(String preferredIanaName) {
        return Arrays.binarySearch(notAsciiSuperset, preferredIanaName) >= 0;
    }

    /**
     * Tells whether the given name is absent from the built-in list of
     * non-obscure encodings.
     *
     * @param preferredIanaName the encoding name to look up (exact,
     *        case-sensitive match); must not be {@code null}
     * @return {@code false} if the name is in the built-in list;
     *         {@code true} for every other name
     */
    public static boolean isObscure(String preferredIanaName) {
        return Arrays.binarySearch(NOT_OBSCURE, preferredIanaName) < 0;
    }

    /**
     * Checks whether a charset decodes every byte of the probe buffer to the
     * character with the same numeric value.
     *
     * @param testBuf the probe bytes; each byte is expected to decode to
     *        exactly one char equal to its unsigned value
     * @param cs the charset to probe
     * @return {@code true} if all bytes decoded to the expected chars;
     *         {@code false} on any mismatch, premature end of input, decoding
     *         error, or any other exception raised while creating or using
     *         the decoder
     */
    private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
        Reader r = null;
        try {
            // Decoder creation is inside the try block so that a charset
            // whose decoder cannot be created is classified as "not a
            // superset" instead of aborting class initialization.
            CharsetDecoder dec = cs.newDecoder();
            // REPORT makes bad input raise an exception rather than being
            // silently replaced, so it cannot be mistaken for a match.
            dec.onMalformedInput(CodingErrorAction.REPORT);
            dec.onUnmappableCharacter(CodingErrorAction.REPORT);
            r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec);
            for (int i = 0; i < testBuf.length; i++) {
                // read() returns -1 at end of input, which never equals an
                // unsigned byte value, so short output is also a mismatch.
                if ((testBuf[i] & 0xFF) != r.read()) {
                    return false;
                }
            }
            return true;
        } catch (Exception e) {
            // Deliberately broad: besides IOException from decoding errors,
            // charset implementations may throw unchecked exceptions. Any
            // failure means the charset did not pass the probe.
            return false;
        } finally {
            if (r != null) {
                try {
                    r.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done; the source is in-memory.
                }
            }
        }
    }

    /**
     * Prints the names of the charsets that passed the ASCII probe, then the
     * names of those that failed it, to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("ASCII maps to Basic Latin:");
        for (int i = 0; i < asciiSuperset.length; i++) {
            System.out.println(asciiSuperset[i]);
        }
        System.out.println();
        System.out.println("ASCII does not map to Basic Latin:");
        for (int i = 0; i < notAsciiSuperset.length; i++) {
            System.out.println(notAsciiSuperset[i]);
        }
    }
}