001 /*
002 * Copyright (c) 2006 Henri Sivonen
003 * Copyright (c) 2008 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.htmlparser.io;
025
026 import java.io.ByteArrayInputStream;
027 import java.io.IOException;
028 import java.io.InputStreamReader;
029 import java.io.Reader;
030 import java.nio.charset.Charset;
031 import java.nio.charset.CharsetDecoder;
032 import java.nio.charset.CharsetEncoder;
033 import java.nio.charset.CoderMalfunctionError;
034 import java.nio.charset.CodingErrorAction;
035 import java.nio.charset.UnsupportedCharsetException;
036 import java.util.Arrays;
037 import java.util.HashMap;
038 import java.util.HashSet;
039 import java.util.Map;
040 import java.util.Set;
041 import java.util.SortedMap;
042
/**
 * Describes one character encoding available in the running JVM together
 * with a few classification flags, and provides lookup of encodings by
 * label.
 *
 * <p>All instances are created once, during class initialization, from
 * {@link Charset#availableCharsets()}. Lookup is performed on a normalized
 * ("cooked") form of the label produced by {@link #toNameKey(String)}:
 * ASCII-lowercased with ASCII whitespace and punctuation removed.
 */
public class Encoding {

    /** The encoding registered under the label "utf-8". */
    public static final Encoding UTF8;

    /** The encoding registered under the label "utf-16". */
    public static final Encoding UTF16;

    /** The encoding registered under the label "utf-16le". */
    public static final Encoding UTF16LE;

    /** The encoding registered under the label "utf-16be". */
    public static final Encoding UTF16BE;

    /** The encoding registered under the label "windows-1252". */
    public static final Encoding WINDOWS1252;

    /** Number of leading byte values probed by the ASCII-superset check. */
    private static final int ASCII_TEST_LENGTH = 0x7F;

    /*
     * The three tables below hold cooked names (see toNameKey). They are
     * hash sets rather than arrays searched with Arrays.binarySearch, so
     * membership does not depend on the entries being listed in sorted order.
     */

    /** Cooked names of encodings flagged as "should not". */
    private static final Set<String> SHOULD_NOT = new HashSet<String>(
            Arrays.asList("jisx02121990", "xjis0208"));

    /** Cooked names of charsets that are never registered. */
    private static final Set<String> BANNED = new HashSet<String>(
            Arrays.asList("bocu1", "cesu8", "compoundtext", "iscii91",
                    "macarabic", "maccentraleurroman", "maccroatian",
                    "maccyrillic", "macdevanagari", "macfarsi", "macgreek",
                    "macgujarati", "macgurmukhi", "machebrew",
                    "macicelandic", "macroman", "macromanian", "macthai",
                    "macturkish", "macukranian", "scsu", "utf32", "utf32be",
                    "utf32le", "utf7", "ximapmailboxname", "xjisautodetect",
                    "xutf16bebom", "xutf16lebom", "xutf32bebom",
                    "xutf32lebom", "xutf16oppositeendian",
                    "xutf16platformendian", "xutf32oppositeendian",
                    "xutf32platformendian"));

    /** Cooked names of encodings that are not flagged as obscure. */
    private static final Set<String> NOT_OBSCURE = new HashSet<String>(
            Arrays.asList("big5", "big5hkscs", "eucjp", "euckr", "gb18030",
                    "gbk", "iso2022jp", "iso2022kr", "iso88591", "iso885913",
                    "iso885915", "iso88592", "iso88593", "iso88594",
                    "iso88595", "iso88596", "iso88597", "iso88598",
                    "iso88599", "koi8r", "shiftjis", "tis620", "usascii",
                    "utf16", "utf16be", "utf16le", "utf8", "windows1250",
                    "windows1251", "windows1252", "windows1253",
                    "windows1254", "windows1255", "windows1256",
                    "windows1257", "windows1258"));

    /** Registry of encodings keyed by cooked name or cooked alias. */
    private static final Map<String, Encoding> encodingByCookedName = new HashMap<String, Encoding>();

    private final String canonName;

    private final Charset charset;

    private final boolean asciiSuperset;

    private final boolean obscure;

    private final boolean shouldNot;

    private final boolean likelyEbcdic;

    /** Set only during class initialization; {@code null} when no substitute is configured. */
    private Encoding actualHtmlEncoding = null;

    static {
        // Probe buffer: every "sensitive" byte value stands for itself, every
        // other position is filled with a space.
        byte[] testBuf = new byte[ASCII_TEST_LENGTH];
        for (int i = 0; i < ASCII_TEST_LENGTH; i++) {
            testBuf[i] = (byte) (isAsciiSupersetnessSensitive(i) ? i : 0x20);
        }

        Set<Encoding> encodings = new HashSet<Encoding>();

        SortedMap<String, Charset> charsets = Charset.availableCharsets();
        for (Charset cs : charsets.values()) {
            String name = toNameKey(cs.name());
            if (isBanned(name)) {
                continue;
            }
            // Interned so that equal canonical names share one String instance.
            String canonName = toAsciiLowerCase(cs.name()).intern();
            boolean asciiSuperset = asciiMapsToBasicLatin(testBuf, cs);
            Encoding enc = new Encoding(canonName, cs, asciiSuperset,
                    isObscure(name), isShouldNot(name),
                    isLikelyEbcdic(name, asciiSuperset));
            encodings.add(enc);
            for (String alias : cs.aliases()) {
                encodingByCookedName.put(toNameKey(alias), enc);
            }
        }

        // Overwrite possible overlapping aliases with the real things--just
        // in case
        for (Encoding encoding : encodings) {
            encodingByCookedName.put(toNameKey(encoding.getCanonName()),
                    encoding);
        }

        UTF8 = forName("utf-8");
        UTF16 = forName("utf-16");
        UTF16BE = forName("utf-16be");
        UTF16LE = forName("utf-16le");
        WINDOWS1252 = forName("windows-1252");

        linkActualHtmlEncoding("iso-8859-1", "windows-1252");
        linkActualHtmlEncoding("iso-8859-9", "windows-1254");
        linkActualHtmlEncoding("iso-8859-11", "windows-874");
        linkActualHtmlEncoding("x-iso-8859-11", "windows-874");
        linkActualHtmlEncoding("tis-620", "windows-874");
        linkActualHtmlEncoding("gb_2312-80", "gbk");
        linkActualHtmlEncoding("gb2312", "gbk");

        registerLabel("x-x-big5", "big5");
        registerLabel("euc-kr", "windows-949");
        registerLabel("ks_c_5601-1987", "windows-949");
    }

    /**
     * Records that the encoding registered as {@code label} has
     * {@code actual} as its actual HTML encoding. Does nothing when either
     * label is not registered, because the set of charsets depends on the
     * JVM and missing ones are expected.
     */
    private static void linkActualHtmlEncoding(String label, String actual) {
        try {
            Encoding declared = forName(label);
            declared.actualHtmlEncoding = forName(actual);
        } catch (UnsupportedCharsetException ignored) {
            // Best effort: one of the two charsets is not available here.
        }
    }

    /**
     * Makes {@code label} resolve to the encoding registered as
     * {@code target}, replacing any existing registration of that label.
     * The label is stored in cooked form because {@link #forName(String)}
     * only ever looks up cooked names; a raw key containing punctuation
     * could never be matched. Does nothing when {@code target} is not
     * registered.
     */
    private static void registerLabel(String label, String target) {
        try {
            encodingByCookedName.put(toNameKey(label), forName(target));
        } catch (UnsupportedCharsetException ignored) {
            // Best effort: the target charset is not available here.
        }
    }

    /**
     * Tells whether byte value {@code c} is one of those that must decode
     * to the same code point for a charset to count as an ASCII superset:
     * 0x09-0x0D, 0x20-0x22, 0x26-0x27, 0x2C-0x3F, A-Z and a-z.
     */
    private static boolean isAsciiSupersetnessSensitive(int c) {
        return (c >= 0x09 && c <= 0x0D) || (c >= 0x20 && c <= 0x22)
                || (c >= 0x26 && c <= 0x27) || (c >= 0x2C && c <= 0x3F)
                || (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A);
    }

    /** Returns {@code true} unless the cooked name is listed in {@code NOT_OBSCURE}. */
    private static boolean isObscure(String lowerCasePreferredIanaName) {
        return !NOT_OBSCURE.contains(lowerCasePreferredIanaName);
    }

    /**
     * Returns {@code true} when the cooked name starts with "xibm" or is
     * listed in {@code BANNED}.
     */
    private static boolean isBanned(String lowerCasePreferredIanaName) {
        return lowerCasePreferredIanaName.startsWith("xibm")
                || BANNED.contains(lowerCasePreferredIanaName);
    }

    /** Returns {@code true} when the cooked name is listed in {@code SHOULD_NOT}. */
    private static boolean isShouldNot(String lowerCasePreferredIanaName) {
        return SHOULD_NOT.contains(lowerCasePreferredIanaName);
    }

    /**
     * Decodes the probe buffer with {@code cs} and checks that each of the
     * first 0x7F decoded characters equals the expected one: the byte value
     * itself at sensitive positions and a space elsewhere.
     *
     * @param testBuf
     *            the probe buffer built during class initialization
     * @param cs
     *            the charset to examine
     * @return {@code true} if every character matched; {@code false} on any
     *         mismatch or on any failure while creating or running the
     *         decoder
     */
    private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
        Reader r = null;
        try {
            // Creating the decoder is inside the guarded region so that a
            // charset which cannot supply one is reported as "not a
            // superset" instead of aborting class initialization.
            CharsetDecoder dec = cs.newDecoder();
            dec.onMalformedInput(CodingErrorAction.REPORT);
            dec.onUnmappableCharacter(CodingErrorAction.REPORT);
            r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec);
            for (int i = 0; i < ASCII_TEST_LENGTH; i++) {
                int expected = isAsciiSupersetnessSensitive(i) ? i : 0x20;
                if (r.read() != expected) {
                    return false;
                }
            }
            return true;
        } catch (Exception e) {
            return false;
        } catch (CoderMalfunctionError e) {
            return false;
        } finally {
            if (r != null) {
                try {
                    r.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done about a failed close here.
                }
            }
        }
    }

    /**
     * Heuristic: a charset that is not an ASCII superset and whose name
     * starts with "cp", "ibm" or "xibm" is considered likely EBCDIC.
     *
     * @param canonName
     *            the name to test (class initialization passes the cooked
     *            name)
     * @param asciiSuperset
     *            result of the ASCII-superset probe for the same charset
     */
    private static boolean isLikelyEbcdic(String canonName,
            boolean asciiSuperset) {
        if (asciiSuperset) {
            return false;
        }
        return canonName.startsWith("cp") || canonName.startsWith("ibm")
                || canonName.startsWith("xibm");
    }

    /**
     * Looks up an encoding by label. The label is normalized with
     * {@link #toNameKey(String)} before the lookup.
     *
     * @param name
     *            the encoding label
     * @return the registered encoding, never {@code null}
     * @throws UnsupportedCharsetException
     *             if nothing is registered under the normalized label
     */
    public static Encoding forName(String name) {
        Encoding rv = encodingByCookedName.get(toNameKey(name));
        if (rv == null) {
            throw new UnsupportedCharsetException(name);
        }
        return rv;
    }

    /**
     * Normalizes an encoding label into a lookup key: ASCII upper-case
     * letters are lower-cased, and characters in the ranges U+0009-U+000D,
     * U+0020-U+002F, U+003A-U+0040, U+005B-U+0060 and U+007B-U+007E are
     * dropped. All other characters are kept unchanged.
     *
     * @param str
     *            the label, may be {@code null}
     * @return the key, or {@code null} if {@code str} is {@code null}
     */
    public static String toNameKey(String str) {
        if (str == null) {
            return null;
        }
        int len = str.length();
        char[] buf = new char[len];
        int kept = 0;
        for (int i = 0; i < len; i++) {
            char c = str.charAt(i);
            if (c >= 'A' && c <= 'Z') {
                c += 0x20;
            }
            boolean dropped = (c >= '\t' && c <= '\r')
                    || (c >= '\u0020' && c <= '\u002F')
                    || (c >= '\u003A' && c <= '\u0040')
                    || (c >= '\u005B' && c <= '\u0060')
                    || (c >= '\u007B' && c <= '\u007E');
            if (!dropped) {
                buf[kept++] = c;
            }
        }
        return new String(buf, 0, kept);
    }

    /**
     * Lower-cases the ASCII letters A-Z of a string and leaves every other
     * character untouched.
     *
     * @param str
     *            the input, may be {@code null}
     * @return the converted string, or {@code null} if {@code str} is
     *         {@code null}
     */
    public static String toAsciiLowerCase(String str) {
        if (str == null) {
            return null;
        }
        char[] buf = str.toCharArray();
        for (int i = 0; i < buf.length; i++) {
            char c = buf[i];
            if (c >= 'A' && c <= 'Z') {
                buf[i] = (char) (c + 0x20);
            }
        }
        return new String(buf);
    }

    /**
     * Creates an encoding descriptor; only called during class
     * initialization.
     *
     * @param canonName
     *            ASCII-lowercased canonical charset name
     * @param charset
     *            the underlying charset
     * @param asciiSuperset
     *            result of the ASCII-superset probe
     * @param obscure
     *            whether the encoding is flagged obscure
     * @param shouldNot
     *            whether the encoding is flagged "should not"
     * @param likelyEbcdic
     *            whether the encoding is flagged as likely EBCDIC
     */
    private Encoding(final String canonName, final Charset charset,
            final boolean asciiSuperset, final boolean obscure,
            final boolean shouldNot, final boolean likelyEbcdic) {
        this.canonName = canonName;
        this.charset = charset;
        this.asciiSuperset = asciiSuperset;
        this.obscure = obscure;
        this.shouldNot = shouldNot;
        this.likelyEbcdic = likelyEbcdic;
    }

    /**
     * Returns whether the probe performed at class initialization found
     * this encoding to decode the tested ASCII bytes to themselves.
     *
     * @return the asciiSuperset flag
     */
    public boolean isAsciiSuperset() {
        return asciiSuperset;
    }

    /**
     * Returns the ASCII-lowercased canonical name of the underlying
     * charset.
     *
     * @return the canonName
     */
    public String getCanonName() {
        return canonName;
    }

    /**
     * Returns whether this encoding was flagged as likely EBCDIC.
     *
     * @return the likelyEbcdic flag
     */
    public boolean isLikelyEbcdic() {
        return likelyEbcdic;
    }

    /**
     * Returns whether this encoding was flagged as obscure, i.e. its
     * cooked name is absent from the not-obscure list.
     *
     * @return the obscure flag
     */
    public boolean isObscure() {
        return obscure;
    }

    /**
     * Returns whether this encoding was flagged "should not".
     *
     * @return the shouldNot flag
     */
    public boolean isShouldNot() {
        return shouldNot;
    }

    /**
     * Returns whether the canonical name lacks the "x-" prefix.
     *
     * @return {@code false} if the canonical name starts with "x-"
     */
    public boolean isRegistered() {
        return !canonName.startsWith("x-");
    }

    /**
     * Delegates to the underlying charset.
     *
     * @return the result of {@link Charset#canEncode()}
     * @see java.nio.charset.Charset#canEncode()
     */
    public boolean canEncode() {
        return charset.canEncode();
    }

    /**
     * Delegates to the underlying charset.
     *
     * @return a new decoder from {@link Charset#newDecoder()}
     * @see java.nio.charset.Charset#newDecoder()
     */
    public CharsetDecoder newDecoder() {
        return charset.newDecoder();
    }

    /**
     * Delegates to the underlying charset.
     *
     * @return a new encoder from {@link Charset#newEncoder()}
     * @see java.nio.charset.Charset#newEncoder()
     */
    public CharsetEncoder newEncoder() {
        return charset.newEncoder();
    }

    /**
     * Returns the encoding configured as this encoding's actual HTML
     * encoding during class initialization.
     *
     * @return the actualHtmlEncoding, or {@code null} if none was
     *         configured
     */
    public Encoding getActualHtmlEncoding() {
        return actualHtmlEncoding;
    }

    /**
     * Prints every registered key with its encoding's canonical name and
     * flags to standard output.
     *
     * @param args
     *            ignored
     */
    public static void main(String[] args) {
        for (Map.Entry<String, Encoding> entry : encodingByCookedName.entrySet()) {
            String name = entry.getKey();
            Encoding enc = entry.getValue();
            System.out.printf(
                    "%21s: canon %21s, obs %5s, reg %5s, asc %5s, ebc %5s\n",
                    name, enc.getCanonName(), enc.isObscure(),
                    enc.isRegistered(), enc.isAsciiSuperset(),
                    enc.isLikelyEbcdic());
        }
    }

}