001 /* 002 * Copyright (c) 2006 Henri Sivonen 003 * Copyright (c) 2008 Mozilla Foundation 004 * 005 * Permission is hereby granted, free of charge, to any person obtaining a 006 * copy of this software and associated documentation files (the "Software"), 007 * to deal in the Software without restriction, including without limitation 008 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 009 * and/or sell copies of the Software, and to permit persons to whom the 010 * Software is furnished to do so, subject to the following conditions: 011 * 012 * The above copyright notice and this permission notice shall be included in 013 * all copies or substantial portions of the Software. 014 * 015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 021 * DEALINGS IN THE SOFTWARE. 022 */ 023 024 package nu.validator.htmlparser.io; 025 026 import java.io.ByteArrayInputStream; 027 import java.io.IOException; 028 import java.io.InputStreamReader; 029 import java.io.Reader; 030 import java.nio.charset.Charset; 031 import java.nio.charset.CharsetDecoder; 032 import java.nio.charset.CharsetEncoder; 033 import java.nio.charset.CoderMalfunctionError; 034 import java.nio.charset.CodingErrorAction; 035 import java.nio.charset.UnsupportedCharsetException; 036 import java.util.Arrays; 037 import java.util.HashMap; 038 import java.util.HashSet; 039 import java.util.Map; 040 import java.util.Set; 041 import java.util.SortedMap; 042 043 public class Encoding { 044 045 public static final Encoding UTF8; 046 047 public static final Encoding UTF16; 048 049 public static final Encoding UTF16LE; 050 051 public static final Encoding UTF16BE; 052 053 public static final Encoding WINDOWS1252; 054 055 private static String[] SHOULD_NOT = { "jisx02121990", "xjis0208" }; 056 057 private static String[] BANNED = { "bocu1", "cesu8", "compoundtext", 058 "iscii91", "macarabic", "maccentraleurroman", "maccroatian", 059 "maccyrillic", "macdevanagari", "macfarsi", "macgreek", 060 "macgujarati", "macgurmukhi", "machebrew", "macicelandic", 061 "macroman", "macromanian", "macthai", "macturkish", "macukranian", 062 "scsu", "utf32", "utf32be", "utf32le", "utf7", "ximapmailboxname", 063 "xjisautodetect", "xutf16bebom", "xutf16lebom", "xutf32bebom", 064 "xutf32lebom", "xutf16oppositeendian", "xutf16platformendian", 065 "xutf32oppositeendian", "xutf32platformendian" }; 066 067 private static String[] NOT_OBSCURE = { "big5", "big5hkscs", "eucjp", 068 "euckr", "gb18030", "gbk", "iso2022jp", "iso2022kr", "iso88591", 069 "iso885913", "iso885915", "iso88592", "iso88593", "iso88594", 070 "iso88595", "iso88596", "iso88597", "iso88598", "iso88599", 071 "koi8r", "shiftjis", "tis620", "usascii", "utf16", "utf16be", 072 "utf16le", "utf8", "windows1250", "windows1251", "windows1252", 073 "windows1253", "windows1254", "windows1255", "windows1256", 074 "windows1257", "windows1258" }; 075 076 private static Map<String, Encoding> encodingByCookedName = new HashMap<String, Encoding>(); 077 078 private final String canonName; 079 080 private final Charset charset; 081 082 private final boolean asciiSuperset; 083 084 private final boolean obscure; 085 086 private final boolean shouldNot; 087 088 private final boolean likelyEbcdic; 089 090 private Encoding actualHtmlEncoding = null; 091 092 static { 093 byte[] testBuf = new byte[0x7F]; 094 for (int i = 0; i < 0x7F; i++) { 095 if (isAsciiSupersetnessSensitive(i)) { 096 testBuf[i] = (byte) i; 097 } else { 098 testBuf[i] = (byte) 0x20; 099 } 100 } 101 102 Set<Encoding> encodings = new HashSet<Encoding>(); 103 104 SortedMap<String, Charset> charsets = Charset.availableCharsets(); 105 for (Map.Entry<String, Charset> entry : charsets.entrySet()) { 106 Charset cs = entry.getValue(); 107 String name = toNameKey(cs.name()); 108 String canonName = toAsciiLowerCase(cs.name()); 109 if (!isBanned(name)) { 110 name = name.intern(); 111 boolean asciiSuperset = asciiMapsToBasicLatin(testBuf, cs); 112 Encoding enc = new Encoding(canonName.intern(), cs, 113 asciiSuperset, isObscure(name), isShouldNot(name), 114 isLikelyEbcdic(name, asciiSuperset)); 115 encodings.add(enc); 116 Set<String> aliases = cs.aliases(); 117 for (String alias : aliases) { 118 encodingByCookedName.put(toNameKey(alias).intern(), enc); 119 } 120 } 121 } 122 // Overwrite possible overlapping aliases with the real things--just in 123 // case 124 for (Encoding encoding : encodings) { 125 encodingByCookedName.put(toNameKey(encoding.getCanonName()), 126 encoding); 127 } 128 UTF8 = forName("utf-8"); 129 UTF16 = forName("utf-16"); 130 UTF16BE = forName("utf-16be"); 131 UTF16LE = forName("utf-16le"); 132 WINDOWS1252 = forName("windows-1252"); 133 try { 134 forName("iso-8859-1").actualHtmlEncoding = forName("windows-1252"); 135 } catch (UnsupportedCharsetException e) { 136 } 137 try { 138 forName("iso-8859-9").actualHtmlEncoding = forName("windows-1254"); 139 } catch (UnsupportedCharsetException e) { 140 } 141 try { 142 forName("iso-8859-11").actualHtmlEncoding = forName("windows-874"); 143 } catch (UnsupportedCharsetException e) { 144 } 145 try { 146 forName("x-iso-8859-11").actualHtmlEncoding = forName("windows-874"); 147 } catch (UnsupportedCharsetException e) { 148 } 149 try { 150 forName("tis-620").actualHtmlEncoding = forName("windows-874"); 151 } catch (UnsupportedCharsetException e) { 152 } 153 try { 154 forName("gb_2312-80").actualHtmlEncoding = forName("gbk"); 155 } catch (UnsupportedCharsetException e) { 156 } 157 try { 158 forName("gb2312").actualHtmlEncoding = forName("gbk"); 159 } catch (UnsupportedCharsetException e) { 160 } 161 try { 162 encodingByCookedName.put("x-x-big5", forName("big5")); 163 } catch (UnsupportedCharsetException e) { 164 } 165 try { 166 encodingByCookedName.put("euc-kr", forName("windows-949")); 167 } catch (UnsupportedCharsetException e) { 168 } 169 try { 170 encodingByCookedName.put("ks_c_5601-1987", forName("windows-949")); 171 } catch (UnsupportedCharsetException e) { 172 } 173 } 174 175 private static boolean isAsciiSupersetnessSensitive(int c) { 176 return (c >= 0x09 && c <= 0x0D) || (c >= 0x20 && c <= 0x22) 177 || (c >= 0x26 && c <= 0x27) || (c >= 0x2C && c <= 0x3F) 178 || (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A); 179 } 180 181 private static boolean isObscure(String lowerCasePreferredIanaName) { 182 return !(Arrays.binarySearch(NOT_OBSCURE, lowerCasePreferredIanaName) > -1); 183 } 184 185 private static boolean isBanned(String lowerCasePreferredIanaName) { 186 if (lowerCasePreferredIanaName.startsWith("xibm")) { 187 return true; 188 } 189 return (Arrays.binarySearch(BANNED, lowerCasePreferredIanaName) > -1); 190 } 191 192 private static boolean isShouldNot(String lowerCasePreferredIanaName) { 193 return (Arrays.binarySearch(SHOULD_NOT, lowerCasePreferredIanaName) > -1); 194 } 195 196 /** 197 * @param testBuf 198 * @param cs 199 */ 200 private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) { 201 CharsetDecoder dec = cs.newDecoder(); 202 dec.onMalformedInput(CodingErrorAction.REPORT); 203 dec.onUnmappableCharacter(CodingErrorAction.REPORT); 204 Reader r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec); 205 try { 206 for (int i = 0; i < 0x7F; i++) { 207 if (isAsciiSupersetnessSensitive(i)) { 208 if (r.read() != i) { 209 return false; 210 } 211 } else { 212 if (r.read() != 0x20) { 213 return false; 214 } 215 } 216 } 217 } catch (IOException e) { 218 return false; 219 } catch (Exception e) { 220 return false; 221 } catch (CoderMalfunctionError e) { 222 return false; 223 } 224 225 return true; 226 } 227 228 private static boolean isLikelyEbcdic(String canonName, 229 boolean asciiSuperset) { 230 if (!asciiSuperset) { 231 return (canonName.startsWith("cp") || canonName.startsWith("ibm") || canonName.startsWith("xibm")); 232 } else { 233 return false; 234 } 235 } 236 237 public static Encoding forName(String name) { 238 Encoding rv = encodingByCookedName.get(toNameKey(name)); 239 if (rv == null) { 240 throw new UnsupportedCharsetException(name); 241 } else { 242 return rv; 243 } 244 } 245 246 public static String toNameKey(String str) { 247 if (str == null) { 248 return null; 249 } 250 int j = 0; 251 char[] buf = new char[str.length()]; 252 for (int i = 0; i < str.length(); i++) { 253 char c = str.charAt(i); 254 if (c >= 'A' && c <= 'Z') { 255 c += 0x20; 256 } 257 if (!((c >= '\t' && c <= '\r') || (c >= '\u0020' && c <= '\u002F') 258 || (c >= '\u003A' && c <= '\u0040') 259 || (c >= '\u005B' && c <= '\u0060') || (c >= '\u007B' && c <= '\u007E'))) { 260 buf[j] = c; 261 j++; 262 } 263 } 264 return new String(buf, 0, j); 265 } 266 267 public static String toAsciiLowerCase(String str) { 268 if (str == null) { 269 return null; 270 } 271 char[] buf = new char[str.length()]; 272 for (int i = 0; i < str.length(); i++) { 273 char c = str.charAt(i); 274 if (c >= 'A' && c <= 'Z') { 275 c += 0x20; 276 } 277 buf[i] = c; 278 } 279 return new String(buf); 280 } 281 282 /** 283 * @param canonName 284 * @param charset 285 * @param asciiSuperset 286 * @param obscure 287 * @param shouldNot 288 * @param likelyEbcdic 289 */ 290 private Encoding(final String canonName, final Charset charset, 291 final boolean asciiSuperset, final boolean obscure, 292 final boolean shouldNot, final boolean likelyEbcdic) { 293 this.canonName = canonName; 294 this.charset = charset; 295 this.asciiSuperset = asciiSuperset; 296 this.obscure = obscure; 297 this.shouldNot = shouldNot; 298 this.likelyEbcdic = likelyEbcdic; 299 } 300 301 /** 302 * Returns the asciiSuperset. 303 * 304 * @return the asciiSuperset 305 */ 306 public boolean isAsciiSuperset() { 307 return asciiSuperset; 308 } 309 310 /** 311 * Returns the canonName. 312 * 313 * @return the canonName 314 */ 315 public String getCanonName() { 316 return canonName; 317 } 318 319 /** 320 * Returns the likelyEbcdic. 321 * 322 * @return the likelyEbcdic 323 */ 324 public boolean isLikelyEbcdic() { 325 return likelyEbcdic; 326 } 327 328 /** 329 * Returns the obscure. 330 * 331 * @return the obscure 332 */ 333 public boolean isObscure() { 334 return obscure; 335 } 336 337 /** 338 * Returns the shouldNot. 339 * 340 * @return the shouldNot 341 */ 342 public boolean isShouldNot() { 343 return shouldNot; 344 } 345 346 public boolean isRegistered() { 347 return !canonName.startsWith("x-"); 348 } 349 350 /** 351 * @return 352 * @see java.nio.charset.Charset#canEncode() 353 */ 354 public boolean canEncode() { 355 return charset.canEncode(); 356 } 357 358 /** 359 * @return 360 * @see java.nio.charset.Charset#newDecoder() 361 */ 362 public CharsetDecoder newDecoder() { 363 return charset.newDecoder(); 364 } 365 366 /** 367 * @return 368 * @see java.nio.charset.Charset#newEncoder() 369 */ 370 public CharsetEncoder newEncoder() { 371 return charset.newEncoder(); 372 } 373 374 /** 375 * Returns the actualHtmlEncoding. 376 * 377 * @return the actualHtmlEncoding 378 */ 379 public Encoding getActualHtmlEncoding() { 380 return actualHtmlEncoding; 381 } 382 383 public static void main(String[] args) { 384 for (Map.Entry<String, Encoding> entry : encodingByCookedName.entrySet()) { 385 String name = entry.getKey(); 386 Encoding enc = entry.getValue(); 387 System.out.printf( 388 "%21s: canon %21s, obs %5s, reg %5s, asc %5s, ebc %5s\n", 389 name, enc.getCanonName(), enc.isObscure(), 390 enc.isRegistered(), enc.isAsciiSuperset(), 391 enc.isLikelyEbcdic()); 392 } 393 } 394 395 }