001    /*
002     * Copyright (c) 2006 Henri Sivonen
003     * Copyright (c) 2008 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.htmlparser.io;
025    
026    import java.io.ByteArrayInputStream;
027    import java.io.IOException;
028    import java.io.InputStreamReader;
029    import java.io.Reader;
030    import java.nio.charset.Charset;
031    import java.nio.charset.CharsetDecoder;
032    import java.nio.charset.CharsetEncoder;
033    import java.nio.charset.CoderMalfunctionError;
034    import java.nio.charset.CodingErrorAction;
035    import java.nio.charset.UnsupportedCharsetException;
036    import java.util.Arrays;
037    import java.util.HashMap;
038    import java.util.HashSet;
039    import java.util.Map;
040    import java.util.Set;
041    import java.util.SortedMap;
042    
/**
 * Describes one character encoding offered by the running JVM together with
 * the classification flags this class computes for it.
 * <p>
 * All instances are created once, in the static initializer, from
 * {@link Charset#availableCharsets()}. They are looked up with
 * {@link #forName(String)}, which matches names after normalizing them with
 * {@link #toNameKey(String)} (ASCII lower-casing and removal of ASCII
 * whitespace and punctuation).
 */
public class Encoding {

    public static final Encoding UTF8;

    public static final Encoding UTF16;

    public static final Encoding UTF16LE;

    public static final Encoding UTF16BE;

    public static final Encoding WINDOWS1252;

    // The three name lists below hold *cooked* names (see toNameKey). They are
    // kept as hash sets so that membership tests do not depend on the order in
    // which the literals happen to be written.

    /** Cooked names for which {@link #isShouldNot()} reports {@code true}. */
    private static final Set<String> SHOULD_NOT = new HashSet<String>(
            Arrays.asList("jisx02121990", "xjis0208"));

    /** Cooked names of charsets that are never registered by this class. */
    private static final Set<String> BANNED = new HashSet<String>(
            Arrays.asList("bocu1", "cesu8", "compoundtext", "iscii91",
                    "macarabic", "maccentraleurroman", "maccroatian",
                    "maccyrillic", "macdevanagari", "macfarsi", "macgreek",
                    "macgujarati", "macgurmukhi", "machebrew", "macicelandic",
                    "macroman", "macromanian", "macthai", "macturkish",
                    "macukranian", "scsu", "utf32", "utf32be", "utf32le",
                    "utf7", "ximapmailboxname", "xjisautodetect",
                    "xutf16bebom", "xutf16lebom", "xutf32bebom",
                    "xutf32lebom", "xutf16oppositeendian",
                    "xutf16platformendian", "xutf32oppositeendian",
                    "xutf32platformendian"));

    /** Cooked names for which {@link #isObscure()} reports {@code false}. */
    private static final Set<String> NOT_OBSCURE = new HashSet<String>(
            Arrays.asList("big5", "big5hkscs", "eucjp", "euckr", "gb18030",
                    "gbk", "iso2022jp", "iso2022kr", "iso88591", "iso885913",
                    "iso885915", "iso88592", "iso88593", "iso88594",
                    "iso88595", "iso88596", "iso88597", "iso88598",
                    "iso88599", "koi8r", "shiftjis", "tis620", "usascii",
                    "utf16", "utf16be", "utf16le", "utf8", "windows1250",
                    "windows1251", "windows1252", "windows1253",
                    "windows1254", "windows1255", "windows1256",
                    "windows1257", "windows1258"));

    /** Registry of encodings, keyed by cooked name (see {@link #toNameKey}). */
    private static final Map<String, Encoding> encodingByCookedName = new HashMap<String, Encoding>();

    private final String canonName;

    private final Charset charset;

    private final boolean asciiSuperset;

    private final boolean obscure;

    private final boolean shouldNot;

    private final boolean likelyEbcdic;

    // Only assigned from the static initializer; stays null for most encodings.
    private Encoding actualHtmlEncoding = null;

    static {
        // Probe input for asciiMapsToBasicLatin: byte i carries the value i
        // when i is one of the "sensitive" code points, otherwise a space.
        byte[] testBuf = new byte[0x7F];
        for (int i = 0; i < testBuf.length; i++) {
            testBuf[i] = (byte) (isAsciiSupersetnessSensitive(i) ? i : 0x20);
        }

        Set<Encoding> encodings = new HashSet<Encoding>();

        SortedMap<String, Charset> charsets = Charset.availableCharsets();
        for (Map.Entry<String, Charset> entry : charsets.entrySet()) {
            Charset cs = entry.getValue();
            String name = toNameKey(cs.name());
            String canonName = toAsciiLowerCase(cs.name());
            if (isBanned(name)) {
                continue;
            }
            name = name.intern();
            boolean asciiSuperset = asciiMapsToBasicLatin(testBuf, cs);
            Encoding enc = new Encoding(canonName.intern(), cs,
                    asciiSuperset, isObscure(name), isShouldNot(name),
                    isLikelyEbcdic(name, asciiSuperset));
            encodings.add(enc);
            for (String alias : cs.aliases()) {
                encodingByCookedName.put(toNameKey(alias).intern(), enc);
            }
        }
        // Overwrite possible overlapping aliases with the real things--just in
        // case
        for (Encoding encoding : encodings) {
            encodingByCookedName.put(toNameKey(encoding.getCanonName()),
                    encoding);
        }

        // These lookups are not guarded: if the JVM lacks one of these
        // charsets, class initialization fails with
        // UnsupportedCharsetException.
        UTF8 = forName("utf-8");
        UTF16 = forName("utf-16");
        UTF16BE = forName("utf-16be");
        UTF16LE = forName("utf-16le");
        WINDOWS1252 = forName("windows-1252");

        setActualHtmlEncoding("iso-8859-1", "windows-1252");
        setActualHtmlEncoding("iso-8859-9", "windows-1254");
        setActualHtmlEncoding("iso-8859-11", "windows-874");
        setActualHtmlEncoding("x-iso-8859-11", "windows-874");
        setActualHtmlEncoding("tis-620", "windows-874");
        setActualHtmlEncoding("gb_2312-80", "gbk");
        setActualHtmlEncoding("gb2312", "gbk");

        registerAlias("x-x-big5", "big5");
        registerAlias("euc-kr", "windows-949");
        registerAlias("ks_c_5601-1987", "windows-949");
    }

    /**
     * Records <code>actualName</code>'s encoding as the actual HTML encoding
     * of <code>name</code>'s encoding. Does nothing when either name is not
     * registered.
     * 
     * @param name
     *            name of the encoding to annotate
     * @param actualName
     *            name of the encoding to store in it
     */
    private static void setActualHtmlEncoding(String name, String actualName) {
        try {
            forName(name).actualHtmlEncoding = forName(actualName);
        } catch (UnsupportedCharsetException ignored) {
            // Best effort: which charsets exist depends on the JVM.
        }
    }

    /**
     * Makes <code>alias</code> resolve to the encoding currently registered
     * under <code>targetName</code>, replacing any entry the alias already
     * had. Does nothing when the target is not registered.
     * <p>
     * The alias is stored under its cooked form because
     * {@link #forName(String)} only ever looks up cooked keys; a raw key
     * containing '-' or '_' could never be found.
     * 
     * @param alias
     *            the additional name to register
     * @param targetName
     *            name of the encoding the alias should resolve to
     */
    private static void registerAlias(String alias, String targetName) {
        try {
            encodingByCookedName.put(toNameKey(alias), forName(targetName));
        } catch (UnsupportedCharsetException ignored) {
            // Best effort: which charsets exist depends on the JVM.
        }
    }

    /**
     * Tells whether <code>c</code> is one of the code points that
     * {@link #asciiMapsToBasicLatin(byte[], Charset)} requires to decode to
     * itself: U+0009..U+000D, U+0020..U+0022, U+0026..U+0027,
     * U+002C..U+003F, A-Z and a-z.
     */
    private static boolean isAsciiSupersetnessSensitive(int c) {
        return (c >= 0x09 && c <= 0x0D) || (c >= 0x20 && c <= 0x22)
                || (c >= 0x26 && c <= 0x27) || (c >= 0x2C && c <= 0x3F)
                || (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A);
    }

    /**
     * @param cookedName
     *            a name already normalized with {@link #toNameKey(String)}
     * @return <code>true</code> unless the name is on the not-obscure list
     */
    private static boolean isObscure(String cookedName) {
        return !NOT_OBSCURE.contains(cookedName);
    }

    /**
     * @param cookedName
     *            a name already normalized with {@link #toNameKey(String)}
     * @return <code>true</code> if the name starts with "xibm" or is on the
     *         banned list
     */
    private static boolean isBanned(String cookedName) {
        return cookedName.startsWith("xibm") || BANNED.contains(cookedName);
    }

    /**
     * @param cookedName
     *            a name already normalized with {@link #toNameKey(String)}
     * @return <code>true</code> if the name is on the should-not list
     */
    private static boolean isShouldNot(String cookedName) {
        return SHOULD_NOT.contains(cookedName);
    }

    /**
     * Decodes the probe buffer with a strict decoder and checks that every
     * byte comes back as the same code point.
     * 
     * @param testBuf
     *            the 0x7F-byte probe buffer built by the static initializer
     * @param cs
     *            the charset to probe
     * @return <code>true</code> if all 0x7F bytes decoded to the expected
     *         characters; <code>false</code> on any mismatch or failure
     */
    private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
        CharsetDecoder dec = cs.newDecoder();
        dec.onMalformedInput(CodingErrorAction.REPORT);
        dec.onUnmappableCharacter(CodingErrorAction.REPORT);
        // Backed by an in-memory array, so the reader is not closed here.
        Reader r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec);
        try {
            for (int i = 0; i < 0x7F; i++) {
                int expected = isAsciiSupersetnessSensitive(i) ? i : 0x20;
                if (r.read() != expected) {
                    return false;
                }
            }
        } catch (Exception e) {
            // Any failure while decoding (IOException included) counts as
            // "not an ASCII superset".
            return false;
        } catch (CoderMalfunctionError e) {
            // An Error, hence not covered by the clause above.
            return false;
        }

        return true;
    }

    /**
     * @param canonName
     *            the name to inspect (the static initializer passes the
     *            cooked name)
     * @param asciiSuperset
     *            result of the ASCII probe for the same charset
     * @return <code>true</code> if the charset failed the ASCII probe and
     *         its name starts with "cp", "ibm" or "xibm"
     */
    private static boolean isLikelyEbcdic(String canonName,
            boolean asciiSuperset) {
        if (asciiSuperset) {
            return false;
        }
        return canonName.startsWith("cp") || canonName.startsWith("ibm")
                || canonName.startsWith("xibm");
    }

    /**
     * Looks up an encoding by name. The name is normalized with
     * {@link #toNameKey(String)} first, so case, ASCII whitespace and ASCII
     * punctuation are ignored.
     * 
     * @param name
     *            the name to look up
     * @return the registered encoding; never <code>null</code>
     * @throws UnsupportedCharsetException
     *             if nothing is registered under the name
     */
    public static Encoding forName(String name) {
        Encoding rv = encodingByCookedName.get(toNameKey(name));
        if (rv == null) {
            throw new UnsupportedCharsetException(name);
        }
        return rv;
    }

    /**
     * Produces the lookup key ("cooked name") for an encoding name: A-Z is
     * lower-cased and every character in U+0009..U+000D, U+0020..U+002F,
     * U+003A..U+0040, U+005B..U+0060 or U+007B..U+007E is dropped.
     * 
     * @param str
     *            the name to normalize; may be <code>null</code>
     * @return the cooked name, or <code>null</code> if <code>str</code> is
     *         <code>null</code>
     */
    public static String toNameKey(String str) {
        if (str == null) {
            return null;
        }
        int j = 0;
        char[] buf = new char[str.length()];
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c >= 'A' && c <= 'Z') {
                c += 0x20;
            }
            boolean dropped = (c >= '\t' && c <= '\r')
                    || (c >= '\u0020' && c <= '\u002F')
                    || (c >= '\u003A' && c <= '\u0040')
                    || (c >= '\u005B' && c <= '\u0060')
                    || (c >= '\u007B' && c <= '\u007E');
            if (!dropped) {
                buf[j] = c;
                j++;
            }
        }
        return new String(buf, 0, j);
    }

    /**
     * Lower-cases A-Z only; every other character is copied unchanged.
     * 
     * @param str
     *            the string to convert; may be <code>null</code>
     * @return the converted string, or <code>null</code> if <code>str</code>
     *         is <code>null</code>
     */
    public static String toAsciiLowerCase(String str) {
        if (str == null) {
            return null;
        }
        char[] buf = new char[str.length()];
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c >= 'A' && c <= 'Z') {
                c += 0x20;
            }
            buf[i] = c;
        }
        return new String(buf);
    }

    /**
     * @param canonName
     *            the ASCII-lower-cased canonical charset name
     * @param charset
     *            the charset this object delegates to
     * @param asciiSuperset
     *            value returned by {@link #isAsciiSuperset()}
     * @param obscure
     *            value returned by {@link #isObscure()}
     * @param shouldNot
     *            value returned by {@link #isShouldNot()}
     * @param likelyEbcdic
     *            value returned by {@link #isLikelyEbcdic()}
     */
    private Encoding(final String canonName, final Charset charset,
            final boolean asciiSuperset, final boolean obscure,
            final boolean shouldNot, final boolean likelyEbcdic) {
        this.canonName = canonName;
        this.charset = charset;
        this.asciiSuperset = asciiSuperset;
        this.obscure = obscure;
        this.shouldNot = shouldNot;
        this.likelyEbcdic = likelyEbcdic;
    }

    /**
     * Returns the asciiSuperset.
     * 
     * @return <code>true</code> if the charset passed the ASCII probe run
     *         when this class was initialized
     */
    public boolean isAsciiSuperset() {
        return asciiSuperset;
    }

    /**
     * Returns the canonName.
     * 
     * @return the charset's canonical name with A-Z lower-cased
     */
    public String getCanonName() {
        return canonName;
    }

    /**
     * Returns the likelyEbcdic.
     * 
     * @return <code>true</code> if the charset failed the ASCII probe and
     *         its name starts with "cp", "ibm" or "xibm"
     */
    public boolean isLikelyEbcdic() {
        return likelyEbcdic;
    }

    /**
     * Returns the obscure.
     * 
     * @return <code>true</code> unless the charset is on this class's
     *         not-obscure list
     */
    public boolean isObscure() {
        return obscure;
    }

    /**
     * Returns the shouldNot.
     * 
     * @return <code>true</code> if the charset is on this class's
     *         should-not list
     */
    public boolean isShouldNot() {
        return shouldNot;
    }

    /**
     * @return <code>true</code> unless the canonical name starts with "x-"
     */
    public boolean isRegistered() {
        return !canonName.startsWith("x-");
    }

    /**
     * @return the result of calling <code>canEncode()</code> on the
     *         underlying charset
     * @see java.nio.charset.Charset#canEncode()
     */
    public boolean canEncode() {
        return charset.canEncode();
    }

    /**
     * @return a new decoder obtained from the underlying charset
     * @see java.nio.charset.Charset#newDecoder()
     */
    public CharsetDecoder newDecoder() {
        return charset.newDecoder();
    }

    /**
     * @return a new encoder obtained from the underlying charset
     * @see java.nio.charset.Charset#newEncoder()
     */
    public CharsetEncoder newEncoder() {
        return charset.newEncoder();
    }

    /**
     * Returns the actualHtmlEncoding.
     * 
     * @return the encoding recorded for this one by the static initializer
     *         (e.g. windows-1252 for iso-8859-1), or <code>null</code> if
     *         none was recorded
     */
    public Encoding getActualHtmlEncoding() {
        return actualHtmlEncoding;
    }

    /**
     * Prints one line per registered cooked name with the flags of the
     * encoding it resolves to.
     * 
     * @param args
     *            ignored
     */
    public static void main(String[] args) {
        for (Map.Entry<String, Encoding> entry : encodingByCookedName.entrySet()) {
            String name = entry.getKey();
            Encoding enc = entry.getValue();
            System.out.printf(
                    "%21s: canon %21s, obs %5s, reg %5s, asc %5s, ebc %5s\n",
                    name, enc.getCanonName(), enc.isObscure(),
                    enc.isRegistered(), enc.isAsciiSuperset(),
                    enc.isLikelyEbcdic());
        }
    }

}