001    /*
002     * Copyright (c) 2006 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package nu.validator.htmlparser.impl;
024    
025    import java.io.ByteArrayInputStream;
026    import java.io.IOException;
027    import java.io.InputStreamReader;
028    import java.io.Reader;
029    import java.nio.charset.Charset;
030    import java.nio.charset.CharsetDecoder;
031    import java.nio.charset.CodingErrorAction;
032    import java.util.Arrays;
033    import java.util.Iterator;
034    import java.util.Map;
035    import java.util.SortedMap;
036    import java.util.SortedSet;
037    import java.util.TreeSet;
038    
/**
 * Classifies the character encodings available in the running JVM.
 *
 * <p>When the class is initialized, every charset returned by
 * {@link Charset#availableCharsets()} is probed: the bytes 0x20 through 0x7F
 * followed by LF, CR and TAB are decoded, and the charset is recorded as an
 * "ASCII superset" only if each byte decodes to the code point of the same
 * numeric value. All other charsets are recorded as "not ASCII superset".
 *
 * <p>All lookups are exact, case-sensitive matches on the name passed in.
 */
public class EncodingInfo {

    /**
     * Encoding names that {@link #isObscure(String)} does not report as
     * obscure. The array is searched with binary search, so it must be in
     * natural {@code String} order; the static initializer sorts it to guard
     * against an out-of-order entry being added later.
     */
    private static final String[] NOT_OBSCURE = {"Big5",
        "Big5-HKSCS",
        "EUC-JP",
        "EUC-KR",
        "GB18030",
        "GBK",
        "ISO-2022-JP",
        "ISO-2022-KR",
        "ISO-8859-1",
        "ISO-8859-13",
        "ISO-8859-15",
        "ISO-8859-2",
        "ISO-8859-3",
        "ISO-8859-4",
        "ISO-8859-5",
        "ISO-8859-6",
        "ISO-8859-7",
        "ISO-8859-8",
        "ISO-8859-9",
        "KOI8-R",
        "Shift_JIS",
        "TIS-620",
        "US-ASCII",
        "UTF-16",
        "UTF-16BE",
        "UTF-16LE",
        "UTF-8",
        "windows-1250",
        "windows-1251",
        "windows-1252",
        "windows-1253",
        "windows-1254",
        "windows-1255",
        "windows-1256",
        "windows-1257",
        "windows-1258"};

    /** Sorted canonical names of charsets that passed the ASCII probe. */
    private static final String[] ASCII_SUPERSET;

    /** Sorted canonical names of charsets that failed the ASCII probe. */
    private static final String[] NOT_ASCII_SUPERSET;

    static {
        // binarySearch gives undefined results on an unsorted array.
        Arrays.sort(NOT_OBSCURE);

        // Probe bytes: 0x20..0x7F followed by LF, CR and TAB.
        byte[] testBuf = new byte[0x63];
        for (int i = 0; i < 0x60; i++) {
            testBuf[i] = (byte) (i + 0x20);
        }
        testBuf[0x60] = (byte) '\n';
        testBuf[0x61] = (byte) '\r';
        testBuf[0x62] = (byte) '\t';

        // TreeSet keeps the names in natural order, which is the order
        // Arrays.binarySearch expects in the lookup methods below.
        SortedSet<String> asciiSupersetSet = new TreeSet<String>();
        SortedSet<String> notAsciiSupersetSet = new TreeSet<String>();

        for (Charset cs : Charset.availableCharsets().values()) {
            if (asciiMapsToBasicLatin(testBuf, cs)) {
                asciiSupersetSet.add(cs.name());
            } else {
                notAsciiSupersetSet.add(cs.name());
            }
        }

        ASCII_SUPERSET = asciiSupersetSet.toArray(new String[0]);
        NOT_ASCII_SUPERSET = notAsciiSupersetSet.toArray(new String[0]);
    }

    /**
     * Tells whether the named charset passed the ASCII probe.
     *
     * @param preferredIanaName the charset name to look up (exact match
     *        against {@link Charset#name()}); may be {@code null}
     * @return {@code true} if the name belongs to an available charset that
     *         decoded every probe byte to the same-valued code point;
     *         {@code false} otherwise, including for {@code null}
     */
    public static boolean isAsciiSuperset(String preferredIanaName) {
        return contains(ASCII_SUPERSET, preferredIanaName);
    }

    /**
     * Tells whether the named charset failed the ASCII probe.
     *
     * <p>This is not the negation of {@link #isAsciiSuperset(String)}: a name
     * that matches no available charset yields {@code false} from both.
     *
     * @param preferredIanaName the charset name to look up (exact match
     *        against {@link Charset#name()}); may be {@code null}
     * @return {@code true} if the name belongs to an available charset that
     *         did not decode the probe bytes to the same-valued code points;
     *         {@code false} otherwise, including for {@code null}
     */
    public static boolean isNotAsciiSuperset(String preferredIanaName) {
        return contains(NOT_ASCII_SUPERSET, preferredIanaName);
    }

    /**
     * Tells whether the given name is absent from the built-in list of
     * non-obscure encoding names.
     *
     * @param preferredIanaName the encoding name to look up (exact,
     *        case-sensitive match); may be {@code null}
     * @return {@code false} if the name is on the list; {@code true}
     *         otherwise, including for {@code null}
     */
    public static boolean isObscure(String preferredIanaName) {
        return !contains(NOT_OBSCURE, preferredIanaName);
    }

    /**
     * Null-safe membership test on a sorted array of names.
     *
     * @param sortedNames names in natural {@code String} order
     * @param name the name to find; {@code null} is never found
     * @return {@code true} if {@code name} is an element of the array
     */
    private static boolean contains(String[] sortedNames, String name) {
        return name != null && Arrays.binarySearch(sortedNames, name) >= 0;
    }

    /**
     * Decodes {@code testBuf} with the given charset and checks that every
     * byte comes out as the character with the same numeric value.
     *
     * @param testBuf the probe bytes; each is expected to decode to exactly
     *        one character equal to the byte's unsigned value
     * @param cs the charset to probe
     * @return {@code true} if all bytes decoded as expected; {@code false}
     *         on the first mismatch, on premature end of input, or if
     *         creating or running the decoder throws
     */
    private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
        Reader r = null;
        try {
            // Decoder creation is inside the try so that a charset that
            // cannot supply a decoder is classified as a failure instead of
            // aborting class initialization.
            CharsetDecoder dec = cs.newDecoder();
            dec.onMalformedInput(CodingErrorAction.REPORT);
            dec.onUnmappableCharacter(CodingErrorAction.REPORT);
            r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec);
            for (int i = 0; i < testBuf.length; i++) {
                // read() returns -1 at end of input, which never matches.
                if ((testBuf[i] & 0xFF) != r.read()) {
                    return false;
                }
            }
            return true;
        } catch (Exception e) {
            // Deliberately broad: any failure while probing simply means the
            // charset is not treated as an ASCII superset.
            return false;
        } finally {
            if (r != null) {
                try {
                    r.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the probe fails.
                }
            }
        }
    }

    /**
     * Prints the two classification lists to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("ASCII maps to Basic Latin:");
        for (String name : ASCII_SUPERSET) {
            System.out.println(name);
        }
        System.out.println();
        System.out.println("ASCII does not map to Basic Latin:");
        for (String name : NOT_ASCII_SUPERSET) {
            System.out.println(name);
        }
    }
}