001    /*
002     * Copyright (c) 2006 Henri Sivonen
003     * Copyright (c) 2007 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package org.whattf.datatype.data;
025    
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Pattern;
038    
/**
 * Parsed contents of a language subtag registry in the record-jar text
 * format: records separated by <code>%%</code> lines, each consisting of
 * <code>Name: value</code> fields.
 * <p>
 * Every line is lower-cased before it is interpreted, so all tags and
 * subtags exposed by this class are lower case. All exposed strings are
 * interned. The arrays returned by the getters are sorted in natural
 * <code>String</code> order and are the internal arrays of this object (no
 * copies are made), so callers must not modify them.
 */
public class LanguageData {

    // NOTE(review): plain-HTTP location kept exactly as before; confirm that
    // the registry is still served from here without a redirect to HTTPS.
    private static final String REGISTRY_URL = "http://www.iana.org/assignments/language-subtag-registry";

    private static final String RECORD_SEPARATOR = "%%";

    private static final Pattern HYPHEN = Pattern.compile("-");

    private static final String[][] EMPTY_DOUBLE_STRING_ARRAY = {};

    private static final String[] EMPTY_STRING_ARRAY = {};

    private static final String PREFIX = "prefix: ";

    private static final String SUPPRESS_SCRIPT = "suppress-script: ";

    private static final String SUBTAG = "subtag: ";

    private static final String TAG = "tag: ";

    private static final String TYPE = "type: ";

    private static final String DEPRECATED = "deprecated: ";

    private final SortedSet<String> languageSet = new TreeSet<String>();

    private final SortedSet<String> scriptSet = new TreeSet<String>();

    private final SortedSet<String> regionSet = new TreeSet<String>();

    private final SortedSet<String> variantSet = new TreeSet<String>();

    private final SortedSet<String> grandfatheredSet = new TreeSet<String>();

    private final SortedSet<String> deprecatedSet = new TreeSet<String>();

    private final Map<String, String> suppressedScriptByLanguageMap = new HashMap<String, String>();

    private final Map<String, Set<String[]>> prefixesByVariantMap = new HashMap<String, Set<String[]>>();

    private String[] languages = null;

    private String[] scripts = null;

    private String[] regions = null;

    private String[] variants = null;

    private String[] grandfathered = null;

    private String[] deprecated = null;

    private int[] suppressedScriptByLanguage = null;

    private String[][][] prefixesByVariant = null;

    /**
     * Downloads the registry from the IANA web site and parses it.
     *
     * @throws IOException
     *             if the registry cannot be fetched or read, or if a
     *             language record names a suppressed script that has no
     *             script record
     */
    public LanguageData() throws IOException {
        this(openRegistry());
    }

    /**
     * Parses a registry from the given reader. The reader is read to its
     * end and is always closed before this constructor returns, whether
     * parsing succeeds or fails.
     *
     * @param registry
     *            source of the registry text; must not be <code>null</code>
     * @throws IOException
     *             if reading fails, or if a language record names a
     *             suppressed script that has no script record
     */
    public LanguageData(Reader registry) throws IOException {
        super();
        if (registry == null) {
            throw new NullPointerException("registry");
        }
        BufferedReader in = (registry instanceof BufferedReader)
                ? (BufferedReader) registry
                : new BufferedReader(registry);
        try {
            consumeRegistry(in);
        } finally {
            // Close on failure as well so that the underlying (network)
            // stream is not leaked when a read throws.
            in.close();
        }
        prepareArrays();
    }

    /**
     * Opens the registry at {@link #REGISTRY_URL} as UTF-8 text.
     */
    private static Reader openRegistry() throws IOException {
        URL url = new URL(REGISTRY_URL);
        return new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8"));
    }

    /**
     * Reads records until the input is exhausted.
     */
    private void consumeRegistry(BufferedReader in) throws IOException {
        while (consumeRecord(in)) {
            // keep going; consumeRecord does all the work
        }
    }

    /**
     * Freezes the collected sets and maps into the sorted arrays handed out
     * by the getters.
     */
    private void prepareArrays() throws IOException {
        scripts = scriptSet.toArray(EMPTY_STRING_ARRAY);
        regions = regionSet.toArray(EMPTY_STRING_ARRAY);
        grandfathered = grandfatheredSet.toArray(EMPTY_STRING_ARRAY);
        deprecated = deprecatedSet.toArray(EMPTY_STRING_ARRAY);

        int i = 0;
        languages = new String[languageSet.size()];
        suppressedScriptByLanguage = new int[languageSet.size()];
        for (String language : languageSet) {
            languages[i] = language;
            String suppressed = suppressedScriptByLanguageMap.get(language);
            if (suppressed == null) {
                suppressedScriptByLanguage[i] = -1;
            } else {
                // scripts comes from a TreeSet and is therefore sorted, which
                // is what binarySearch requires.
                int index = Arrays.binarySearch(scripts, suppressed);
                if (index < 0) {
                    throw new IOException("Malformed registry: reference to non-existent script.");
                }
                suppressedScriptByLanguage[i] = index;
            }
            i++;
        }

        i = 0;
        variants = new String[variantSet.size()];
        prefixesByVariant = new String[variantSet.size()][][];
        for (String variant : variantSet) {
            variants[i] = variant;
            Set<String[]> prefixes = prefixesByVariantMap.get(variant);
            if (prefixes != null) {
                prefixesByVariant[i] = prefixes.toArray(EMPTY_DOUBLE_STRING_ARRAY);
            } else {
                prefixesByVariant[i] = EMPTY_DOUBLE_STRING_ARRAY;
            }
            i++;
        }
    }

    /**
     * Reads one record (up to the next <code>%%</code> line or the end of
     * input) and files its tag or subtag under the record's type. Records
     * without a <code>Subtag</code> or <code>Tag</code> field are skipped.
     *
     * @return <code>false</code> when the end of input was reached while
     *         reading this record, <code>true</code> otherwise
     */
    private boolean consumeRecord(BufferedReader in) throws IOException {
        boolean hasMore = true;
        String type = null;
        String subtag = null;
        String suppressScript = null;
        // Arrays hash by identity, so a plain HashSet would iterate in an
        // order that differs between runs; LinkedHashSet keeps the order in
        // which the prefixes appear in the registry.
        Set<String[]> prefixes = new LinkedHashSet<String[]>();
        boolean depr = false;
        for (;;) {
            String line = in.readLine();
            if (line == null) {
                hasMore = false;
                break;
            }
            // Fixed locale: with the default locale, a Turkish or Azeri
            // environment would lower-case "I" to dotless "ı" and corrupt
            // subtags such as "IT".
            line = line.toLowerCase(Locale.ENGLISH);
            if (RECORD_SEPARATOR.equals(line)) {
                break;
            } else if (line.startsWith(TYPE)) {
                type = fieldValue(line, TYPE);
            } else if (line.startsWith(SUBTAG)) {
                subtag = fieldValue(line, SUBTAG);
            } else if (line.startsWith(TAG)) {
                subtag = fieldValue(line, TAG);
            } else if (line.startsWith(SUPPRESS_SCRIPT)) {
                suppressScript = fieldValue(line, SUPPRESS_SCRIPT);
            } else if (line.startsWith(PREFIX)) {
                String[] prefixSubtags = HYPHEN.split(line.substring(PREFIX.length()).trim());
                for (int i = 0; i < prefixSubtags.length; i++) {
                    prefixSubtags[i] = prefixSubtags[i].intern();
                }
                prefixes.add(prefixSubtags);
            } else if (line.startsWith(DEPRECATED)) {
                depr = true;
            }
        }
        if (subtag == null) {
            return hasMore;
        }
        if (depr) {
            // Deprecation is recorded regardless of the record type.
            deprecatedSet.add(subtag);
        }
        // equals() rather than ==: correctness must not hinge on both
        // operands happening to be interned.
        if ("language".equals(type)) {
            languageSet.add(subtag);
            suppressedScriptByLanguageMap.put(subtag, suppressScript);
        } else if ("region".equals(type)) {
            regionSet.add(subtag);
        } else if ("script".equals(type)) {
            scriptSet.add(subtag);
        } else if ("variant".equals(type)) {
            variantSet.add(subtag);
            prefixesByVariantMap.put(subtag, prefixes);
        } else if ("grandfathered".equals(type)) {
            grandfatheredSet.add(subtag);
        }
        return hasMore;
    }

    /**
     * Returns the trimmed, interned text that follows the field name on a
     * line already known to start with that name.
     */
    private static String fieldValue(String line, String fieldName) {
        return line.substring(fieldName.length()).trim().intern();
    }

    /**
     * Returns the subtags of all records of type <code>language</code>.
     * 
     * @return the sorted, lower-case language subtags
     */
    public String[] getLanguages() {
        return languages;
    }

    /**
     * Returns the prefixes of each variant. Element <code>i</code> belongs
     * to element <code>i</code> of {@link #getVariants()} and holds one
     * array per <code>Prefix</code> field, in registry order; each such
     * array contains the hyphen-separated subtags of that prefix. Variants
     * without prefixes have an empty array.
     * 
     * @return the prefixes, indexed like {@link #getVariants()}
     */
    public String[][][] getPrefixesByVariant() {
        return prefixesByVariant;
    }

    /**
     * Returns the subtags of all records of type <code>region</code>.
     * 
     * @return the sorted, lower-case region subtags
     */
    public String[] getRegions() {
        return regions;
    }

    /**
     * Returns the subtags of all records of type <code>script</code>.
     * 
     * @return the sorted, lower-case script subtags
     */
    public String[] getScripts() {
        return scripts;
    }

    /**
     * Returns the suppressed script of each language. Element
     * <code>i</code> belongs to element <code>i</code> of
     * {@link #getLanguages()} and is the index into {@link #getScripts()}
     * of the script named by the language's <code>Suppress-Script</code>
     * field, or <code>-1</code> if the language has no such field.
     * 
     * @return the script indexes, indexed like {@link #getLanguages()}
     */
    public int[] getSuppressedScriptByLanguage() {
        return suppressedScriptByLanguage;
    }

    /**
     * Returns the subtags of all records of type <code>variant</code>.
     * 
     * @return the sorted, lower-case variant subtags
     */
    public String[] getVariants() {
        return variants;
    }

    /**
     * Returns the tags and subtags of all records that carry a
     * <code>Deprecated</code> field, whatever the type of the record.
     * 
     * @return the sorted, lower-case deprecated tags and subtags
     */
    public String[] getDeprecated() {
        return deprecated;
    }

    /**
     * Returns the tags of all records of type <code>grandfathered</code>.
     * 
     * @return the sorted, lower-case grandfathered tags
     */
    public String[] getGrandfathered() {
        return grandfathered;
    }
}