001 /*
002 * Copyright (c) 2006 Henri Sivonen
003 * Copyright (c) 2007 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package org.whattf.datatype.data;
025
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Pattern;
038
/**
 * In-memory, array-based view of a language subtag registry.
 *
 * <p>The registry text is read record by record (records are separated by a
 * line consisting of <code>%%</code>). Every line is lower-cased before it is
 * inspected, so all subtags exposed by this class are lower case. Lines that
 * do not start with one of the recognized field names (<code>type</code>,
 * <code>subtag</code>, <code>tag</code>, <code>suppress-script</code>,
 * <code>prefix</code>, <code>deprecated</code>) are ignored.</p>
 *
 * <p>After parsing, the collected data is flattened into sorted arrays that
 * are available through the getters. The getters return the internal arrays
 * without copying; callers must treat them as read-only.</p>
 */
public class LanguageData {

    private static final Pattern HYPHEN = Pattern.compile("-");

    private static final String[][] EMPTY_DOUBLE_STRING_ARRAY = {};

    private static final String[] EMPTY_STRING_ARRAY = {};

    /** Location the no-argument constructor downloads the registry from. */
    private static final String REGISTRY_URL = "http://www.iana.org/assignments/language-subtag-registry";

    private static final String PREFIX = "prefix: ";

    private static final String SUPPRESS_SCRIPT = "suppress-script: ";

    private static final String SUBTAG = "subtag: ";

    private static final String TAG = "tag: ";

    private static final String TYPE = "type: ";

    private static final String DEPRECATED = "deprecated: ";

    /** Source of the registry text; closed once the registry has been consumed. */
    private final BufferedReader in;

    private final SortedSet<String> languageSet = new TreeSet<String>();

    private final SortedSet<String> scriptSet = new TreeSet<String>();

    private final SortedSet<String> regionSet = new TreeSet<String>();

    private final SortedSet<String> variantSet = new TreeSet<String>();

    private final SortedSet<String> grandfatheredSet = new TreeSet<String>();

    private final SortedSet<String> deprecatedSet = new TreeSet<String>();

    private final Map<String, String> suppressedScriptByLanguageMap = new HashMap<String, String>();

    /**
     * Prefixes of each variant, in the order they appear in the registry.
     * A list is used because arrays hash by identity, which makes a hash set
     * of arrays both unable to detect duplicates and unordered between runs.
     */
    private final Map<String, List<String[]>> prefixesByVariantMap = new HashMap<String, List<String[]>>();

    private String[] languages = null;

    private String[] scripts = null;

    private String[] regions = null;

    private String[] variants = null;

    private String[] grandfathered = null;

    private String[] deprecated = null;

    private int[] suppressedScriptByLanguage = null;

    private String[][][] prefixesByVariant = null;

    /**
     * Downloads the registry from the IANA web site and parses it.
     *
     * @throws IOException
     *             if the registry cannot be fetched or read, or if it refers
     *             to a suppressed script that it does not define
     */
    public LanguageData() throws IOException {
        this(openRegistry());
    }

    /**
     * Parses a registry from the given reader. The reader is always closed
     * before this constructor returns, whether parsing succeeds or fails.
     *
     * @param registry
     *            the registry text; must not be <code>null</code>
     * @throws IOException
     *             if reading fails, or if the registry refers to a suppressed
     *             script that it does not define
     * @throws NullPointerException
     *             if <code>registry</code> is <code>null</code>
     */
    public LanguageData(Reader registry) throws IOException {
        super();
        if (registry == null) {
            throw new NullPointerException("registry");
        }
        in = (registry instanceof BufferedReader)
                ? (BufferedReader) registry
                : new BufferedReader(registry);
        consumeRegistry();
        prepareArrays();
    }

    /**
     * Opens the registry at {@link #REGISTRY_URL} as UTF-8 text.
     */
    private static Reader openRegistry() throws IOException {
        URL url = new URL(REGISTRY_URL);
        return new InputStreamReader(url.openStream(), "UTF-8");
    }

    /**
     * Reads records until the input is exhausted, then closes the reader.
     * The reader is closed even when reading fails, so the underlying stream
     * is not leaked on error.
     */
    private void consumeRegistry() throws IOException {
        try {
            while (consumeRecord()) {
                // keep reading; all the work happens in consumeRecord()
            }
        } finally {
            in.close();
        }
    }

    /**
     * Flattens the collected sets and maps into the arrays exposed by the
     * getters. All arrays derived from the sorted sets are in natural
     * <code>String</code> order.
     *
     * @throws IOException
     *             if a language names a suppressed script that is not among
     *             the registered scripts
     */
    private void prepareArrays() throws IOException {
        scripts = scriptSet.toArray(EMPTY_STRING_ARRAY);
        regions = regionSet.toArray(EMPTY_STRING_ARRAY);
        grandfathered = grandfatheredSet.toArray(EMPTY_STRING_ARRAY);
        deprecated = deprecatedSet.toArray(EMPTY_STRING_ARRAY);

        int i = 0;
        languages = new String[languageSet.size()];
        suppressedScriptByLanguage = new int[languageSet.size()];
        for (String language : languageSet) {
            languages[i] = language;
            String suppressed = suppressedScriptByLanguageMap.get(language);
            if (suppressed == null) {
                suppressedScriptByLanguage[i] = -1;
            } else {
                // scripts is sorted (it comes from a TreeSet), so binary
                // search is valid here.
                int index = Arrays.binarySearch(scripts, suppressed);
                if (index < 0) {
                    throw new IOException("Malformed registry: reference to non-existent script.");
                }
                suppressedScriptByLanguage[i] = index;
            }
            i++;
        }

        i = 0;
        variants = new String[variantSet.size()];
        prefixesByVariant = new String[variantSet.size()][][];
        for (String variant : variantSet) {
            variants[i] = variant;
            List<String[]> prefixes = prefixesByVariantMap.get(variant);
            if (prefixes != null) {
                prefixesByVariant[i] = prefixes.toArray(EMPTY_DOUBLE_STRING_ARRAY);
            } else {
                prefixesByVariant[i] = EMPTY_DOUBLE_STRING_ARRAY;
            }
            i++;
        }
    }

    /**
     * Reads one record (everything up to the next <code>%%</code> line or the
     * end of input) and registers it.
     *
     * @return <code>true</code> if the record was terminated by
     *         <code>%%</code>, <code>false</code> if the end of input was
     *         reached
     */
    private boolean consumeRecord() throws IOException {
        boolean hasMore = true;
        String type = null;
        String subtag = null;
        String suppressScript = null;
        List<String[]> prefixes = new ArrayList<String[]>();
        boolean depr = false;
        for (;;) {
            String line = in.readLine();
            if (line == null) {
                hasMore = false;
                break;
            }
            // Use a fixed locale: with the default locale a Turkish or Azeri
            // JVM would turn "I" into a dotless i and corrupt subtags.
            line = line.toLowerCase(Locale.ENGLISH);
            if ("%%".equals(line)) {
                break;
            } else if (line.startsWith(TYPE)) {
                type = fieldValue(line, TYPE);
            } else if (line.startsWith(SUBTAG)) {
                subtag = fieldValue(line, SUBTAG);
            } else if (line.startsWith(TAG)) {
                subtag = fieldValue(line, TAG);
            } else if (line.startsWith(SUPPRESS_SCRIPT)) {
                suppressScript = fieldValue(line, SUPPRESS_SCRIPT);
            } else if (line.startsWith(PREFIX)) {
                String[] prefixSubtags = HYPHEN.split(line.substring(PREFIX.length()).trim());
                for (int i = 0; i < prefixSubtags.length; i++) {
                    prefixSubtags[i] = prefixSubtags[i].intern();
                }
                prefixes.add(prefixSubtags);
            } else if (line.startsWith(DEPRECATED)) {
                depr = true;
            }
        }
        // Records without a subtag or tag carry nothing to register.
        if (subtag != null) {
            registerRecord(type, subtag, suppressScript, prefixes, depr);
        }
        return hasMore;
    }

    /**
     * Returns the trimmed, interned text that follows the field name on the
     * given line.
     */
    private static String fieldValue(String line, String fieldName) {
        return line.substring(fieldName.length()).trim().intern();
    }

    /**
     * Files a parsed record under the collection matching its type. Records
     * of an unrecognized (or missing) type are only considered for the
     * deprecated set.
     */
    private void registerRecord(String type, String subtag,
            String suppressScript, List<String[]> prefixes, boolean depr) {
        if (depr) {
            deprecatedSet.add(subtag);
        }
        // equals() instead of ==: correctness must not depend on interning.
        if ("language".equals(type)) {
            languageSet.add(subtag);
            suppressedScriptByLanguageMap.put(subtag, suppressScript);
        } else if ("region".equals(type)) {
            regionSet.add(subtag);
        } else if ("script".equals(type)) {
            scriptSet.add(subtag);
        } else if ("variant".equals(type)) {
            variantSet.add(subtag);
            prefixesByVariantMap.put(subtag, prefixes);
        } else if ("grandfathered".equals(type)) {
            grandfatheredSet.add(subtag);
        }
    }

    /**
     * Returns the language subtags, lower-cased and sorted.
     *
     * @return the languages (internal array, not a copy)
     */
    public String[] getLanguages() {
        return languages;
    }

    /**
     * Returns the prefixes of each variant. The outer index is parallel to
     * {@link #getVariants()}; each entry holds the variant's prefixes in
     * registry order, and each prefix is the array of its hyphen-separated
     * subtags. A variant without prefixes has an empty entry.
     *
     * @return the prefixesByVariant (internal array, not a copy)
     */
    public String[][][] getPrefixesByVariant() {
        return prefixesByVariant;
    }

    /**
     * Returns the region subtags, lower-cased and sorted.
     *
     * @return the regions (internal array, not a copy)
     */
    public String[] getRegions() {
        return regions;
    }

    /**
     * Returns the script subtags, lower-cased and sorted.
     *
     * @return the scripts (internal array, not a copy)
     */
    public String[] getScripts() {
        return scripts;
    }

    /**
     * Returns, for each language, the index of its suppressed script. The
     * array is parallel to {@link #getLanguages()}; each element is an index
     * into {@link #getScripts()}, or <code>-1</code> when the language has no
     * suppressed script.
     *
     * @return the suppressedScriptByLanguage (internal array, not a copy)
     */
    public int[] getSuppressedScriptByLanguage() {
        return suppressedScriptByLanguage;
    }

    /**
     * Returns the variant subtags, lower-cased and sorted.
     *
     * @return the variants (internal array, not a copy)
     */
    public String[] getVariants() {
        return variants;
    }

    /**
     * Returns the subtags and tags of all records that carry a
     * <code>deprecated</code> field, regardless of record type, lower-cased
     * and sorted.
     *
     * @return the deprecated (internal array, not a copy)
     */
    public String[] getDeprecated() {
        return deprecated;
    }

    /**
     * Returns the grandfathered tags, lower-cased and sorted.
     *
     * @return the grandfathered (internal array, not a copy)
     */
    public String[] getGrandfathered() {
        return grandfathered;
    }
}