001 /* 002 * Copyright (c) 2006 Henri Sivonen 003 * Copyright (c) 2007 Mozilla Foundation 004 * 005 * Permission is hereby granted, free of charge, to any person obtaining a 006 * copy of this software and associated documentation files (the "Software"), 007 * to deal in the Software without restriction, including without limitation 008 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 009 * and/or sell copies of the Software, and to permit persons to whom the 010 * Software is furnished to do so, subject to the following conditions: 011 * 012 * The above copyright notice and this permission notice shall be included in 013 * all copies or substantial portions of the Software. 014 * 015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 021 * DEALINGS IN THE SOFTWARE. 022 */ 023 024 package org.whattf.datatype.data; 025 026 import java.io.BufferedReader; 027 import java.io.IOException; 028 import java.io.InputStreamReader; 029 import java.net.URL; 030 import java.util.Arrays; 031 import java.util.HashMap; 032 import java.util.HashSet; 033 import java.util.Map; 034 import java.util.Set; 035 import java.util.SortedSet; 036 import java.util.TreeSet; 037 import java.util.regex.Pattern; 038 039 public class LanguageData { 040 041 private static final Pattern HYPHEN = Pattern.compile("-"); 042 043 private static final String[][] EMPTY_DOUBLE_STRING_ARRAY = {}; 044 045 private static final String[] EMPTY_STRING_ARRAY = {}; 046 047 private static final String PREFIX = "prefix: "; 048 049 private static final String SUPPRESS_SCRIPT = "suppress-script: "; 050 051 private static final String SUBTAG = "subtag: "; 052 053 private static final String TAG = "tag: "; 054 055 private static final String TYPE = "type: "; 056 057 private static final String DEPRECATED = "deprecated: "; 058 059 private BufferedReader in; 060 061 private SortedSet<String> languageSet = new TreeSet<String>(); 062 063 private SortedSet<String> scriptSet = new TreeSet<String>(); 064 065 private SortedSet<String> regionSet = new TreeSet<String>(); 066 067 private SortedSet<String> variantSet = new TreeSet<String>(); 068 069 private SortedSet<String> grandfatheredSet = new TreeSet<String>(); 070 071 private SortedSet<String> deprecatedSet = new TreeSet<String>(); 072 073 private Map<String, String> suppressedScriptByLanguageMap = new HashMap<String, String>(); 074 075 private Map<String, Set<String[]>> prefixesByVariantMap = new HashMap<String, Set<String[]>>(); 076 077 private String[] languages = null; 078 079 private String[] scripts = null; 080 081 private String[] regions = null; 082 083 private String[] variants = null; 084 085 private String[] grandfathered = null; 086 087 private String[] deprecated = null; 088 089 private int[] suppressedScriptByLanguage = null; 090 091 private String[][][] prefixesByVariant = null; 092 093 public LanguageData() throws IOException { 094 super(); 095 URL url = new URL("http://www.iana.org/assignments/language-subtag-registry"); 096 in = new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8")); 097 consumeRegistry(); 098 prepareArrays(); 099 } 100 101 private void consumeRegistry() throws IOException { 102 while(consumeRecord()) { 103 // spin 104 } 105 in.close(); 106 } 107 108 private void prepareArrays() throws IOException { 109 scripts = scriptSet.toArray(EMPTY_STRING_ARRAY); 110 regions = regionSet.toArray(EMPTY_STRING_ARRAY); 111 grandfathered = grandfatheredSet.toArray(EMPTY_STRING_ARRAY); 112 deprecated = deprecatedSet.toArray(EMPTY_STRING_ARRAY); 113 114 int i = 0; 115 languages = new String[languageSet.size()]; 116 suppressedScriptByLanguage = new int[languageSet.size()]; 117 for (String language : languageSet) { 118 languages[i] = language; 119 String suppressed = suppressedScriptByLanguageMap.get(language); 120 if (suppressed == null) { 121 suppressedScriptByLanguage[i] = -1; 122 } else { 123 int index = Arrays.binarySearch(scripts, suppressed); 124 if (index < 0) { 125 throw new IOException("Malformed registry: reference to non-existent script."); 126 } 127 suppressedScriptByLanguage[i] = index; 128 } 129 i++; 130 } 131 132 i = 0; 133 variants = new String[variantSet.size()]; 134 prefixesByVariant = new String[variantSet.size()][][]; 135 for (String variant : variantSet) { 136 variants[i] = variant; 137 Set<String[]> prefixes = prefixesByVariantMap.get(variant); 138 if (prefixes != null) { 139 prefixesByVariant[i] = prefixes.toArray(EMPTY_DOUBLE_STRING_ARRAY); 140 } else { 141 prefixesByVariant[i] = EMPTY_DOUBLE_STRING_ARRAY; 142 } 143 i++; 144 } 145 } 146 147 private boolean consumeRecord() throws IOException { 148 boolean hasMore = true; 149 String type = null; 150 String subtag = null; 151 String suppressScript = null; 152 Set<String[]> prefixes = new HashSet<String[]>(); 153 boolean depr = false; 154 String line = null; 155 for (;;) { 156 line = in.readLine(); 157 if (line == null) { 158 hasMore = false; 159 break; 160 } 161 line = line.toLowerCase(); 162 if ("%%".equals(line)) { 163 break; 164 } else if (line.startsWith(TYPE)) { 165 type = line.substring(TYPE.length()).trim().intern(); 166 } else if (line.startsWith(SUBTAG)) { 167 subtag = line.substring(SUBTAG.length()).trim().intern(); 168 } else if (line.startsWith(TAG)) { 169 subtag = line.substring(TAG.length()).trim().intern(); 170 } else if (line.startsWith(SUPPRESS_SCRIPT)) { 171 suppressScript = line.substring(SUPPRESS_SCRIPT.length()).trim().intern(); 172 } else if (line.startsWith(PREFIX)) { 173 String[] prefixSubtags = HYPHEN.split(line.substring(PREFIX.length()).trim()); 174 for (int i = 0; i < prefixSubtags.length; i++) { 175 prefixSubtags[i] = prefixSubtags[i].intern(); 176 } 177 prefixes.add(prefixSubtags); 178 } else if (line.startsWith(DEPRECATED)) { 179 depr = true; 180 } 181 } 182 if (subtag == null) { 183 return hasMore; 184 } 185 if (depr) { 186 deprecatedSet.add(subtag); 187 } 188 if ("language" == type) { 189 languageSet.add(subtag); 190 suppressedScriptByLanguageMap.put(subtag, suppressScript); 191 } else if ("region" == type) { 192 regionSet.add(subtag); 193 } else if ("script" == type) { 194 scriptSet.add(subtag); 195 } else if ("variant" == type) { 196 variantSet.add(subtag); 197 prefixesByVariantMap.put(subtag, prefixes); 198 } else if ("grandfathered" == type) { 199 grandfatheredSet.add(subtag); 200 } 201 return hasMore; 202 } 203 204 /** 205 * Returns the languages. 206 * 207 * @return the languages 208 */ 209 public String[] getLanguages() { 210 return languages; 211 } 212 213 /** 214 * Returns the prefixesByVariant. 215 * 216 * @return the prefixesByVariant 217 */ 218 public String[][][] getPrefixesByVariant() { 219 return prefixesByVariant; 220 } 221 222 /** 223 * Returns the regions. 224 * 225 * @return the regions 226 */ 227 public String[] getRegions() { 228 return regions; 229 } 230 231 /** 232 * Returns the scripts. 233 * 234 * @return the scripts 235 */ 236 public String[] getScripts() { 237 return scripts; 238 } 239 240 /** 241 * Returns the suppressedScriptByLanguage. 242 * 243 * @return the suppressedScriptByLanguage 244 */ 245 public int[] getSuppressedScriptByLanguage() { 246 return suppressedScriptByLanguage; 247 } 248 249 /** 250 * Returns the variants. 251 * 252 * @return the variants 253 */ 254 public String[] getVariants() { 255 return variants; 256 } 257 258 /** 259 * Returns the deprecated. 260 * 261 * @return the deprecated 262 */ 263 public String[] getDeprecated() { 264 return deprecated; 265 } 266 267 /** 268 * Returns the grandfathered. 269 * 270 * @return the grandfathered 271 */ 272 public String[] getGrandfathered() { 273 return grandfathered; 274 } 275 }