001 /* 002 * Copyright (c) 2006 Henri Sivonen 003 * Copyright (c) 2007 Mozilla Foundation 004 * 005 * Permission is hereby granted, free of charge, to any person obtaining a 006 * copy of this software and associated documentation files (the "Software"), 007 * to deal in the Software without restriction, including without limitation 008 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 009 * and/or sell copies of the Software, and to permit persons to whom the 010 * Software is furnished to do so, subject to the following conditions: 011 * 012 * The above copyright notice and this permission notice shall be included in 013 * all copies or substantial portions of the Software. 014 * 015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 021 * DEALINGS IN THE SOFTWARE. 022 */ 023 024 package org.whattf.datatype; 025 026 import java.io.IOException; 027 import java.util.Arrays; 028 import java.util.regex.Pattern; 029 030 import org.relaxng.datatype.DatatypeException; 031 import org.whattf.datatype.data.LanguageData; 032 033 /** 034 * 035 * @version $Id: Language.java 219 2007-10-22 19:38:04Z hsivonen $ 036 * @author hsivonen 037 */ 038 public final class Language extends AbstractDatatype { 039 040 /** 041 * The singleton instance. 042 */ 043 public static final Language THE_INSTANCE = new Language(); 044 045 private static final Pattern HYPHEN = Pattern.compile("-"); 046 047 private static String[] languages = null; 048 049 private static String[] scripts = null; 050 051 private static String[] regions = null; 052 053 private static String[] variants = null; 054 055 private static String[] grandfathered = null; 056 057 private static String[] deprecated = null; 058 059 private static int[] suppressedScriptByLanguage = null; 060 061 private static String[][][] prefixesByVariant = null; 062 063 static { 064 try { 065 LanguageData data = new LanguageData(); 066 languages = data.getLanguages(); 067 scripts = data.getScripts(); 068 regions = data.getRegions(); 069 variants = data.getVariants(); 070 grandfathered = data.getGrandfathered(); 071 deprecated = data.getDeprecated(); 072 suppressedScriptByLanguage = data.getSuppressedScriptByLanguage(); 073 prefixesByVariant = data.getPrefixesByVariant(); 074 } catch (IOException e) { 075 throw new RuntimeException(e); 076 } 077 } 078 079 /** 080 * Package-private constructor 081 */ 082 private Language() { 083 super(); 084 } 085 086 public void checkValid(CharSequence lit) 087 throws DatatypeException { 088 String literal = lit.toString(); 089 if (literal.length() == 0) { 090 throw new DatatypeException( 091 "The empty string is not a valid language tag."); 092 } 093 literal = toAsciiLowerCase(literal); 094 if (isGrandfathered(literal)) { 095 if (isDeprecated(literal)) { 096 throw new DatatypeException( 097 "The grandfathered language tag \u201C" + literal + "\u201D is deprecated."); 098 } 099 return; 100 } 101 if (literal.startsWith("-")) { 102 throw new DatatypeException( 103 "Language tag must not start with HYPHEN-MINUS."); 104 } 105 if (literal.endsWith("-")) { 106 throw new DatatypeException( 107 "Language tag must not end with HYPHEN-MINUS."); 108 } 109 110 String[] subtags = HYPHEN.split(literal); 111 112 for (int j = 0; j < subtags.length; j++) { 113 int len = subtags[j].length(); 114 if (len == 0) { 115 throw new DatatypeException( 116 "Zero-length subtag."); 117 } else if (len > 8) { 118 throw new DatatypeException( 119 "Subtags must next exceed 8 characters in length."); 120 } 121 } 122 123 // Language 124 125 int i = 0; 126 String subtag = subtags[i]; 127 int len = subtag.length(); 128 if ("x".equals(subtag)) { 129 checkPrivateUse(i, subtags); 130 return; 131 } 132 if ((len == 2 || len == 3) && isLowerCaseAlpha(subtag)) { 133 if (!isLanguage(subtag)) { 134 throw new DatatypeException( 135 "Bad ISO language part in language tag."); 136 } 137 if (isDeprecated(subtag)) { 138 throw new DatatypeException( 139 "The language subtag \u201C" + subtag + "\u201D is deprecated."); 140 } 141 i++; 142 if (i == subtags.length) { 143 return; 144 } 145 subtag = subtags[i]; 146 len = subtag.length(); 147 if (len == 3) { 148 throw new DatatypeException( 149 "Found reserved language extension subtag."); 150 } 151 } else if (len == 4 && isLowerCaseAlpha(subtag)) { 152 throw new DatatypeException("Found reserved language tag."); 153 } else if (len == 5 && isLowerCaseAlpha(subtag)) { 154 if (!isLanguage(subtag)) { 155 throw new DatatypeException( 156 "Bad IANA language part in language tag."); 157 } 158 if (isDeprecated(subtag)) { 159 throw new DatatypeException( 160 "The language subtag \u201C" + subtag + "\u201D is deprecated."); 161 } 162 i++; 163 if (i == subtags.length) { 164 return; 165 } 166 subtag = subtags[i]; 167 len = subtag.length(); 168 } 169 170 // Script? 171 172 if ("x".equals(subtag)) { 173 checkPrivateUse(i, subtags); 174 return; 175 } 176 if (subtag.length() == 4) { 177 if (!isScript(subtag)) { 178 throw new DatatypeException("Bad script subtag."); 179 } 180 if (isDeprecated(subtag)) { 181 throw new DatatypeException( 182 "The script subtag \u201C" + subtag + "\u201D is deprecated."); 183 } 184 if (shouldSuppressScript(subtags[0], subtag)) { 185 throw new DatatypeException("Language tag should omit the default script for the language."); 186 } 187 i++; 188 if (i == subtags.length) { 189 return; 190 } 191 subtag = subtags[i]; 192 len = subtag.length(); 193 } 194 195 // Region 196 197 if ((len == 3 && isDigit(subtag)) 198 || (len == 2 && isLowerCaseAlpha(subtag))) { 199 if (!isRegion(subtag)) { 200 throw new DatatypeException("Bad region subtag."); 201 } 202 if (isDeprecated(subtag)) { 203 throw new DatatypeException( 204 "The region subtag \u201C" + subtag + "\u201D is deprecated."); 205 } 206 i++; 207 if (i == subtags.length) { 208 return; 209 } 210 subtag = subtags[i]; 211 len = subtag.length(); 212 } 213 214 // Variant 215 216 for (;;) { 217 if ("x".equals(subtag)) { 218 checkPrivateUse(i, subtags); 219 return; 220 } 221 // cutting corners here a bit since there are no extensions at this time 222 if (len == 1) { 223 throw new DatatypeException("Unknown extension."); 224 } else { 225 if (!isVariant(subtag)) { 226 throw new DatatypeException("Bad variant subtag."); 227 } 228 if (isDeprecated(subtag)) { 229 throw new DatatypeException( 230 "The variant subtag \u201C" + subtag + "\u201D is deprecated."); 231 } 232 if (!hasGoodPrefix(subtags, i)) { 233 throw new DatatypeException("Variant lacks required prefix."); 234 } 235 } 236 i++; 237 if (i == subtags.length) { 238 return; 239 } 240 subtag = subtags[i]; 241 len = subtag.length(); 242 } 243 } 244 245 private boolean hasGoodPrefix(String[] subtags, int i) { 246 String variant = subtags[i]; 247 int index = Arrays.binarySearch(variants, variant); 248 assert index >= 0; 249 String[][] prefixes = prefixesByVariant[index]; 250 if (prefixes.length == 0) { 251 return true; 252 } 253 for (int j = 0; j < prefixes.length; j++) { 254 String[] prefix = prefixes[j]; 255 if (prefixMatches(prefix, subtags, i)) { 256 return true; 257 } 258 } 259 return false; 260 } 261 262 private boolean prefixMatches(String[] prefix, String[] subtags, int limit) { 263 for (int i = 0; i < prefix.length; i++) { 264 String prefixComponent = prefix[i]; 265 if (!subtagsContainPrefixComponent(prefixComponent, subtags, limit)) { 266 return false; 267 } 268 } 269 return true; 270 } 271 272 private boolean subtagsContainPrefixComponent(String prefixComponent, String[] subtags, int limit) { 273 for (int i = 0; i < limit; i++) { 274 String subtag = subtags[i]; 275 if (subtag.equals(prefixComponent)) { 276 return true; 277 } 278 } 279 return false; 280 } 281 282 private boolean shouldSuppressScript(String language, String script) { 283 int langIndex = Arrays.binarySearch(languages, language); 284 assert langIndex > -1; 285 int scriptIndex = suppressedScriptByLanguage[langIndex]; 286 if (scriptIndex < 0) { 287 return false; 288 } else { 289 return scripts[scriptIndex].equals(script); 290 } 291 } 292 293 private boolean isVariant(String subtag) { 294 return (Arrays.binarySearch(variants, subtag) > -1); 295 } 296 297 private boolean isRegion(String subtag) { 298 return (Arrays.binarySearch(regions, subtag) > -1) || "aa".equals(subtag) 299 || ("qm".compareTo(subtag) <= 0 && "qz".compareTo(subtag) >= 0) 300 || ("xa".compareTo(subtag) <= 0 && "xz".compareTo(subtag) >= 0) 301 || "zz".equals(subtag); 302 } 303 304 private boolean isScript(String subtag) { 305 return (Arrays.binarySearch(scripts, subtag) > -1) 306 || ("qaaa".compareTo(subtag) <= 0 && "qabx".compareTo(subtag) >= 0); 307 } 308 309 private boolean isLanguage(String subtag) { 310 return (Arrays.binarySearch(languages, subtag) > -1) 311 || ("qaa".compareTo(subtag) <= 0 && "qtz".compareTo(subtag) >= 0); 312 } 313 314 private void checkPrivateUse(int i, String[] subtags) 315 throws DatatypeException { 316 int len = subtags.length; 317 i++; 318 if (i == len) { 319 throw new DatatypeException("No subtags in private use sequence."); 320 } 321 while (i < len) { 322 String subtag = subtags[i]; 323 if (!isLowerCaseAlphaNumeric(subtag)) { 324 throw new DatatypeException( 325 "Bad character in private use subtag."); 326 } 327 i++; 328 } 329 } 330 331 private final boolean isLowerCaseAlphaNumeric(char c) { 332 return isLowerCaseAlpha(c) || isDigit(c); 333 } 334 335 private final boolean isLowerCaseAlphaNumeric(String str) { 336 for (int i = 0; i < str.length(); i++) { 337 if (!isLowerCaseAlphaNumeric(str.charAt(i))) { 338 return false; 339 } 340 } 341 return true; 342 } 343 344 /** 345 * @param c 346 * @return 347 */ 348 private final boolean isDigit(char c) { 349 return (c >= '0' && c <= '9'); 350 } 351 352 private final boolean isDigit(String str) { 353 for (int i = 0; i < str.length(); i++) { 354 if (!isDigit(str.charAt(i))) { 355 return false; 356 } 357 } 358 return true; 359 } 360 361 /** 362 * @param c 363 * @return 364 */ 365 private final boolean isLowerCaseAlpha(char c) { 366 return (c >= 'a' && c <= 'z'); 367 } 368 369 private final boolean isLowerCaseAlpha(String str) { 370 for (int i = 0; i < str.length(); i++) { 371 if (!isLowerCaseAlpha(str.charAt(i))) { 372 return false; 373 } 374 } 375 return true; 376 } 377 378 private boolean isGrandfathered(String literal) { 379 return Arrays.binarySearch(grandfathered, literal) > -1; 380 } 381 382 private boolean isDeprecated(String subtag) { 383 return Arrays.binarySearch(deprecated, subtag) > -1; 384 } 385 }