001 /*
002 * Copyright (c) 2006 Henri Sivonen
003 * Copyright (c) 2007 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package org.whattf.datatype;
025
026 import java.io.IOException;
027 import java.util.Arrays;
028 import java.util.regex.Pattern;
029
030 import org.relaxng.datatype.DatatypeException;
031 import org.whattf.datatype.data.LanguageData;
032
033 /**
034 *
035 * @version $Id: Language.java 219 2007-10-22 19:38:04Z hsivonen $
036 * @author hsivonen
037 */
038 public final class Language extends AbstractDatatype {
039
040 /**
041 * The singleton instance.
042 */
043 public static final Language THE_INSTANCE = new Language();
044
045 private static final Pattern HYPHEN = Pattern.compile("-");
046
047 private static String[] languages = null;
048
049 private static String[] scripts = null;
050
051 private static String[] regions = null;
052
053 private static String[] variants = null;
054
055 private static String[] grandfathered = null;
056
057 private static String[] deprecated = null;
058
059 private static int[] suppressedScriptByLanguage = null;
060
061 private static String[][][] prefixesByVariant = null;
062
063 static {
064 try {
065 LanguageData data = new LanguageData();
066 languages = data.getLanguages();
067 scripts = data.getScripts();
068 regions = data.getRegions();
069 variants = data.getVariants();
070 grandfathered = data.getGrandfathered();
071 deprecated = data.getDeprecated();
072 suppressedScriptByLanguage = data.getSuppressedScriptByLanguage();
073 prefixesByVariant = data.getPrefixesByVariant();
074 } catch (IOException e) {
075 throw new RuntimeException(e);
076 }
077 }
078
079 /**
080 * Package-private constructor
081 */
082 private Language() {
083 super();
084 }
085
086 public void checkValid(CharSequence lit)
087 throws DatatypeException {
088 String literal = lit.toString();
089 if (literal.length() == 0) {
090 throw new DatatypeException(
091 "The empty string is not a valid language tag.");
092 }
093 literal = toAsciiLowerCase(literal);
094 if (isGrandfathered(literal)) {
095 if (isDeprecated(literal)) {
096 throw new DatatypeException(
097 "The grandfathered language tag \u201C" + literal + "\u201D is deprecated.");
098 }
099 return;
100 }
101 if (literal.startsWith("-")) {
102 throw new DatatypeException(
103 "Language tag must not start with HYPHEN-MINUS.");
104 }
105 if (literal.endsWith("-")) {
106 throw new DatatypeException(
107 "Language tag must not end with HYPHEN-MINUS.");
108 }
109
110 String[] subtags = HYPHEN.split(literal);
111
112 for (int j = 0; j < subtags.length; j++) {
113 int len = subtags[j].length();
114 if (len == 0) {
115 throw new DatatypeException(
116 "Zero-length subtag.");
117 } else if (len > 8) {
118 throw new DatatypeException(
119 "Subtags must next exceed 8 characters in length.");
120 }
121 }
122
123 // Language
124
125 int i = 0;
126 String subtag = subtags[i];
127 int len = subtag.length();
128 if ("x".equals(subtag)) {
129 checkPrivateUse(i, subtags);
130 return;
131 }
132 if ((len == 2 || len == 3) && isLowerCaseAlpha(subtag)) {
133 if (!isLanguage(subtag)) {
134 throw new DatatypeException(
135 "Bad ISO language part in language tag.");
136 }
137 if (isDeprecated(subtag)) {
138 throw new DatatypeException(
139 "The language subtag \u201C" + subtag + "\u201D is deprecated.");
140 }
141 i++;
142 if (i == subtags.length) {
143 return;
144 }
145 subtag = subtags[i];
146 len = subtag.length();
147 if (len == 3) {
148 throw new DatatypeException(
149 "Found reserved language extension subtag.");
150 }
151 } else if (len == 4 && isLowerCaseAlpha(subtag)) {
152 throw new DatatypeException("Found reserved language tag.");
153 } else if (len == 5 && isLowerCaseAlpha(subtag)) {
154 if (!isLanguage(subtag)) {
155 throw new DatatypeException(
156 "Bad IANA language part in language tag.");
157 }
158 if (isDeprecated(subtag)) {
159 throw new DatatypeException(
160 "The language subtag \u201C" + subtag + "\u201D is deprecated.");
161 }
162 i++;
163 if (i == subtags.length) {
164 return;
165 }
166 subtag = subtags[i];
167 len = subtag.length();
168 }
169
170 // Script?
171
172 if ("x".equals(subtag)) {
173 checkPrivateUse(i, subtags);
174 return;
175 }
176 if (subtag.length() == 4) {
177 if (!isScript(subtag)) {
178 throw new DatatypeException("Bad script subtag.");
179 }
180 if (isDeprecated(subtag)) {
181 throw new DatatypeException(
182 "The script subtag \u201C" + subtag + "\u201D is deprecated.");
183 }
184 if (shouldSuppressScript(subtags[0], subtag)) {
185 throw new DatatypeException("Language tag should omit the default script for the language.");
186 }
187 i++;
188 if (i == subtags.length) {
189 return;
190 }
191 subtag = subtags[i];
192 len = subtag.length();
193 }
194
195 // Region
196
197 if ((len == 3 && isDigit(subtag))
198 || (len == 2 && isLowerCaseAlpha(subtag))) {
199 if (!isRegion(subtag)) {
200 throw new DatatypeException("Bad region subtag.");
201 }
202 if (isDeprecated(subtag)) {
203 throw new DatatypeException(
204 "The region subtag \u201C" + subtag + "\u201D is deprecated.");
205 }
206 i++;
207 if (i == subtags.length) {
208 return;
209 }
210 subtag = subtags[i];
211 len = subtag.length();
212 }
213
214 // Variant
215
216 for (;;) {
217 if ("x".equals(subtag)) {
218 checkPrivateUse(i, subtags);
219 return;
220 }
221 // cutting corners here a bit since there are no extensions at this time
222 if (len == 1) {
223 throw new DatatypeException("Unknown extension.");
224 } else {
225 if (!isVariant(subtag)) {
226 throw new DatatypeException("Bad variant subtag.");
227 }
228 if (isDeprecated(subtag)) {
229 throw new DatatypeException(
230 "The variant subtag \u201C" + subtag + "\u201D is deprecated.");
231 }
232 if (!hasGoodPrefix(subtags, i)) {
233 throw new DatatypeException("Variant lacks required prefix.");
234 }
235 }
236 i++;
237 if (i == subtags.length) {
238 return;
239 }
240 subtag = subtags[i];
241 len = subtag.length();
242 }
243 }
244
245 private boolean hasGoodPrefix(String[] subtags, int i) {
246 String variant = subtags[i];
247 int index = Arrays.binarySearch(variants, variant);
248 assert index >= 0;
249 String[][] prefixes = prefixesByVariant[index];
250 if (prefixes.length == 0) {
251 return true;
252 }
253 for (int j = 0; j < prefixes.length; j++) {
254 String[] prefix = prefixes[j];
255 if (prefixMatches(prefix, subtags, i)) {
256 return true;
257 }
258 }
259 return false;
260 }
261
262 private boolean prefixMatches(String[] prefix, String[] subtags, int limit) {
263 for (int i = 0; i < prefix.length; i++) {
264 String prefixComponent = prefix[i];
265 if (!subtagsContainPrefixComponent(prefixComponent, subtags, limit)) {
266 return false;
267 }
268 }
269 return true;
270 }
271
272 private boolean subtagsContainPrefixComponent(String prefixComponent, String[] subtags, int limit) {
273 for (int i = 0; i < limit; i++) {
274 String subtag = subtags[i];
275 if (subtag.equals(prefixComponent)) {
276 return true;
277 }
278 }
279 return false;
280 }
281
282 private boolean shouldSuppressScript(String language, String script) {
283 int langIndex = Arrays.binarySearch(languages, language);
284 assert langIndex > -1;
285 int scriptIndex = suppressedScriptByLanguage[langIndex];
286 if (scriptIndex < 0) {
287 return false;
288 } else {
289 return scripts[scriptIndex].equals(script);
290 }
291 }
292
293 private boolean isVariant(String subtag) {
294 return (Arrays.binarySearch(variants, subtag) > -1);
295 }
296
297 private boolean isRegion(String subtag) {
298 return (Arrays.binarySearch(regions, subtag) > -1) || "aa".equals(subtag)
299 || ("qm".compareTo(subtag) <= 0 && "qz".compareTo(subtag) >= 0)
300 || ("xa".compareTo(subtag) <= 0 && "xz".compareTo(subtag) >= 0)
301 || "zz".equals(subtag);
302 }
303
304 private boolean isScript(String subtag) {
305 return (Arrays.binarySearch(scripts, subtag) > -1)
306 || ("qaaa".compareTo(subtag) <= 0 && "qabx".compareTo(subtag) >= 0);
307 }
308
309 private boolean isLanguage(String subtag) {
310 return (Arrays.binarySearch(languages, subtag) > -1)
311 || ("qaa".compareTo(subtag) <= 0 && "qtz".compareTo(subtag) >= 0);
312 }
313
314 private void checkPrivateUse(int i, String[] subtags)
315 throws DatatypeException {
316 int len = subtags.length;
317 i++;
318 if (i == len) {
319 throw new DatatypeException("No subtags in private use sequence.");
320 }
321 while (i < len) {
322 String subtag = subtags[i];
323 if (!isLowerCaseAlphaNumeric(subtag)) {
324 throw new DatatypeException(
325 "Bad character in private use subtag.");
326 }
327 i++;
328 }
329 }
330
331 private final boolean isLowerCaseAlphaNumeric(char c) {
332 return isLowerCaseAlpha(c) || isDigit(c);
333 }
334
335 private final boolean isLowerCaseAlphaNumeric(String str) {
336 for (int i = 0; i < str.length(); i++) {
337 if (!isLowerCaseAlphaNumeric(str.charAt(i))) {
338 return false;
339 }
340 }
341 return true;
342 }
343
344 /**
345 * @param c
346 * @return
347 */
348 private final boolean isDigit(char c) {
349 return (c >= '0' && c <= '9');
350 }
351
352 private final boolean isDigit(String str) {
353 for (int i = 0; i < str.length(); i++) {
354 if (!isDigit(str.charAt(i))) {
355 return false;
356 }
357 }
358 return true;
359 }
360
361 /**
362 * @param c
363 * @return
364 */
365 private final boolean isLowerCaseAlpha(char c) {
366 return (c >= 'a' && c <= 'z');
367 }
368
369 private final boolean isLowerCaseAlpha(String str) {
370 for (int i = 0; i < str.length(); i++) {
371 if (!isLowerCaseAlpha(str.charAt(i))) {
372 return false;
373 }
374 }
375 return true;
376 }
377
378 private boolean isGrandfathered(String literal) {
379 return Arrays.binarySearch(grandfathered, literal) > -1;
380 }
381
382 private boolean isDeprecated(String subtag) {
383 return Arrays.binarySearch(deprecated, subtag) > -1;
384 }
385 }