001    /*
002     * Copyright (c) 2006 Henri Sivonen
003     * Copyright (c) 2007 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package org.whattf.datatype;
025    
026    import java.io.IOException;
027    import java.util.Arrays;
028    import java.util.regex.Pattern;
029    
030    import org.relaxng.datatype.DatatypeException;
031    import org.whattf.datatype.data.LanguageData;
032    
033    /**
034     * 
035     * @version $Id: Language.java 219 2007-10-22 19:38:04Z hsivonen $
036     * @author hsivonen
037     */
038    public final class Language extends AbstractDatatype {
039    
040        /**
041         * The singleton instance.
042         */
043        public static final Language THE_INSTANCE = new Language();
044        
045        private static final Pattern HYPHEN = Pattern.compile("-");
046        
047        private static String[] languages = null;
048        
049        private static String[] scripts = null;
050        
051        private static String[] regions = null;
052        
053        private static String[] variants = null;
054        
055        private static String[] grandfathered = null;
056        
057        private static String[] deprecated = null;
058    
059        private static int[] suppressedScriptByLanguage = null;
060        
061        private static String[][][] prefixesByVariant = null;
062        
063        static {
064            try {
065                LanguageData data = new LanguageData();
066                languages = data.getLanguages();
067                scripts = data.getScripts();
068                regions = data.getRegions();
069                variants = data.getVariants();
070                grandfathered = data.getGrandfathered();
071                deprecated = data.getDeprecated();
072                suppressedScriptByLanguage = data.getSuppressedScriptByLanguage();
073                prefixesByVariant = data.getPrefixesByVariant();
074            } catch (IOException e) {
075                throw new RuntimeException(e);
076            }
077        }
078    
079        /**
080         * Package-private constructor
081         */
082        private Language() {
083            super();
084        }
085    
086        public void checkValid(CharSequence lit)
087                throws DatatypeException {
088            String literal = lit.toString();
089            if (literal.length() == 0) {
090                throw new DatatypeException(
091                        "The empty string is not a valid language tag.");
092            }
093            literal = toAsciiLowerCase(literal);
094            if (isGrandfathered(literal)) {
095                if (isDeprecated(literal)) {
096                    throw new DatatypeException(
097                    "The grandfathered language tag \u201C" + literal + "\u201D is deprecated.");                
098                }
099                return;
100            }
101            if (literal.startsWith("-")) {
102                throw new DatatypeException(
103                        "Language tag must not start with HYPHEN-MINUS.");
104            }
105            if (literal.endsWith("-")) {
106                throw new DatatypeException(
107                        "Language tag must not end with HYPHEN-MINUS.");
108            }
109            
110            String[] subtags = HYPHEN.split(literal);
111            
112            for (int j = 0; j < subtags.length; j++) {
113                int len = subtags[j].length();
114                if (len == 0) {
115                    throw new DatatypeException(
116                    "Zero-length subtag.");                                
117                } else if (len > 8) {
118                    throw new DatatypeException(
119                    "Subtags must next exceed 8 characters in length.");                                                
120                }
121            }
122            
123            // Language
124            
125            int i = 0;
126            String subtag = subtags[i];
127            int len = subtag.length();
128            if ("x".equals(subtag)) {
129                checkPrivateUse(i, subtags);
130                return;
131            }
132            if ((len == 2 || len == 3) && isLowerCaseAlpha(subtag)) {
133                if (!isLanguage(subtag)) {
134                    throw new DatatypeException(
135                            "Bad ISO language part in language tag.");
136                }
137                if (isDeprecated(subtag)) {
138                    throw new DatatypeException(
139                    "The language subtag \u201C" + subtag + "\u201D is deprecated.");                
140                }
141                i++;
142                if (i == subtags.length) {
143                    return;
144                }
145                subtag = subtags[i];
146                len = subtag.length();
147                if (len == 3) {
148                    throw new DatatypeException(
149                            "Found reserved language extension subtag.");
150                }
151            } else if (len == 4 && isLowerCaseAlpha(subtag)) {
152                throw new DatatypeException("Found reserved language tag.");
153            } else if (len == 5 && isLowerCaseAlpha(subtag)) {
154                if (!isLanguage(subtag)) {
155                    throw new DatatypeException(
156                            "Bad IANA language part in language tag.");
157                }
158                if (isDeprecated(subtag)) {
159                    throw new DatatypeException(
160                    "The language subtag \u201C" + subtag + "\u201D is deprecated.");                
161                }
162                i++;
163                if (i == subtags.length) {
164                    return;
165                }
166                subtag = subtags[i];
167                len = subtag.length();
168            }
169            
170            // Script?
171            
172            if ("x".equals(subtag)) {
173                checkPrivateUse(i, subtags);
174                return;
175            }
176            if (subtag.length() == 4) {
177                if (!isScript(subtag)) {
178                    throw new DatatypeException("Bad script subtag.");
179                }
180                if (isDeprecated(subtag)) {
181                    throw new DatatypeException(
182                    "The script subtag \u201C" + subtag + "\u201D is deprecated.");                
183                }
184                if (shouldSuppressScript(subtags[0], subtag)) {
185                    throw new DatatypeException("Language tag should omit the default script for the language.");                
186                }
187                i++;
188                if (i == subtags.length) {
189                    return;
190                }
191                subtag = subtags[i];
192                len = subtag.length();
193            }
194            
195            // Region
196            
197            if ((len == 3 && isDigit(subtag))
198                    || (len == 2 && isLowerCaseAlpha(subtag))) {
199                if (!isRegion(subtag)) {
200                    throw new DatatypeException("Bad region subtag.");
201                }
202                if (isDeprecated(subtag)) {
203                    throw new DatatypeException(
204                    "The region subtag \u201C" + subtag + "\u201D is deprecated.");                
205                }
206                i++;
207                if (i == subtags.length) {
208                    return;
209                }
210                subtag = subtags[i];
211                len = subtag.length();
212            }
213            
214            // Variant
215            
216            for (;;) {
217                if ("x".equals(subtag)) {
218                    checkPrivateUse(i, subtags);
219                    return;
220                }
221                // cutting corners here a bit since there are no extensions at this time
222                if (len == 1) {
223                    throw new DatatypeException("Unknown extension.");
224                } else {
225                    if (!isVariant(subtag)) {
226                        throw new DatatypeException("Bad variant subtag.");
227                    }
228                    if (isDeprecated(subtag)) {
229                        throw new DatatypeException(
230                        "The variant subtag \u201C" + subtag + "\u201D is deprecated.");                
231                    }
232                    if (!hasGoodPrefix(subtags, i)) {
233                        throw new DatatypeException("Variant lacks required prefix.");                    
234                    }
235                }
236                i++;
237                if (i == subtags.length) {
238                    return;
239                }
240                subtag = subtags[i];
241                len = subtag.length();
242            }
243        }
244    
245        private boolean hasGoodPrefix(String[] subtags, int i) {
246            String variant = subtags[i];
247            int index = Arrays.binarySearch(variants, variant);
248            assert index >= 0;
249            String[][] prefixes = prefixesByVariant[index];
250            if (prefixes.length == 0) {
251                return true;
252            }
253            for (int j = 0; j < prefixes.length; j++) {
254                String[] prefix = prefixes[j];
255                if (prefixMatches(prefix, subtags, i)) {
256                    return true;
257                }
258            }
259            return false;
260        }
261    
262        private boolean prefixMatches(String[] prefix, String[] subtags, int limit) {
263            for (int i = 0; i < prefix.length; i++) {
264                String prefixComponent = prefix[i];
265                if (!subtagsContainPrefixComponent(prefixComponent, subtags, limit)) {
266                    return false;   
267                }
268            }
269            return true;
270        }
271    
272        private boolean subtagsContainPrefixComponent(String prefixComponent, String[] subtags, int limit) {
273            for (int i = 0; i < limit; i++) {
274                String subtag = subtags[i];
275                if (subtag.equals(prefixComponent)) {
276                    return true;
277                }
278            }
279            return false;
280        }
281    
282        private boolean shouldSuppressScript(String language, String script) {
283            int langIndex = Arrays.binarySearch(languages, language);
284            assert langIndex > -1;
285            int scriptIndex = suppressedScriptByLanguage[langIndex];
286            if (scriptIndex < 0) {
287                return false;
288            } else {
289                return scripts[scriptIndex].equals(script);
290            }
291        }
292    
293        private boolean isVariant(String subtag) {
294            return (Arrays.binarySearch(variants, subtag) > -1);
295        }
296    
297        private boolean isRegion(String subtag) {
298            return (Arrays.binarySearch(regions, subtag) > -1) || "aa".equals(subtag)
299                    || ("qm".compareTo(subtag) <= 0 && "qz".compareTo(subtag) >= 0)
300                    || ("xa".compareTo(subtag) <= 0 && "xz".compareTo(subtag) >= 0)
301                    || "zz".equals(subtag);
302        }
303    
304        private boolean isScript(String subtag) {
305            return (Arrays.binarySearch(scripts, subtag) > -1)
306                    || ("qaaa".compareTo(subtag) <= 0 && "qabx".compareTo(subtag) >= 0);
307        }
308    
309        private boolean isLanguage(String subtag) {
310            return (Arrays.binarySearch(languages, subtag) > -1)
311                    || ("qaa".compareTo(subtag) <= 0 && "qtz".compareTo(subtag) >= 0);
312        }
313    
314        private void checkPrivateUse(int i, String[] subtags)
315                throws DatatypeException {
316            int len = subtags.length;
317            i++;
318            if (i == len) {
319                throw new DatatypeException("No subtags in private use sequence.");
320            }
321            while (i < len) {
322                String subtag = subtags[i];
323                if (!isLowerCaseAlphaNumeric(subtag)) {
324                    throw new DatatypeException(
325                            "Bad character in private use subtag.");
326                }
327                i++;
328            }
329        }
330    
331        private final boolean isLowerCaseAlphaNumeric(char c) {
332            return isLowerCaseAlpha(c) || isDigit(c);
333        }
334    
335        private final boolean isLowerCaseAlphaNumeric(String str) {
336            for (int i = 0; i < str.length(); i++) {
337                if (!isLowerCaseAlphaNumeric(str.charAt(i))) {
338                    return false;
339                }
340            }
341            return true;
342        }
343    
344        /**
345         * @param c
346         * @return
347         */
348        private final boolean isDigit(char c) {
349            return (c >= '0' && c <= '9');
350        }
351    
352        private final boolean isDigit(String str) {
353            for (int i = 0; i < str.length(); i++) {
354                if (!isDigit(str.charAt(i))) {
355                    return false;
356                }
357            }
358            return true;
359        }
360    
361        /**
362         * @param c
363         * @return
364         */
365        private final boolean isLowerCaseAlpha(char c) {
366            return (c >= 'a' && c <= 'z');
367        }
368    
369        private final boolean isLowerCaseAlpha(String str) {
370            for (int i = 0; i < str.length(); i++) {
371                if (!isLowerCaseAlpha(str.charAt(i))) {
372                    return false;
373                }
374            }
375            return true;
376        }
377    
378        private boolean isGrandfathered(String literal) {
379            return Arrays.binarySearch(grandfathered, literal) > -1;
380        }
381        
382        private boolean isDeprecated(String subtag) {
383            return Arrays.binarySearch(deprecated, subtag) > -1;
384        }
385    }