001    /*
002     * Copyright (c) 2006, 2007 Henri Sivonen
003     * Copyright (c) 2007 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.htmlparser.extra;
025    
026    import nu.validator.htmlparser.common.CharacterHandler;
027    
028    import org.xml.sax.ErrorHandler;
029    import org.xml.sax.Locator;
030    import org.xml.sax.SAXException;
031    import org.xml.sax.SAXParseException;
032    
033    import com.ibm.icu.lang.UCharacter;
034    import com.ibm.icu.text.Normalizer;
035    import com.ibm.icu.text.UnicodeSet;
036    
037    /**
038     * @version $Id$
039     * @author hsivonen
040     */
041    public final class NormalizationChecker implements CharacterHandler {
042    
043        private ErrorHandler errorHandler;
044    
045        private Locator locator;
046    
047        /**
048         * A thread-safe set of composing characters as per Charmod Norm.
049         */
050        @SuppressWarnings("deprecation")
051        private static final UnicodeSet COMPOSING_CHARACTERS = (UnicodeSet) new UnicodeSet(
052                "[[:nfc_qc=maybe:][:^ccc=0:]]").freeze();
053    
054        // see http://sourceforge.net/mailarchive/message.php?msg_id=37279908
055    
056        /**
057         * A buffer for holding sequences overlap the SAX buffer boundary.
058         */
059        private char[] buf = new char[128];
060    
061        /**
062         * A holder for the original buffer (for the memory leak prevention 
063         * mechanism).
064         */
065        private char[] bufHolder = null;
066    
067        /**
068         * The current used length of the buffer, i.e. the index of the first slot 
069         * that does not hold current data.
070         */
071        private int pos;
072    
073        /**
074         * Indicates whether the checker the next call to <code>characters()</code> 
075         * is the first call in a run.
076         */
077        private boolean atStartOfRun;
078    
079        /**
080         * Indicates whether the current run has already caused an error.
081         */
082        private boolean alreadyComplainedAboutThisRun;
083    
084        /**
085         * Emit an error. The locator is used.
086         * 
087         * @param message the error message
088         * @throws SAXException if something goes wrong
089         */
090        public void err(String message) throws SAXException {
091            if (errorHandler != null) {
092                SAXParseException spe = new SAXParseException(message, locator);
093                errorHandler.error(spe);
094            }
095        }
096    
097        /**
098         * Returns <code>true</code> if the argument is a composing BMP character 
099         * or a surrogate and <code>false</code> otherwise.
100         * 
101         * @param c a UTF-16 code unit
102         * @return <code>true</code> if the argument is a composing BMP character 
103         * or a surrogate and <code>false</code> otherwise
104         */
105        private static boolean isComposingCharOrSurrogate(char c) {
106            if (UCharacter.isHighSurrogate(c) || UCharacter.isLowSurrogate(c)) {
107                return true;
108            }
109            return isComposingChar(c);
110        }
111    
112        /**
113         * Returns <code>true</code> if the argument is a composing character 
114         * and <code>false</code> otherwise.
115         * 
116         * @param c a Unicode code point
117         * @return <code>true</code> if the argument is a composing character 
118         * <code>false</code> otherwise
119         */
120        private static boolean isComposingChar(int c) {
121            return COMPOSING_CHARACTERS.contains(c);
122        }
123    
124        /**
125         * Constructor with mode selection.
126         * 
127         * @param sourceTextMode whether the source text-related messages 
128         * should be enabled.
129         */
130        public NormalizationChecker(Locator locator) {
131            super();
132            start();
133        }
134    
135        /**
136         * @see nu.validator.htmlparser.common.CharacterHandler#start()
137         */
138        public void start() {
139            atStartOfRun = true;
140            alreadyComplainedAboutThisRun = false;
141            pos = 0;
142        }
143    
144        /**
145         * @see nu.validator.htmlparser.common.CharacterHandler#characters(char[], int, int)
146         */
147        public void characters(char[] ch, int start, int length)
148                throws SAXException {
149            if (alreadyComplainedAboutThisRun) {
150                return;
151            }
152            if (atStartOfRun) {
153                char c = ch[start];
154                if (pos == 1) {
155                    // there's a single high surrogate in buf
156                    if (isComposingChar(UCharacter.getCodePoint(buf[0], c))) {
157                        err("Text run starts with a composing character.");
158                    }
159                    atStartOfRun = false;
160                } else {
161                    if (length == 1 && UCharacter.isHighSurrogate(c)) {
162                        buf[0] = c;
163                        pos = 1;
164                        return;
165                    } else {
166                        if (UCharacter.isHighSurrogate(c)) {
167                            if (isComposingChar(UCharacter.getCodePoint(c,
168                                    ch[start + 1]))) {
169                                err("Text run starts with a composing character.");
170                            }
171                        } else {
172                            if (isComposingCharOrSurrogate(c)) {
173                                err("Text run starts with a composing character.");
174                            }
175                        }
176                        atStartOfRun = false;
177                    }
178                }
179            }
180            int i = start;
181            int stop = start + length;
182            if (pos > 0) {
183                // there's stuff in buf
184                while (i < stop && isComposingCharOrSurrogate(ch[i])) {
185                    i++;
186                }
187                appendToBuf(ch, start, i);
188                if (i == stop) {
189                    return;
190                } else {
191                    if (!Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) {
192                        errAboutTextRun();
193                    }
194                    pos = 0;
195                }
196            }
197            if (i < stop) {
198                start = i;
199                i = stop - 1;
200                while (i > start && isComposingCharOrSurrogate(ch[i])) {
201                    i--;
202                }
203                if (i > start) {
204                    if (!Normalizer.isNormalized(ch, start, i, Normalizer.NFC, 0)) {
205                        errAboutTextRun();
206                    }
207                }
208                appendToBuf(ch, i, stop);
209            }
210        }
211    
212        /**
213         * Emits an error stating that the current text run or the source 
214         * text is not in NFC.
215         * 
216         * @throws SAXException if the <code>ErrorHandler</code> throws
217         */
218        private void errAboutTextRun() throws SAXException {
219            err("Source text is not in Unicode Normalization Form C.");
220            alreadyComplainedAboutThisRun = true;
221        }
222    
223        /**
224         * Appends a slice of an UTF-16 code unit array to the internal 
225         * buffer.
226         * 
227         * @param ch the array from which to copy
228         * @param start the index of the first element that is copied
229         * @param end the index of the first element that is not copied
230         */
231        private void appendToBuf(char[] ch, int start, int end) {
232            if (start == end) {
233                return;
234            }
235            int neededBufLen = pos + (end - start);
236            if (neededBufLen > buf.length) {
237                char[] newBuf = new char[neededBufLen];
238                System.arraycopy(buf, 0, newBuf, 0, pos);
239                if (bufHolder == null) {
240                    bufHolder = buf; // keep the original around
241                }
242                buf = newBuf;
243            }
244            System.arraycopy(ch, start, buf, pos, end - start);
245            pos += (end - start);
246        }
247    
248        /**
249         * @see nu.validator.htmlparser.common.CharacterHandler#end()
250         */
251        public void end() throws SAXException {
252            if (!alreadyComplainedAboutThisRun
253                    && !Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) {
254                errAboutTextRun();
255            }
256            if (bufHolder != null) {
257                // restore the original small buffer to avoid leaking
258                // memory if this checker is recycled
259                buf = bufHolder;
260                bufHolder = null;
261            }
262        }
263    
264        public void setErrorHandler(ErrorHandler errorHandler) {
265            this.errorHandler = errorHandler;
266        }
267    
268    }