001    /*
002     * Copyright (c) 2006, 2007 Henri Sivonen
003     * Copyright (c) 2007 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.htmlparser.impl;
025    
026    import org.xml.sax.ErrorHandler;
027    import org.xml.sax.Locator;
028    import org.xml.sax.SAXException;
029    import org.xml.sax.SAXParseException;
030    
031    import com.ibm.icu.lang.UCharacter;
032    import com.ibm.icu.text.Normalizer;
033    import com.ibm.icu.text.UnicodeSet;
034    
035    /**
036     * @version $Id: NormalizationChecker.java 155 2007-09-13 11:54:32Z hsivonen $
037     * @author hsivonen
038     */
039    public final class NormalizationChecker implements CharacterHandler {
040    
041        private ErrorHandler errorHandler;
042    
043        private Locator locator;
044    
045        /**
046         * A thread-safe set of composing characters as per Charmod Norm.
047         */
048        @SuppressWarnings("deprecation")
049        private static final UnicodeSet COMPOSING_CHARACTERS = (UnicodeSet) new UnicodeSet(
050                "[[:nfc_qc=maybe:][:^ccc=0:]]").freeze();
051    
052        // see http://sourceforge.net/mailarchive/message.php?msg_id=37279908
053    
054        /**
055         * A buffer for holding sequences overlap the SAX buffer boundary.
056         */
057        private char[] buf = new char[128];
058    
059        /**
060         * A holder for the original buffer (for the memory leak prevention 
061         * mechanism).
062         */
063        private char[] bufHolder = null;
064    
065        /**
066         * The current used length of the buffer, i.e. the index of the first slot 
067         * that does not hold current data.
068         */
069        private int pos;
070    
071        /**
072         * Indicates whether the checker the next call to <code>characters()</code> 
073         * is the first call in a run.
074         */
075        private boolean atStartOfRun;
076    
077        /**
078         * Indicates whether the current run has already caused an error.
079         */
080        private boolean alreadyComplainedAboutThisRun;
081    
082        /**
083         * Emit an error. The locator is used.
084         * 
085         * @param message the error message
086         * @throws SAXException if something goes wrong
087         */
088        public void err(String message) throws SAXException {
089            if (errorHandler != null) {
090                SAXParseException spe = new SAXParseException(message, locator);
091                errorHandler.error(spe);
092            }
093        }
094    
095        /**
096         * Returns <code>true</code> if the argument is a composing BMP character 
097         * or a surrogate and <code>false</code> otherwise.
098         * 
099         * @param c a UTF-16 code unit
100         * @return <code>true</code> if the argument is a composing BMP character 
101         * or a surrogate and <code>false</code> otherwise
102         */
103        private static boolean isComposingCharOrSurrogate(char c) {
104            if (UCharacter.isHighSurrogate(c) || UCharacter.isLowSurrogate(c)) {
105                return true;
106            }
107            return isComposingChar(c);
108        }
109    
110        /**
111         * Returns <code>true</code> if the argument is a composing character 
112         * and <code>false</code> otherwise.
113         * 
114         * @param c a Unicode code point
115         * @return <code>true</code> if the argument is a composing character 
116         * <code>false</code> otherwise
117         */
118        private static boolean isComposingChar(int c) {
119            return COMPOSING_CHARACTERS.contains(c);
120        }
121    
122        /**
123         * Constructor with mode selection.
124         * 
125         * @param sourceTextMode whether the source text-related messages 
126         * should be enabled.
127         */
128        public NormalizationChecker(Locator locator) {
129            super();
130            start();
131        }
132    
133        /**
134         * @see nu.validator.htmlparser.impl.CharacterHandler#start()
135         */
136        public void start() {
137            atStartOfRun = true;
138            alreadyComplainedAboutThisRun = false;
139            pos = 0;
140        }
141    
142        /**
143         * @see nu.validator.htmlparser.impl.CharacterHandler#characters(char[], int, int)
144         */
145        public void characters(char[] ch, int start, int length)
146                throws SAXException {
147            if (alreadyComplainedAboutThisRun) {
148                return;
149            }
150            if (atStartOfRun) {
151                char c = ch[start];
152                if (pos == 1) {
153                    // there's a single high surrogate in buf
154                    if (isComposingChar(UCharacter.getCodePoint(buf[0], c))) {
155                        err("Text run starts with a composing character.");
156                    }
157                    atStartOfRun = false;
158                } else {
159                    if (length == 1 && UCharacter.isHighSurrogate(c)) {
160                        buf[0] = c;
161                        pos = 1;
162                        return;
163                    } else {
164                        if (UCharacter.isHighSurrogate(c)) {
165                            if (isComposingChar(UCharacter.getCodePoint(c,
166                                    ch[start + 1]))) {
167                                err("Text run starts with a composing character.");
168                            }
169                        } else {
170                            if (isComposingCharOrSurrogate(c)) {
171                                err("Text run starts with a composing character.");
172                            }
173                        }
174                        atStartOfRun = false;
175                    }
176                }
177            }
178            int i = start;
179            int stop = start + length;
180            if (pos > 0) {
181                // there's stuff in buf
182                while (i < stop && isComposingCharOrSurrogate(ch[i])) {
183                    i++;
184                }
185                appendToBuf(ch, start, i);
186                if (i == stop) {
187                    return;
188                } else {
189                    if (!Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) {
190                        errAboutTextRun();
191                    }
192                    pos = 0;
193                }
194            }
195            if (i < stop) {
196                start = i;
197                i = stop - 1;
198                while (i > start && isComposingCharOrSurrogate(ch[i])) {
199                    i--;
200                }
201                if (i > start) {
202                    if (!Normalizer.isNormalized(ch, start, i, Normalizer.NFC, 0)) {
203                        errAboutTextRun();
204                    }
205                }
206                appendToBuf(ch, i, stop);
207            }
208        }
209    
210        /**
211         * Emits an error stating that the current text run or the source 
212         * text is not in NFC.
213         * 
214         * @throws SAXException if the <code>ErrorHandler</code> throws
215         */
216        private void errAboutTextRun() throws SAXException {
217            err("Source text is not in Unicode Normalization Form C.");
218            alreadyComplainedAboutThisRun = true;
219        }
220    
221        /**
222         * Appends a slice of an UTF-16 code unit array to the internal 
223         * buffer.
224         * 
225         * @param ch the array from which to copy
226         * @param start the index of the first element that is copied
227         * @param end the index of the first element that is not copied
228         */
229        private void appendToBuf(char[] ch, int start, int end) {
230            if (start == end) {
231                return;
232            }
233            int neededBufLen = pos + (end - start);
234            if (neededBufLen > buf.length) {
235                char[] newBuf = new char[neededBufLen];
236                System.arraycopy(buf, 0, newBuf, 0, pos);
237                if (bufHolder == null) {
238                    bufHolder = buf; // keep the original around
239                }
240                buf = newBuf;
241            }
242            System.arraycopy(ch, start, buf, pos, end - start);
243            pos += (end - start);
244        }
245    
246        /**
247         * @see nu.validator.htmlparser.impl.CharacterHandler#end()
248         */
249        public void end() throws SAXException {
250            if (!alreadyComplainedAboutThisRun
251                    && !Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) {
252                errAboutTextRun();
253            }
254            if (bufHolder != null) {
255                // restore the original small buffer to avoid leaking
256                // memory if this checker is recycled
257                buf = bufHolder;
258                bufHolder = null;
259            }
260        }
261    
262        public void setErrorHandler(ErrorHandler errorHandler) {
263            this.errorHandler = errorHandler;
264        }
265    
266    }