001 /*
002 * Copyright (c) 2006, 2007 Henri Sivonen
003 * Copyright (c) 2007 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.htmlparser.impl;
025
026 import org.xml.sax.ErrorHandler;
027 import org.xml.sax.Locator;
028 import org.xml.sax.SAXException;
029 import org.xml.sax.SAXParseException;
030
031 import com.ibm.icu.lang.UCharacter;
032 import com.ibm.icu.text.Normalizer;
033 import com.ibm.icu.text.UnicodeSet;
034
035 /**
036 * @version $Id: NormalizationChecker.java 155 2007-09-13 11:54:32Z hsivonen $
037 * @author hsivonen
038 */
039 public final class NormalizationChecker implements CharacterHandler {
040
041 private ErrorHandler errorHandler;
042
043 private Locator locator;
044
045 /**
046 * A thread-safe set of composing characters as per Charmod Norm.
047 */
048 @SuppressWarnings("deprecation")
049 private static final UnicodeSet COMPOSING_CHARACTERS = (UnicodeSet) new UnicodeSet(
050 "[[:nfc_qc=maybe:][:^ccc=0:]]").freeze();
051
052 // see http://sourceforge.net/mailarchive/message.php?msg_id=37279908
053
054 /**
055 * A buffer for holding sequences overlap the SAX buffer boundary.
056 */
057 private char[] buf = new char[128];
058
059 /**
060 * A holder for the original buffer (for the memory leak prevention
061 * mechanism).
062 */
063 private char[] bufHolder = null;
064
065 /**
066 * The current used length of the buffer, i.e. the index of the first slot
067 * that does not hold current data.
068 */
069 private int pos;
070
071 /**
072 * Indicates whether the checker the next call to <code>characters()</code>
073 * is the first call in a run.
074 */
075 private boolean atStartOfRun;
076
077 /**
078 * Indicates whether the current run has already caused an error.
079 */
080 private boolean alreadyComplainedAboutThisRun;
081
082 /**
083 * Emit an error. The locator is used.
084 *
085 * @param message the error message
086 * @throws SAXException if something goes wrong
087 */
088 public void err(String message) throws SAXException {
089 if (errorHandler != null) {
090 SAXParseException spe = new SAXParseException(message, locator);
091 errorHandler.error(spe);
092 }
093 }
094
095 /**
096 * Returns <code>true</code> if the argument is a composing BMP character
097 * or a surrogate and <code>false</code> otherwise.
098 *
099 * @param c a UTF-16 code unit
100 * @return <code>true</code> if the argument is a composing BMP character
101 * or a surrogate and <code>false</code> otherwise
102 */
103 private static boolean isComposingCharOrSurrogate(char c) {
104 if (UCharacter.isHighSurrogate(c) || UCharacter.isLowSurrogate(c)) {
105 return true;
106 }
107 return isComposingChar(c);
108 }
109
110 /**
111 * Returns <code>true</code> if the argument is a composing character
112 * and <code>false</code> otherwise.
113 *
114 * @param c a Unicode code point
115 * @return <code>true</code> if the argument is a composing character
116 * <code>false</code> otherwise
117 */
118 private static boolean isComposingChar(int c) {
119 return COMPOSING_CHARACTERS.contains(c);
120 }
121
122 /**
123 * Constructor with mode selection.
124 *
125 * @param sourceTextMode whether the source text-related messages
126 * should be enabled.
127 */
128 public NormalizationChecker(Locator locator) {
129 super();
130 start();
131 }
132
133 /**
134 * @see nu.validator.htmlparser.impl.CharacterHandler#start()
135 */
136 public void start() {
137 atStartOfRun = true;
138 alreadyComplainedAboutThisRun = false;
139 pos = 0;
140 }
141
142 /**
143 * @see nu.validator.htmlparser.impl.CharacterHandler#characters(char[], int, int)
144 */
145 public void characters(char[] ch, int start, int length)
146 throws SAXException {
147 if (alreadyComplainedAboutThisRun) {
148 return;
149 }
150 if (atStartOfRun) {
151 char c = ch[start];
152 if (pos == 1) {
153 // there's a single high surrogate in buf
154 if (isComposingChar(UCharacter.getCodePoint(buf[0], c))) {
155 err("Text run starts with a composing character.");
156 }
157 atStartOfRun = false;
158 } else {
159 if (length == 1 && UCharacter.isHighSurrogate(c)) {
160 buf[0] = c;
161 pos = 1;
162 return;
163 } else {
164 if (UCharacter.isHighSurrogate(c)) {
165 if (isComposingChar(UCharacter.getCodePoint(c,
166 ch[start + 1]))) {
167 err("Text run starts with a composing character.");
168 }
169 } else {
170 if (isComposingCharOrSurrogate(c)) {
171 err("Text run starts with a composing character.");
172 }
173 }
174 atStartOfRun = false;
175 }
176 }
177 }
178 int i = start;
179 int stop = start + length;
180 if (pos > 0) {
181 // there's stuff in buf
182 while (i < stop && isComposingCharOrSurrogate(ch[i])) {
183 i++;
184 }
185 appendToBuf(ch, start, i);
186 if (i == stop) {
187 return;
188 } else {
189 if (!Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) {
190 errAboutTextRun();
191 }
192 pos = 0;
193 }
194 }
195 if (i < stop) {
196 start = i;
197 i = stop - 1;
198 while (i > start && isComposingCharOrSurrogate(ch[i])) {
199 i--;
200 }
201 if (i > start) {
202 if (!Normalizer.isNormalized(ch, start, i, Normalizer.NFC, 0)) {
203 errAboutTextRun();
204 }
205 }
206 appendToBuf(ch, i, stop);
207 }
208 }
209
210 /**
211 * Emits an error stating that the current text run or the source
212 * text is not in NFC.
213 *
214 * @throws SAXException if the <code>ErrorHandler</code> throws
215 */
216 private void errAboutTextRun() throws SAXException {
217 err("Source text is not in Unicode Normalization Form C.");
218 alreadyComplainedAboutThisRun = true;
219 }
220
221 /**
222 * Appends a slice of an UTF-16 code unit array to the internal
223 * buffer.
224 *
225 * @param ch the array from which to copy
226 * @param start the index of the first element that is copied
227 * @param end the index of the first element that is not copied
228 */
229 private void appendToBuf(char[] ch, int start, int end) {
230 if (start == end) {
231 return;
232 }
233 int neededBufLen = pos + (end - start);
234 if (neededBufLen > buf.length) {
235 char[] newBuf = new char[neededBufLen];
236 System.arraycopy(buf, 0, newBuf, 0, pos);
237 if (bufHolder == null) {
238 bufHolder = buf; // keep the original around
239 }
240 buf = newBuf;
241 }
242 System.arraycopy(ch, start, buf, pos, end - start);
243 pos += (end - start);
244 }
245
246 /**
247 * @see nu.validator.htmlparser.impl.CharacterHandler#end()
248 */
249 public void end() throws SAXException {
250 if (!alreadyComplainedAboutThisRun
251 && !Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) {
252 errAboutTextRun();
253 }
254 if (bufHolder != null) {
255 // restore the original small buffer to avoid leaking
256 // memory if this checker is recycled
257 buf = bufHolder;
258 bufHolder = null;
259 }
260 }
261
262 public void setErrorHandler(ErrorHandler errorHandler) {
263 this.errorHandler = errorHandler;
264 }
265
266 }