001 /*
002 * Copyright (c) 2006, 2007 Henri Sivonen
003 * Copyright (c) 2007 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.htmlparser.extra;
025
026 import nu.validator.htmlparser.common.CharacterHandler;
027
028 import org.xml.sax.ErrorHandler;
029 import org.xml.sax.Locator;
030 import org.xml.sax.SAXException;
031 import org.xml.sax.SAXParseException;
032
033 import com.ibm.icu.lang.UCharacter;
034 import com.ibm.icu.text.Normalizer;
035 import com.ibm.icu.text.UnicodeSet;
036
037 /**
038 * @version $Id$
039 * @author hsivonen
040 */
041 public final class NormalizationChecker implements CharacterHandler {
042
043 private ErrorHandler errorHandler;
044
045 private Locator locator;
046
047 /**
048 * A thread-safe set of composing characters as per Charmod Norm.
049 */
050 @SuppressWarnings("deprecation")
051 private static final UnicodeSet COMPOSING_CHARACTERS = (UnicodeSet) new UnicodeSet(
052 "[[:nfc_qc=maybe:][:^ccc=0:]]").freeze();
053
054 // see http://sourceforge.net/mailarchive/message.php?msg_id=37279908
055
056 /**
057 * A buffer for holding sequences overlap the SAX buffer boundary.
058 */
059 private char[] buf = new char[128];
060
061 /**
062 * A holder for the original buffer (for the memory leak prevention
063 * mechanism).
064 */
065 private char[] bufHolder = null;
066
067 /**
068 * The current used length of the buffer, i.e. the index of the first slot
069 * that does not hold current data.
070 */
071 private int pos;
072
073 /**
074 * Indicates whether the checker the next call to <code>characters()</code>
075 * is the first call in a run.
076 */
077 private boolean atStartOfRun;
078
079 /**
080 * Indicates whether the current run has already caused an error.
081 */
082 private boolean alreadyComplainedAboutThisRun;
083
084 /**
085 * Emit an error. The locator is used.
086 *
087 * @param message the error message
088 * @throws SAXException if something goes wrong
089 */
090 public void err(String message) throws SAXException {
091 if (errorHandler != null) {
092 SAXParseException spe = new SAXParseException(message, locator);
093 errorHandler.error(spe);
094 }
095 }
096
097 /**
098 * Returns <code>true</code> if the argument is a composing BMP character
099 * or a surrogate and <code>false</code> otherwise.
100 *
101 * @param c a UTF-16 code unit
102 * @return <code>true</code> if the argument is a composing BMP character
103 * or a surrogate and <code>false</code> otherwise
104 */
105 private static boolean isComposingCharOrSurrogate(char c) {
106 if (UCharacter.isHighSurrogate(c) || UCharacter.isLowSurrogate(c)) {
107 return true;
108 }
109 return isComposingChar(c);
110 }
111
112 /**
113 * Returns <code>true</code> if the argument is a composing character
114 * and <code>false</code> otherwise.
115 *
116 * @param c a Unicode code point
117 * @return <code>true</code> if the argument is a composing character
118 * <code>false</code> otherwise
119 */
120 private static boolean isComposingChar(int c) {
121 return COMPOSING_CHARACTERS.contains(c);
122 }
123
124 /**
125 * Constructor with mode selection.
126 *
127 * @param sourceTextMode whether the source text-related messages
128 * should be enabled.
129 */
130 public NormalizationChecker(Locator locator) {
131 super();
132 start();
133 }
134
135 /**
136 * @see nu.validator.htmlparser.common.CharacterHandler#start()
137 */
138 public void start() {
139 atStartOfRun = true;
140 alreadyComplainedAboutThisRun = false;
141 pos = 0;
142 }
143
144 /**
145 * @see nu.validator.htmlparser.common.CharacterHandler#characters(char[], int, int)
146 */
147 public void characters(char[] ch, int start, int length)
148 throws SAXException {
149 if (alreadyComplainedAboutThisRun) {
150 return;
151 }
152 if (atStartOfRun) {
153 char c = ch[start];
154 if (pos == 1) {
155 // there's a single high surrogate in buf
156 if (isComposingChar(UCharacter.getCodePoint(buf[0], c))) {
157 err("Text run starts with a composing character.");
158 }
159 atStartOfRun = false;
160 } else {
161 if (length == 1 && UCharacter.isHighSurrogate(c)) {
162 buf[0] = c;
163 pos = 1;
164 return;
165 } else {
166 if (UCharacter.isHighSurrogate(c)) {
167 if (isComposingChar(UCharacter.getCodePoint(c,
168 ch[start + 1]))) {
169 err("Text run starts with a composing character.");
170 }
171 } else {
172 if (isComposingCharOrSurrogate(c)) {
173 err("Text run starts with a composing character.");
174 }
175 }
176 atStartOfRun = false;
177 }
178 }
179 }
180 int i = start;
181 int stop = start + length;
182 if (pos > 0) {
183 // there's stuff in buf
184 while (i < stop && isComposingCharOrSurrogate(ch[i])) {
185 i++;
186 }
187 appendToBuf(ch, start, i);
188 if (i == stop) {
189 return;
190 } else {
191 if (!Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) {
192 errAboutTextRun();
193 }
194 pos = 0;
195 }
196 }
197 if (i < stop) {
198 start = i;
199 i = stop - 1;
200 while (i > start && isComposingCharOrSurrogate(ch[i])) {
201 i--;
202 }
203 if (i > start) {
204 if (!Normalizer.isNormalized(ch, start, i, Normalizer.NFC, 0)) {
205 errAboutTextRun();
206 }
207 }
208 appendToBuf(ch, i, stop);
209 }
210 }
211
212 /**
213 * Emits an error stating that the current text run or the source
214 * text is not in NFC.
215 *
216 * @throws SAXException if the <code>ErrorHandler</code> throws
217 */
218 private void errAboutTextRun() throws SAXException {
219 err("Source text is not in Unicode Normalization Form C.");
220 alreadyComplainedAboutThisRun = true;
221 }
222
223 /**
224 * Appends a slice of an UTF-16 code unit array to the internal
225 * buffer.
226 *
227 * @param ch the array from which to copy
228 * @param start the index of the first element that is copied
229 * @param end the index of the first element that is not copied
230 */
231 private void appendToBuf(char[] ch, int start, int end) {
232 if (start == end) {
233 return;
234 }
235 int neededBufLen = pos + (end - start);
236 if (neededBufLen > buf.length) {
237 char[] newBuf = new char[neededBufLen];
238 System.arraycopy(buf, 0, newBuf, 0, pos);
239 if (bufHolder == null) {
240 bufHolder = buf; // keep the original around
241 }
242 buf = newBuf;
243 }
244 System.arraycopy(ch, start, buf, pos, end - start);
245 pos += (end - start);
246 }
247
248 /**
249 * @see nu.validator.htmlparser.common.CharacterHandler#end()
250 */
251 public void end() throws SAXException {
252 if (!alreadyComplainedAboutThisRun
253 && !Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) {
254 errAboutTextRun();
255 }
256 if (bufHolder != null) {
257 // restore the original small buffer to avoid leaking
258 // memory if this checker is recycled
259 buf = bufHolder;
260 bufHolder = null;
261 }
262 }
263
264 public void setErrorHandler(ErrorHandler errorHandler) {
265 this.errorHandler = errorHandler;
266 }
267
268 }