001 /* 002 * Copyright (c) 2006, 2007 Henri Sivonen 003 * Copyright (c) 2007 Mozilla Foundation 004 * 005 * Permission is hereby granted, free of charge, to any person obtaining a 006 * copy of this software and associated documentation files (the "Software"), 007 * to deal in the Software without restriction, including without limitation 008 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 009 * and/or sell copies of the Software, and to permit persons to whom the 010 * Software is furnished to do so, subject to the following conditions: 011 * 012 * The above copyright notice and this permission notice shall be included in 013 * all copies or substantial portions of the Software. 014 * 015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 021 * DEALINGS IN THE SOFTWARE. 022 */ 023 024 package nu.validator.htmlparser.extra; 025 026 import nu.validator.htmlparser.common.CharacterHandler; 027 028 import org.xml.sax.ErrorHandler; 029 import org.xml.sax.Locator; 030 import org.xml.sax.SAXException; 031 import org.xml.sax.SAXParseException; 032 033 import com.ibm.icu.lang.UCharacter; 034 import com.ibm.icu.text.Normalizer; 035 import com.ibm.icu.text.UnicodeSet; 036 037 /** 038 * @version $Id$ 039 * @author hsivonen 040 */ 041 public final class NormalizationChecker implements CharacterHandler { 042 043 private ErrorHandler errorHandler; 044 045 private Locator locator; 046 047 /** 048 * A thread-safe set of composing characters as per Charmod Norm. 049 */ 050 @SuppressWarnings("deprecation") 051 private static final UnicodeSet COMPOSING_CHARACTERS = (UnicodeSet) new UnicodeSet( 052 "[[:nfc_qc=maybe:][:^ccc=0:]]").freeze(); 053 054 // see http://sourceforge.net/mailarchive/message.php?msg_id=37279908 055 056 /** 057 * A buffer for holding sequences overlap the SAX buffer boundary. 058 */ 059 private char[] buf = new char[128]; 060 061 /** 062 * A holder for the original buffer (for the memory leak prevention 063 * mechanism). 064 */ 065 private char[] bufHolder = null; 066 067 /** 068 * The current used length of the buffer, i.e. the index of the first slot 069 * that does not hold current data. 070 */ 071 private int pos; 072 073 /** 074 * Indicates whether the checker the next call to <code>characters()</code> 075 * is the first call in a run. 076 */ 077 private boolean atStartOfRun; 078 079 /** 080 * Indicates whether the current run has already caused an error. 081 */ 082 private boolean alreadyComplainedAboutThisRun; 083 084 /** 085 * Emit an error. The locator is used. 086 * 087 * @param message the error message 088 * @throws SAXException if something goes wrong 089 */ 090 public void err(String message) throws SAXException { 091 if (errorHandler != null) { 092 SAXParseException spe = new SAXParseException(message, locator); 093 errorHandler.error(spe); 094 } 095 } 096 097 /** 098 * Returns <code>true</code> if the argument is a composing BMP character 099 * or a surrogate and <code>false</code> otherwise. 100 * 101 * @param c a UTF-16 code unit 102 * @return <code>true</code> if the argument is a composing BMP character 103 * or a surrogate and <code>false</code> otherwise 104 */ 105 private static boolean isComposingCharOrSurrogate(char c) { 106 if (UCharacter.isHighSurrogate(c) || UCharacter.isLowSurrogate(c)) { 107 return true; 108 } 109 return isComposingChar(c); 110 } 111 112 /** 113 * Returns <code>true</code> if the argument is a composing character 114 * and <code>false</code> otherwise. 115 * 116 * @param c a Unicode code point 117 * @return <code>true</code> if the argument is a composing character 118 * <code>false</code> otherwise 119 */ 120 private static boolean isComposingChar(int c) { 121 return COMPOSING_CHARACTERS.contains(c); 122 } 123 124 /** 125 * Constructor with mode selection. 126 * 127 * @param sourceTextMode whether the source text-related messages 128 * should be enabled. 129 */ 130 public NormalizationChecker(Locator locator) { 131 super(); 132 start(); 133 } 134 135 /** 136 * @see nu.validator.htmlparser.common.CharacterHandler#start() 137 */ 138 public void start() { 139 atStartOfRun = true; 140 alreadyComplainedAboutThisRun = false; 141 pos = 0; 142 } 143 144 /** 145 * @see nu.validator.htmlparser.common.CharacterHandler#characters(char[], int, int) 146 */ 147 public void characters(char[] ch, int start, int length) 148 throws SAXException { 149 if (alreadyComplainedAboutThisRun) { 150 return; 151 } 152 if (atStartOfRun) { 153 char c = ch[start]; 154 if (pos == 1) { 155 // there's a single high surrogate in buf 156 if (isComposingChar(UCharacter.getCodePoint(buf[0], c))) { 157 err("Text run starts with a composing character."); 158 } 159 atStartOfRun = false; 160 } else { 161 if (length == 1 && UCharacter.isHighSurrogate(c)) { 162 buf[0] = c; 163 pos = 1; 164 return; 165 } else { 166 if (UCharacter.isHighSurrogate(c)) { 167 if (isComposingChar(UCharacter.getCodePoint(c, 168 ch[start + 1]))) { 169 err("Text run starts with a composing character."); 170 } 171 } else { 172 if (isComposingCharOrSurrogate(c)) { 173 err("Text run starts with a composing character."); 174 } 175 } 176 atStartOfRun = false; 177 } 178 } 179 } 180 int i = start; 181 int stop = start + length; 182 if (pos > 0) { 183 // there's stuff in buf 184 while (i < stop && isComposingCharOrSurrogate(ch[i])) { 185 i++; 186 } 187 appendToBuf(ch, start, i); 188 if (i == stop) { 189 return; 190 } else { 191 if (!Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) { 192 errAboutTextRun(); 193 } 194 pos = 0; 195 } 196 } 197 if (i < stop) { 198 start = i; 199 i = stop - 1; 200 while (i > start && isComposingCharOrSurrogate(ch[i])) { 201 i--; 202 } 203 if (i > start) { 204 if (!Normalizer.isNormalized(ch, start, i, Normalizer.NFC, 0)) { 205 errAboutTextRun(); 206 } 207 } 208 appendToBuf(ch, i, stop); 209 } 210 } 211 212 /** 213 * Emits an error stating that the current text run or the source 214 * text is not in NFC. 215 * 216 * @throws SAXException if the <code>ErrorHandler</code> throws 217 */ 218 private void errAboutTextRun() throws SAXException { 219 err("Source text is not in Unicode Normalization Form C."); 220 alreadyComplainedAboutThisRun = true; 221 } 222 223 /** 224 * Appends a slice of an UTF-16 code unit array to the internal 225 * buffer. 226 * 227 * @param ch the array from which to copy 228 * @param start the index of the first element that is copied 229 * @param end the index of the first element that is not copied 230 */ 231 private void appendToBuf(char[] ch, int start, int end) { 232 if (start == end) { 233 return; 234 } 235 int neededBufLen = pos + (end - start); 236 if (neededBufLen > buf.length) { 237 char[] newBuf = new char[neededBufLen]; 238 System.arraycopy(buf, 0, newBuf, 0, pos); 239 if (bufHolder == null) { 240 bufHolder = buf; // keep the original around 241 } 242 buf = newBuf; 243 } 244 System.arraycopy(ch, start, buf, pos, end - start); 245 pos += (end - start); 246 } 247 248 /** 249 * @see nu.validator.htmlparser.common.CharacterHandler#end() 250 */ 251 public void end() throws SAXException { 252 if (!alreadyComplainedAboutThisRun 253 && !Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) { 254 errAboutTextRun(); 255 } 256 if (bufHolder != null) { 257 // restore the original small buffer to avoid leaking 258 // memory if this checker is recycled 259 buf = bufHolder; 260 bufHolder = null; 261 } 262 } 263 264 public void setErrorHandler(ErrorHandler errorHandler) { 265 this.errorHandler = errorHandler; 266 } 267 268 }