001 /* 002 * Copyright (c) 2006, 2007 Henri Sivonen 003 * Copyright (c) 2007 Mozilla Foundation 004 * 005 * Permission is hereby granted, free of charge, to any person obtaining a 006 * copy of this software and associated documentation files (the "Software"), 007 * to deal in the Software without restriction, including without limitation 008 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 009 * and/or sell copies of the Software, and to permit persons to whom the 010 * Software is furnished to do so, subject to the following conditions: 011 * 012 * The above copyright notice and this permission notice shall be included in 013 * all copies or substantial portions of the Software. 014 * 015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 021 * DEALINGS IN THE SOFTWARE. 022 */ 023 024 package nu.validator.htmlparser.impl; 025 026 import org.xml.sax.ErrorHandler; 027 import org.xml.sax.Locator; 028 import org.xml.sax.SAXException; 029 import org.xml.sax.SAXParseException; 030 031 import com.ibm.icu.lang.UCharacter; 032 import com.ibm.icu.text.Normalizer; 033 import com.ibm.icu.text.UnicodeSet; 034 035 /** 036 * @version $Id: NormalizationChecker.java 155 2007-09-13 11:54:32Z hsivonen $ 037 * @author hsivonen 038 */ 039 public final class NormalizationChecker implements CharacterHandler { 040 041 private ErrorHandler errorHandler; 042 043 private Locator locator; 044 045 /** 046 * A thread-safe set of composing characters as per Charmod Norm. 047 */ 048 @SuppressWarnings("deprecation") 049 private static final UnicodeSet COMPOSING_CHARACTERS = (UnicodeSet) new UnicodeSet( 050 "[[:nfc_qc=maybe:][:^ccc=0:]]").freeze(); 051 052 // see http://sourceforge.net/mailarchive/message.php?msg_id=37279908 053 054 /** 055 * A buffer for holding sequences overlap the SAX buffer boundary. 056 */ 057 private char[] buf = new char[128]; 058 059 /** 060 * A holder for the original buffer (for the memory leak prevention 061 * mechanism). 062 */ 063 private char[] bufHolder = null; 064 065 /** 066 * The current used length of the buffer, i.e. the index of the first slot 067 * that does not hold current data. 068 */ 069 private int pos; 070 071 /** 072 * Indicates whether the checker the next call to <code>characters()</code> 073 * is the first call in a run. 074 */ 075 private boolean atStartOfRun; 076 077 /** 078 * Indicates whether the current run has already caused an error. 079 */ 080 private boolean alreadyComplainedAboutThisRun; 081 082 /** 083 * Emit an error. The locator is used. 084 * 085 * @param message the error message 086 * @throws SAXException if something goes wrong 087 */ 088 public void err(String message) throws SAXException { 089 if (errorHandler != null) { 090 SAXParseException spe = new SAXParseException(message, locator); 091 errorHandler.error(spe); 092 } 093 } 094 095 /** 096 * Returns <code>true</code> if the argument is a composing BMP character 097 * or a surrogate and <code>false</code> otherwise. 098 * 099 * @param c a UTF-16 code unit 100 * @return <code>true</code> if the argument is a composing BMP character 101 * or a surrogate and <code>false</code> otherwise 102 */ 103 private static boolean isComposingCharOrSurrogate(char c) { 104 if (UCharacter.isHighSurrogate(c) || UCharacter.isLowSurrogate(c)) { 105 return true; 106 } 107 return isComposingChar(c); 108 } 109 110 /** 111 * Returns <code>true</code> if the argument is a composing character 112 * and <code>false</code> otherwise. 113 * 114 * @param c a Unicode code point 115 * @return <code>true</code> if the argument is a composing character 116 * <code>false</code> otherwise 117 */ 118 private static boolean isComposingChar(int c) { 119 return COMPOSING_CHARACTERS.contains(c); 120 } 121 122 /** 123 * Constructor with mode selection. 124 * 125 * @param sourceTextMode whether the source text-related messages 126 * should be enabled. 127 */ 128 public NormalizationChecker(Locator locator) { 129 super(); 130 start(); 131 } 132 133 /** 134 * @see nu.validator.htmlparser.impl.CharacterHandler#start() 135 */ 136 public void start() { 137 atStartOfRun = true; 138 alreadyComplainedAboutThisRun = false; 139 pos = 0; 140 } 141 142 /** 143 * @see nu.validator.htmlparser.impl.CharacterHandler#characters(char[], int, int) 144 */ 145 public void characters(char[] ch, int start, int length) 146 throws SAXException { 147 if (alreadyComplainedAboutThisRun) { 148 return; 149 } 150 if (atStartOfRun) { 151 char c = ch[start]; 152 if (pos == 1) { 153 // there's a single high surrogate in buf 154 if (isComposingChar(UCharacter.getCodePoint(buf[0], c))) { 155 err("Text run starts with a composing character."); 156 } 157 atStartOfRun = false; 158 } else { 159 if (length == 1 && UCharacter.isHighSurrogate(c)) { 160 buf[0] = c; 161 pos = 1; 162 return; 163 } else { 164 if (UCharacter.isHighSurrogate(c)) { 165 if (isComposingChar(UCharacter.getCodePoint(c, 166 ch[start + 1]))) { 167 err("Text run starts with a composing character."); 168 } 169 } else { 170 if (isComposingCharOrSurrogate(c)) { 171 err("Text run starts with a composing character."); 172 } 173 } 174 atStartOfRun = false; 175 } 176 } 177 } 178 int i = start; 179 int stop = start + length; 180 if (pos > 0) { 181 // there's stuff in buf 182 while (i < stop && isComposingCharOrSurrogate(ch[i])) { 183 i++; 184 } 185 appendToBuf(ch, start, i); 186 if (i == stop) { 187 return; 188 } else { 189 if (!Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) { 190 errAboutTextRun(); 191 } 192 pos = 0; 193 } 194 } 195 if (i < stop) { 196 start = i; 197 i = stop - 1; 198 while (i > start && isComposingCharOrSurrogate(ch[i])) { 199 i--; 200 } 201 if (i > start) { 202 if (!Normalizer.isNormalized(ch, start, i, Normalizer.NFC, 0)) { 203 errAboutTextRun(); 204 } 205 } 206 appendToBuf(ch, i, stop); 207 } 208 } 209 210 /** 211 * Emits an error stating that the current text run or the source 212 * text is not in NFC. 213 * 214 * @throws SAXException if the <code>ErrorHandler</code> throws 215 */ 216 private void errAboutTextRun() throws SAXException { 217 err("Source text is not in Unicode Normalization Form C."); 218 alreadyComplainedAboutThisRun = true; 219 } 220 221 /** 222 * Appends a slice of an UTF-16 code unit array to the internal 223 * buffer. 224 * 225 * @param ch the array from which to copy 226 * @param start the index of the first element that is copied 227 * @param end the index of the first element that is not copied 228 */ 229 private void appendToBuf(char[] ch, int start, int end) { 230 if (start == end) { 231 return; 232 } 233 int neededBufLen = pos + (end - start); 234 if (neededBufLen > buf.length) { 235 char[] newBuf = new char[neededBufLen]; 236 System.arraycopy(buf, 0, newBuf, 0, pos); 237 if (bufHolder == null) { 238 bufHolder = buf; // keep the original around 239 } 240 buf = newBuf; 241 } 242 System.arraycopy(ch, start, buf, pos, end - start); 243 pos += (end - start); 244 } 245 246 /** 247 * @see nu.validator.htmlparser.impl.CharacterHandler#end() 248 */ 249 public void end() throws SAXException { 250 if (!alreadyComplainedAboutThisRun 251 && !Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) { 252 errAboutTextRun(); 253 } 254 if (bufHolder != null) { 255 // restore the original small buffer to avoid leaking 256 // memory if this checker is recycled 257 buf = bufHolder; 258 bufHolder = null; 259 } 260 } 261 262 public void setErrorHandler(ErrorHandler errorHandler) { 263 this.errorHandler = errorHandler; 264 } 265 266 }