001 /* 002 * Copyright (c) 2007 Henri Sivonen 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package nu.validator.htmlparser.io; 024 025 import java.io.IOException; 026 import java.io.InputStream; 027 import java.io.Reader; 028 import java.nio.ByteBuffer; 029 import java.nio.CharBuffer; 030 import java.nio.charset.Charset; 031 import java.nio.charset.CharsetDecoder; 032 import java.nio.charset.CoderResult; 033 import java.nio.charset.CodingErrorAction; 034 035 import nu.validator.htmlparser.common.ByteReadable; 036 import nu.validator.htmlparser.common.Heuristics; 037 import nu.validator.htmlparser.common.XmlViolationPolicy; 038 import nu.validator.htmlparser.extra.ChardetSniffer; 039 import nu.validator.htmlparser.extra.IcuDetectorSniffer; 040 import nu.validator.htmlparser.impl.Tokenizer; 041 042 import org.xml.sax.ErrorHandler; 043 import org.xml.sax.Locator; 044 import org.xml.sax.SAXException; 045 import org.xml.sax.SAXParseException; 046 047 /** 048 * Be very careful with this class. It is not a general-purpose subclass of of 049 * <code>Reader</code>. Instead, it is the minimal implementation that does 050 * what <code>Tokenizer</code> needs while being an instance of 051 * <code>Reader</code>. 052 * 053 * The only reason why this is a public class is that it needs to be visible to 054 * test code in another package. 055 * 056 * @version $Id$ 057 * @author hsivonen 058 */ 059 public final class HtmlInputStreamReader extends Reader implements 060 ByteReadable, Locator { 061 062 private static final int SNIFFING_LIMIT = 1024; 063 064 private final InputStream inputStream; 065 066 private final ErrorHandler errorHandler; 067 068 private final Tokenizer tokenizer; 069 070 private final Driver driver; 071 072 private CharsetDecoder decoder = null; 073 074 private boolean sniffing = true; 075 076 private int limit = 0; 077 078 private int position = 0; 079 080 private int bytesRead = 0; 081 082 private boolean eofSeen = false; 083 084 private boolean shouldReadBytes = false; 085 086 private boolean charsetBoundaryPassed = false; 087 088 private final byte[] byteArray = new byte[4096]; // Length must be >= 089 090 // SNIFFING_LIMIT 091 092 private final ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray); 093 094 private boolean needToNotifyTokenizer = false; 095 096 private boolean flushing = false; 097 098 private int line = -1; 099 100 private int col = -1; 101 102 private int lineColPos; 103 104 private boolean hasPendingReplacementCharacter = false; 105 106 private boolean nextCharOnNewLine; 107 108 private boolean prevWasCR; 109 110 /** 111 * @param inputStream 112 * @param errorHandler 113 * @param locator 114 * @throws IOException 115 * @throws SAXException 116 */ 117 public HtmlInputStreamReader(InputStream inputStream, 118 ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver, 119 Heuristics heuristics) throws SAXException, IOException { 120 this.inputStream = inputStream; 121 this.errorHandler = errorHandler; 122 this.tokenizer = tokenizer; 123 this.driver = driver; 124 this.sniffing = true; 125 Encoding encoding = (new BomSniffer(this)).sniff(); 126 if (encoding == null) { 127 position = 0; 128 encoding = (new MetaSniffer(errorHandler, this)).sniff(this); 129 if (encoding == null 130 && (heuristics == Heuristics.CHARDET || heuristics == Heuristics.ALL)) { 131 encoding = (new ChardetSniffer(byteArray, limit)).sniff(); 132 } 133 if (encoding == null 134 && (heuristics == Heuristics.ICU || heuristics == Heuristics.ALL)) { 135 position = 0; 136 encoding = (new IcuDetectorSniffer(this)).sniff(); 137 } 138 sniffing = false; 139 if (encoding == null) { 140 encoding = Encoding.WINDOWS1252; 141 } 142 if (driver != null) { 143 driver.setEncoding(encoding, Confidence.TENTATIVE); 144 } 145 } else { 146 if (encoding == Encoding.UTF8) { 147 if (driver != null) { 148 driver.setEncoding(Encoding.UTF8, Confidence.CERTAIN); 149 } 150 } else { 151 if (driver != null) { 152 driver.setEncoding(Encoding.UTF16, Confidence.CERTAIN); 153 } 154 } 155 } 156 this.decoder = encoding.newDecoder(); 157 sniffing = false; 158 position = 0; 159 bytesRead = 0; 160 byteBuffer.position(position); 161 byteBuffer.limit(limit); 162 initDecoder(); 163 } 164 165 /** 166 * 167 */ 168 private void initDecoder() { 169 this.decoder.onMalformedInput(CodingErrorAction.REPORT); 170 this.decoder.onUnmappableCharacter(CodingErrorAction.REPORT); 171 } 172 173 public HtmlInputStreamReader(InputStream inputStream, 174 ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver, 175 Encoding encoding) throws SAXException, IOException { 176 this.inputStream = inputStream; 177 this.errorHandler = errorHandler; 178 this.tokenizer = tokenizer; 179 this.driver = driver; 180 this.decoder = encoding.newDecoder(); 181 this.sniffing = false; 182 position = 0; 183 bytesRead = 0; 184 byteBuffer.position(0); 185 byteBuffer.limit(0); 186 shouldReadBytes = true; 187 initDecoder(); 188 } 189 190 @Override public void close() throws IOException { 191 inputStream.close(); 192 } 193 194 @Override public int read(char[] charArray) throws IOException { 195 lineColPos = 0; 196 assert !sniffing; 197 assert charArray.length >= 2; 198 if (needToNotifyTokenizer) { 199 if (driver != null) { 200 driver.notifyAboutMetaBoundary(); 201 } 202 needToNotifyTokenizer = false; 203 } 204 CharBuffer charBuffer = CharBuffer.wrap(charArray); 205 charBuffer.limit(charArray.length); 206 charBuffer.position(0); 207 if (flushing) { 208 decoder.flush(charBuffer); 209 // return -1 if zero 210 int cPos = charBuffer.position(); 211 return cPos == 0 ? -1 : cPos; 212 } 213 if (hasPendingReplacementCharacter) { 214 charBuffer.put('\uFFFD'); 215 hasPendingReplacementCharacter = false; 216 } 217 for (;;) { 218 if (shouldReadBytes) { 219 int oldLimit = byteBuffer.limit(); 220 int readLen; 221 if (charsetBoundaryPassed) { 222 readLen = byteArray.length - oldLimit; 223 } else { 224 readLen = SNIFFING_LIMIT - oldLimit; 225 } 226 int num = inputStream.read(byteArray, oldLimit, readLen); 227 if (num == -1) { 228 eofSeen = true; 229 inputStream.close(); 230 } else { 231 byteBuffer.position(0); 232 byteBuffer.limit(oldLimit + num); 233 } 234 shouldReadBytes = false; 235 } 236 boolean finalDecode = false; 237 for (;;) { 238 int oldBytePos = byteBuffer.position(); 239 CoderResult cr = decoder.decode(byteBuffer, charBuffer, 240 finalDecode); 241 bytesRead += byteBuffer.position() - oldBytePos; 242 if (cr == CoderResult.OVERFLOW) { 243 // Decoder will remember surrogates 244 return charBuffer.position(); 245 } else if (cr == CoderResult.UNDERFLOW) { 246 int remaining = byteBuffer.remaining(); 247 if (!charsetBoundaryPassed) { 248 if (bytesRead + remaining >= SNIFFING_LIMIT) { 249 needToNotifyTokenizer = true; 250 charsetBoundaryPassed = true; 251 } 252 } 253 254 // XXX what happens if the entire byte buffer consists of 255 // a pathologically long malformed sequence? 256 257 // If the buffer was not fully consumed, there may be an 258 // incomplete byte sequence that needs to seed the next 259 // buffer. 260 if (remaining > 0) { 261 System.arraycopy(byteArray, byteBuffer.position(), 262 byteArray, 0, remaining); 263 } 264 byteBuffer.position(0); 265 byteBuffer.limit(remaining); 266 if (flushing) { 267 // The final decode was successful. Not sure if this 268 // ever happens. 269 // Let's get out in any case. 270 int cPos = charBuffer.position(); 271 return cPos == 0 ? -1 : cPos; 272 } else if (eofSeen) { 273 // If there's something left, it isn't something that 274 // would be 275 // consumed in the middle of the stream. Rerun the loop 276 // once 277 // in the final mode. 278 shouldReadBytes = false; 279 finalDecode = true; 280 flushing = true; 281 continue; 282 } else { 283 // The usual stuff. Want more bytes next time. 284 shouldReadBytes = true; 285 // return -1 if zero 286 int cPos = charBuffer.position(); 287 return cPos == 0 ? -1 : cPos; 288 } 289 } else { 290 // The result is in error. No need to test. 291 StringBuilder sb = new StringBuilder(); 292 for (int i = 0; i < cr.length(); i++) { 293 if (i > 0) { 294 sb.append(", "); 295 } 296 sb.append('\u201C'); 297 sb.append(Integer.toHexString(byteBuffer.get() & 0xFF)); 298 bytesRead++; 299 sb.append('\u201D'); 300 } 301 if (charBuffer.hasRemaining()) { 302 charBuffer.put('\uFFFD'); 303 } else { 304 hasPendingReplacementCharacter = true; 305 } 306 calculateLineAndCol(charBuffer); 307 if (cr.isMalformed()) { 308 err("Malformed byte sequence: " + sb + "."); 309 } else if (cr.isUnmappable()) { 310 err("Unmappable byte sequence: " + sb + "."); 311 } else { 312 throw new RuntimeException( 313 "CoderResult was none of overflow, underflow, malformed or unmappable."); 314 } 315 if (finalDecode) { 316 // These were the last bytes of input. Return without 317 // relooping. 318 // return -1 if zero 319 int cPos = charBuffer.position(); 320 return cPos == 0 ? -1 : cPos; 321 } 322 } 323 } 324 } 325 } 326 327 private void calculateLineAndCol(CharBuffer charBuffer) { 328 if (tokenizer != null) { 329 if (lineColPos == 0) { 330 line = tokenizer.getLine(); 331 col = tokenizer.getCol(); 332 nextCharOnNewLine = tokenizer.isNextCharOnNewLine(); 333 prevWasCR = tokenizer.isPrevCR(); 334 } 335 336 char[] charArray = charBuffer.array(); 337 int i = lineColPos; 338 while (i < charBuffer.position()) { 339 char c; 340 if (nextCharOnNewLine) { 341 line++; 342 col = 1; 343 nextCharOnNewLine = false; 344 } else { 345 col++; 346 } 347 348 c = charArray[i]; 349 switch (c) { 350 case '\r': 351 nextCharOnNewLine = true; 352 prevWasCR = true; 353 break; 354 case '\n': 355 if (prevWasCR) { 356 col--; 357 } else { 358 nextCharOnNewLine = true; 359 } 360 break; 361 } 362 i++; 363 } 364 lineColPos = i; 365 } 366 } 367 368 public int readByte() throws IOException { 369 if (!sniffing) { 370 throw new IllegalStateException( 371 "readByte() called when not in the sniffing state."); 372 } 373 if (position == SNIFFING_LIMIT) { 374 return -1; 375 } else if (position < limit) { 376 return byteArray[position++] & 0xFF; 377 } else { 378 int num = inputStream.read(byteArray, limit, SNIFFING_LIMIT - limit); 379 if (num == -1) { 380 return -1; 381 } else { 382 limit += num; 383 return byteArray[position++] & 0xFF; 384 } 385 } 386 } 387 388 public static void main(String[] args) { 389 CharsetDecoder dec = Charset.forName("UTF-8").newDecoder(); 390 dec.onMalformedInput(CodingErrorAction.REPORT); 391 dec.onUnmappableCharacter(CodingErrorAction.REPORT); 392 byte[] bytes = { (byte) 0xF0, (byte) 0x9D, (byte) 0x80, (byte) 0x80 }; 393 byte[] bytes2 = { (byte) 0xB8, (byte) 0x80, 0x63, 0x64, 0x65 }; 394 ByteBuffer byteBuf = ByteBuffer.wrap(bytes); 395 ByteBuffer byteBuf2 = ByteBuffer.wrap(bytes2); 396 char[] chars = new char[1]; 397 CharBuffer charBuf = CharBuffer.wrap(chars); 398 399 CoderResult cr = dec.decode(byteBuf, charBuf, false); 400 System.out.println(cr); 401 System.out.println(byteBuf); 402 // byteBuf.get(); 403 cr = dec.decode(byteBuf2, charBuf, false); 404 System.out.println(cr); 405 System.out.println(byteBuf2); 406 407 } 408 409 public int getColumnNumber() { 410 if (tokenizer != null) { 411 return col; 412 } 413 return -1; 414 } 415 416 public int getLineNumber() { 417 if (tokenizer != null) { 418 return line; 419 } 420 return -1; 421 } 422 423 public String getPublicId() { 424 if (tokenizer != null) { 425 return tokenizer.getPublicId(); 426 } 427 return null; 428 } 429 430 public String getSystemId() { 431 if (tokenizer != null) { 432 return tokenizer.getSystemId(); 433 } 434 return null; 435 } 436 437 /** 438 * @param string 439 * @throws SAXException 440 */ 441 private void err(String message) throws IOException { 442 // TODO remove wrapping when changing read() to take a CharBuffer 443 try { 444 if (errorHandler != null) { 445 SAXParseException spe = new SAXParseException(message, this); 446 errorHandler.error(spe); 447 } 448 } catch (SAXException e) { 449 throw (IOException) new IOException(e.getMessage()).initCause(e); 450 } 451 } 452 453 public Charset getCharset() { 454 return decoder.charset(); 455 } 456 457 /** 458 * @see java.io.Reader#read() 459 */ 460 @Override public int read() throws IOException { 461 throw new UnsupportedOperationException(); 462 } 463 464 /** 465 * @see java.io.Reader#read(char[], int, int) 466 */ 467 @Override public int read(char[] cbuf, int off, int len) throws IOException { 468 throw new UnsupportedOperationException(); 469 } 470 471 /** 472 * @see java.io.Reader#read(java.nio.CharBuffer) 473 */ 474 @Override public int read(CharBuffer target) throws IOException { 475 throw new UnsupportedOperationException(); 476 } 477 478 public void switchEncoding(Encoding newEnc) { 479 this.decoder = newEnc.newDecoder(); 480 initDecoder(); 481 } 482 }