001 /* 002 * Copyright (c) 2007 Henri Sivonen 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package nu.validator.htmlparser.impl; 024 025 import java.io.IOException; 026 import java.io.InputStream; 027 import java.io.Reader; 028 import java.nio.ByteBuffer; 029 import java.nio.CharBuffer; 030 import java.nio.charset.Charset; 031 import java.nio.charset.CharsetDecoder; 032 import java.nio.charset.CoderResult; 033 import java.nio.charset.CodingErrorAction; 034 035 036 import org.xml.sax.ErrorHandler; 037 import org.xml.sax.Locator; 038 import org.xml.sax.SAXException; 039 import org.xml.sax.SAXParseException; 040 041 /** 042 * Be very careful with this class. It is not a general-purpose subclass of of 043 * <code>Reader</code>. Instead, it is the minimal implementation that does 044 * what <code>Tokenizer</code> needs while being an instance of 045 * <code>Reader</code>. 046 * 047 * The only reason why this is a public class is that it needs to be visible to 048 * test code in another package. 049 * 050 * @version $Id: HtmlInputStreamReader.java 150 2007-08-16 19:21:25Z hsivonen $ 051 * @author hsivonen 052 */ 053 public final class HtmlInputStreamReader extends Reader implements 054 ByteReadable, Locator { 055 056 private static final int SNIFFING_LIMIT = 512; 057 058 private final InputStream inputStream; 059 060 private final ErrorHandler errorHandler; 061 062 private final Locator locator; 063 064 private final Tokenizer tokenizer; 065 066 private CharsetDecoder decoder = null; 067 068 private boolean sniffing = true; 069 070 private int limit = 0; 071 072 private int position = 0; 073 074 private int bytesRead = 0; 075 076 private boolean eofSeen = false; 077 078 private boolean shouldReadBytes = false; 079 080 private boolean charsetBoundaryPassed = false; 081 082 private final byte[] byteArray = new byte[4096]; // Length must be >= 083 084 // SNIFFING_LIMIT 085 086 private final ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray); 087 088 private boolean needToNotifyTokenizer = false; 089 090 private boolean flushing = false; 091 092 private int line = -1; 093 094 private int col = -1; 095 096 private int lineColPos; 097 098 /** 099 * @param inputStream 100 * @param errorHandler 101 * @param locator 102 * @throws IOException 103 * @throws SAXException 104 */ 105 public HtmlInputStreamReader(InputStream inputStream, 106 ErrorHandler errorHandler, Locator locator, Tokenizer tokenizer) 107 throws SAXException, IOException { 108 this.inputStream = inputStream; 109 this.errorHandler = errorHandler; 110 this.locator = locator; 111 this.tokenizer = tokenizer; 112 this.sniffing = true; 113 this.decoder = (new BomSniffer(this)).sniff(); 114 if (this.decoder == null) { 115 position = 0; 116 this.decoder = (new MetaSniffer(this, errorHandler, this)).sniff(); 117 sniffing = false; 118 // TODO chardet 119 if (this.decoder == null) { 120 if (tokenizer != null) { 121 tokenizer.noEncodingDeclared(); 122 } 123 err("Could not determine the character encoding of the document. Using \u201CWindows-1252\u201D."); 124 this.decoder = Charset.forName("Windows-1252").newDecoder(); 125 } 126 } 127 sniffing = false; 128 position = 0; 129 bytesRead = 0; 130 byteBuffer.position(position); 131 byteBuffer.limit(limit); 132 initDecoder(); 133 } 134 135 /** 136 * 137 */ 138 private void initDecoder() { 139 if ("ISO-8859-1".equals(this.decoder.charset().name())) { 140 this.decoder = Charset.forName("Windows-1252").newDecoder(); 141 } 142 this.decoder.onMalformedInput(CodingErrorAction.REPORT); 143 this.decoder.onUnmappableCharacter(CodingErrorAction.REPORT); 144 } 145 146 public HtmlInputStreamReader(InputStream inputStream, 147 ErrorHandler errorHandler, Locator locator, Tokenizer tokenizer, 148 CharsetDecoder decoder) throws SAXException, IOException { 149 this.inputStream = inputStream; 150 this.errorHandler = errorHandler; 151 this.locator = locator; 152 this.tokenizer = tokenizer; 153 this.decoder = decoder; 154 this.sniffing = false; 155 position = 0; 156 bytesRead = 0; 157 byteBuffer.position(0); 158 byteBuffer.limit(0); 159 shouldReadBytes = true; 160 initDecoder(); 161 } 162 163 @Override 164 public void close() throws IOException { 165 // TODO Auto-generated method stub 166 inputStream.close(); 167 } 168 169 @Override 170 public int read(char[] charArray) throws IOException { 171 lineColPos = 0; 172 if (sniffing) { 173 throw new IllegalStateException( 174 "read() called when in the sniffing state."); 175 } 176 assert charArray.length >= 2; 177 if (needToNotifyTokenizer) { 178 if (tokenizer != null) { 179 tokenizer.notifyAboutMetaBoundary(); 180 } 181 needToNotifyTokenizer = false; 182 } 183 CharBuffer charBuffer = CharBuffer.wrap(charArray); 184 charBuffer.limit(charArray.length); 185 charBuffer.position(0); 186 if (flushing) { 187 decoder.flush(charBuffer); 188 // return -1 if zero 189 int cPos = charBuffer.position(); 190 return cPos == 0 ? -1 : cPos; 191 } 192 outer: for (;;) { 193 if (shouldReadBytes) { 194 int oldLimit = byteBuffer.limit(); 195 int readLen; 196 if (charsetBoundaryPassed) { 197 readLen = byteArray.length - oldLimit; 198 } else { 199 readLen = SNIFFING_LIMIT - oldLimit; 200 } 201 int num = inputStream.read(byteArray, oldLimit, readLen); 202 if (num == -1) { 203 eofSeen = true; 204 inputStream.close(); 205 } else { 206 byteBuffer.position(0); 207 byteBuffer.limit(oldLimit + num); 208 } 209 shouldReadBytes = false; 210 } 211 boolean finalDecode = false; 212 for (;;) { 213 int oldBytePos = byteBuffer.position(); 214 CoderResult cr = decoder.decode(byteBuffer, charBuffer, 215 finalDecode); 216 bytesRead += byteBuffer.position() - oldBytePos; 217 if (cr == CoderResult.OVERFLOW) { 218 // Decoder will remember surrogates 219 return charBuffer.position(); 220 } else if (cr == CoderResult.UNDERFLOW) { 221 int remaining = byteBuffer.remaining(); 222 if (!charsetBoundaryPassed) { 223 if (bytesRead + remaining >= SNIFFING_LIMIT) { 224 needToNotifyTokenizer = true; 225 } 226 } 227 228 // XXX what happens if the entire byte buffer consists of 229 // a pathologically long malformed sequence? 230 231 // If the buffer was not fully consumed, there may be an 232 // incomplete byte sequence that needs to seed the next 233 // buffer. 234 if (remaining > 0) { 235 System.arraycopy(byteArray, byteBuffer.position(), 236 byteArray, 0, remaining); 237 } 238 byteBuffer.position(0); 239 byteBuffer.limit(remaining); 240 if (flushing) { 241 // The final decode was successful. Not sure if this 242 // ever happens. 243 // Let's get out in any case. 244 int cPos = charBuffer.position(); 245 return cPos == 0 ? -1 : cPos; 246 } else if (eofSeen) { 247 // If there's something left, it isn't something that 248 // would be 249 // consumed in the middle of the stream. Rerun the loop 250 // once 251 // in the final mode. 252 shouldReadBytes = false; 253 finalDecode = true; 254 flushing = true; 255 continue; 256 } else { 257 // The usual stuff. Want more bytes next time. 258 shouldReadBytes = true; 259 return charBuffer.position(); 260 } 261 } else { 262 // The result is in error. No need to test. 263 StringBuilder sb = new StringBuilder(); 264 for (int i = 0; i < cr.length(); i++) { 265 if (i > 0) { 266 sb.append(", "); 267 } 268 sb.append('\u201C'); 269 sb.append(Integer.toHexString(byteBuffer.get() & 0xFF)); 270 bytesRead++; 271 sb.append('\u201D'); 272 } 273 charBuffer.put('\uFFFD'); 274 calculateLineAndCol(charBuffer); 275 if (cr.isMalformed()) { 276 err("Malformed byte sequence: " + sb + "."); 277 } else if (cr.isUnmappable()) { 278 err("Unmappable byte sequence: " + sb + "."); 279 } else { 280 throw new RuntimeException( 281 "CoderResult was none of overflow, underflow, malformed or unmappable."); 282 } 283 if (finalDecode) { 284 // These were the last bytes of input. Return without 285 // relooping. 286 return charBuffer.position(); 287 } 288 } 289 } 290 } 291 } 292 293 private void calculateLineAndCol(CharBuffer charBuffer) { 294 if (locator != null) { 295 line = locator.getLineNumber(); 296 col = locator.getColumnNumber(); 297 char[] charArray = charBuffer.array(); 298 boolean prevWasCR = false; 299 int i; 300 for (i = lineColPos; i < charBuffer.position(); i++) { 301 switch (charArray[i]) { 302 case '\n': // LF 303 if (!prevWasCR) { 304 line++; 305 col = 0; 306 } 307 prevWasCR = false; 308 break; 309 case '\r': // CR 310 line++; 311 col = 0; 312 prevWasCR = true; 313 break; 314 default: 315 col++; 316 prevWasCR = false; 317 break; 318 } 319 } 320 lineColPos = i; 321 } 322 } 323 324 public int readByte() throws IOException { 325 if (!sniffing) { 326 throw new IllegalStateException( 327 "readByte() called when not in the sniffing state."); 328 } 329 if (position == SNIFFING_LIMIT) { 330 return -1; 331 } else if (position < limit) { 332 return byteArray[position++] & 0xFF; 333 } else { 334 int num = inputStream.read(byteArray, limit, SNIFFING_LIMIT - limit); 335 if (num == -1) { 336 return -1; 337 } else { 338 limit += num; 339 return byteArray[position++] & 0xFF; 340 } 341 } 342 } 343 344 public static void main(String[] args) { 345 CharsetDecoder dec = Charset.forName("UTF-8").newDecoder(); 346 dec.onMalformedInput(CodingErrorAction.REPORT); 347 dec.onUnmappableCharacter(CodingErrorAction.REPORT); 348 byte[] bytes = { (byte) 0xF0, (byte) 0x9D, (byte) 0x80, (byte) 0x80 }; 349 byte[] bytes2 = { (byte) 0xB8, (byte) 0x80, 0x63, 0x64, 0x65 }; 350 ByteBuffer byteBuf = ByteBuffer.wrap(bytes); 351 ByteBuffer byteBuf2 = ByteBuffer.wrap(bytes2); 352 char[] chars = new char[1]; 353 CharBuffer charBuf = CharBuffer.wrap(chars); 354 355 CoderResult cr = dec.decode(byteBuf, charBuf, false); 356 System.out.println(cr); 357 System.out.println(byteBuf); 358 // byteBuf.get(); 359 cr = dec.decode(byteBuf2, charBuf, false); 360 System.out.println(cr); 361 System.out.println(byteBuf2); 362 363 } 364 365 public int getColumnNumber() { 366 if (locator != null) { 367 return col; 368 } 369 return -1; 370 } 371 372 public int getLineNumber() { 373 if (locator != null) { 374 return line; 375 } 376 return -1; 377 } 378 379 public String getPublicId() { 380 if (locator != null) { 381 return locator.getPublicId(); 382 } 383 return null; 384 } 385 386 public String getSystemId() { 387 if (locator != null) { 388 return locator.getSystemId(); 389 } 390 return null; 391 } 392 393 /** 394 * @param string 395 * @throws SAXException 396 */ 397 private void err(String message) throws IOException { 398 // TODO remove wrapping when changing read() to take a CharBuffer 399 try { 400 if (errorHandler != null) { 401 SAXParseException spe = new SAXParseException(message, this); 402 errorHandler.error(spe); 403 } 404 } catch (SAXException e) { 405 throw (IOException) new IOException(e.getMessage()).initCause(e); 406 } 407 } 408 409 /** 410 * @param string 411 * @throws SAXException 412 */ 413 private void warn(String message) throws IOException { 414 try { 415 if (errorHandler != null) { 416 SAXParseException spe = new SAXParseException(message, this); 417 errorHandler.warning(spe); 418 } 419 } catch (SAXException e) { 420 throw (IOException) new IOException(e.getMessage()).initCause(e); 421 } 422 } 423 424 public Charset getCharset() { 425 return decoder.charset(); 426 } 427 428 /** 429 * @see java.io.Reader#read() 430 */ 431 @Override 432 public int read() throws IOException { 433 throw new UnsupportedOperationException(); 434 } 435 436 /** 437 * @see java.io.Reader#read(char[], int, int) 438 */ 439 @Override 440 public int read(char[] cbuf, int off, int len) throws IOException { 441 throw new UnsupportedOperationException(); 442 } 443 444 /** 445 * @see java.io.Reader#read(java.nio.CharBuffer) 446 */ 447 @Override 448 public int read(CharBuffer target) throws IOException { 449 throw new UnsupportedOperationException(); 450 } 451 452 }