001 /*
002 * Copyright (c) 2007 Henri Sivonen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package nu.validator.htmlparser.io;
024
025 import java.io.IOException;
026 import java.io.InputStream;
027 import java.io.Reader;
028 import java.nio.ByteBuffer;
029 import java.nio.CharBuffer;
030 import java.nio.charset.Charset;
031 import java.nio.charset.CharsetDecoder;
032 import java.nio.charset.CoderResult;
033 import java.nio.charset.CodingErrorAction;
034
035 import nu.validator.htmlparser.common.ByteReadable;
036 import nu.validator.htmlparser.common.Heuristics;
037 import nu.validator.htmlparser.common.XmlViolationPolicy;
038 import nu.validator.htmlparser.extra.ChardetSniffer;
039 import nu.validator.htmlparser.extra.IcuDetectorSniffer;
040 import nu.validator.htmlparser.impl.Tokenizer;
041
042 import org.xml.sax.ErrorHandler;
043 import org.xml.sax.Locator;
044 import org.xml.sax.SAXException;
045 import org.xml.sax.SAXParseException;
046
047 /**
048 * Be very careful with this class. It is not a general-purpose subclass of of
049 * <code>Reader</code>. Instead, it is the minimal implementation that does
050 * what <code>Tokenizer</code> needs while being an instance of
051 * <code>Reader</code>.
052 *
053 * The only reason why this is a public class is that it needs to be visible to
054 * test code in another package.
055 *
056 * @version $Id$
057 * @author hsivonen
058 */
059 public final class HtmlInputStreamReader extends Reader implements
060 ByteReadable, Locator {
061
062 private static final int SNIFFING_LIMIT = 1024;
063
064 private final InputStream inputStream;
065
066 private final ErrorHandler errorHandler;
067
068 private final Tokenizer tokenizer;
069
070 private final Driver driver;
071
072 private CharsetDecoder decoder = null;
073
074 private boolean sniffing = true;
075
076 private int limit = 0;
077
078 private int position = 0;
079
080 private int bytesRead = 0;
081
082 private boolean eofSeen = false;
083
084 private boolean shouldReadBytes = false;
085
086 private boolean charsetBoundaryPassed = false;
087
088 private final byte[] byteArray = new byte[4096]; // Length must be >=
089
090 // SNIFFING_LIMIT
091
092 private final ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray);
093
094 private boolean needToNotifyTokenizer = false;
095
096 private boolean flushing = false;
097
098 private int line = -1;
099
100 private int col = -1;
101
102 private int lineColPos;
103
104 private boolean hasPendingReplacementCharacter = false;
105
106 private boolean nextCharOnNewLine;
107
108 private boolean prevWasCR;
109
110 /**
111 * @param inputStream
112 * @param errorHandler
113 * @param locator
114 * @throws IOException
115 * @throws SAXException
116 */
117 public HtmlInputStreamReader(InputStream inputStream,
118 ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver,
119 Heuristics heuristics) throws SAXException, IOException {
120 this.inputStream = inputStream;
121 this.errorHandler = errorHandler;
122 this.tokenizer = tokenizer;
123 this.driver = driver;
124 this.sniffing = true;
125 Encoding encoding = (new BomSniffer(this)).sniff();
126 if (encoding == null) {
127 position = 0;
128 encoding = (new MetaSniffer(errorHandler, this)).sniff(this);
129 if (encoding == null
130 && (heuristics == Heuristics.CHARDET || heuristics == Heuristics.ALL)) {
131 encoding = (new ChardetSniffer(byteArray, limit)).sniff();
132 }
133 if (encoding == null
134 && (heuristics == Heuristics.ICU || heuristics == Heuristics.ALL)) {
135 position = 0;
136 encoding = (new IcuDetectorSniffer(this)).sniff();
137 }
138 sniffing = false;
139 if (encoding == null) {
140 encoding = Encoding.WINDOWS1252;
141 }
142 if (driver != null) {
143 driver.setEncoding(encoding, Confidence.TENTATIVE);
144 }
145 } else {
146 if (encoding == Encoding.UTF8) {
147 if (driver != null) {
148 driver.setEncoding(Encoding.UTF8, Confidence.CERTAIN);
149 }
150 } else {
151 if (driver != null) {
152 driver.setEncoding(Encoding.UTF16, Confidence.CERTAIN);
153 }
154 }
155 }
156 this.decoder = encoding.newDecoder();
157 sniffing = false;
158 position = 0;
159 bytesRead = 0;
160 byteBuffer.position(position);
161 byteBuffer.limit(limit);
162 initDecoder();
163 }
164
165 /**
166 *
167 */
168 private void initDecoder() {
169 this.decoder.onMalformedInput(CodingErrorAction.REPORT);
170 this.decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
171 }
172
173 public HtmlInputStreamReader(InputStream inputStream,
174 ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver,
175 Encoding encoding) throws SAXException, IOException {
176 this.inputStream = inputStream;
177 this.errorHandler = errorHandler;
178 this.tokenizer = tokenizer;
179 this.driver = driver;
180 this.decoder = encoding.newDecoder();
181 this.sniffing = false;
182 position = 0;
183 bytesRead = 0;
184 byteBuffer.position(0);
185 byteBuffer.limit(0);
186 shouldReadBytes = true;
187 initDecoder();
188 }
189
190 @Override public void close() throws IOException {
191 inputStream.close();
192 }
193
194 @Override public int read(char[] charArray) throws IOException {
195 lineColPos = 0;
196 assert !sniffing;
197 assert charArray.length >= 2;
198 if (needToNotifyTokenizer) {
199 if (driver != null) {
200 driver.notifyAboutMetaBoundary();
201 }
202 needToNotifyTokenizer = false;
203 }
204 CharBuffer charBuffer = CharBuffer.wrap(charArray);
205 charBuffer.limit(charArray.length);
206 charBuffer.position(0);
207 if (flushing) {
208 decoder.flush(charBuffer);
209 // return -1 if zero
210 int cPos = charBuffer.position();
211 return cPos == 0 ? -1 : cPos;
212 }
213 if (hasPendingReplacementCharacter) {
214 charBuffer.put('\uFFFD');
215 hasPendingReplacementCharacter = false;
216 }
217 for (;;) {
218 if (shouldReadBytes) {
219 int oldLimit = byteBuffer.limit();
220 int readLen;
221 if (charsetBoundaryPassed) {
222 readLen = byteArray.length - oldLimit;
223 } else {
224 readLen = SNIFFING_LIMIT - oldLimit;
225 }
226 int num = inputStream.read(byteArray, oldLimit, readLen);
227 if (num == -1) {
228 eofSeen = true;
229 inputStream.close();
230 } else {
231 byteBuffer.position(0);
232 byteBuffer.limit(oldLimit + num);
233 }
234 shouldReadBytes = false;
235 }
236 boolean finalDecode = false;
237 for (;;) {
238 int oldBytePos = byteBuffer.position();
239 CoderResult cr = decoder.decode(byteBuffer, charBuffer,
240 finalDecode);
241 bytesRead += byteBuffer.position() - oldBytePos;
242 if (cr == CoderResult.OVERFLOW) {
243 // Decoder will remember surrogates
244 return charBuffer.position();
245 } else if (cr == CoderResult.UNDERFLOW) {
246 int remaining = byteBuffer.remaining();
247 if (!charsetBoundaryPassed) {
248 if (bytesRead + remaining >= SNIFFING_LIMIT) {
249 needToNotifyTokenizer = true;
250 charsetBoundaryPassed = true;
251 }
252 }
253
254 // XXX what happens if the entire byte buffer consists of
255 // a pathologically long malformed sequence?
256
257 // If the buffer was not fully consumed, there may be an
258 // incomplete byte sequence that needs to seed the next
259 // buffer.
260 if (remaining > 0) {
261 System.arraycopy(byteArray, byteBuffer.position(),
262 byteArray, 0, remaining);
263 }
264 byteBuffer.position(0);
265 byteBuffer.limit(remaining);
266 if (flushing) {
267 // The final decode was successful. Not sure if this
268 // ever happens.
269 // Let's get out in any case.
270 int cPos = charBuffer.position();
271 return cPos == 0 ? -1 : cPos;
272 } else if (eofSeen) {
273 // If there's something left, it isn't something that
274 // would be
275 // consumed in the middle of the stream. Rerun the loop
276 // once
277 // in the final mode.
278 shouldReadBytes = false;
279 finalDecode = true;
280 flushing = true;
281 continue;
282 } else {
283 // The usual stuff. Want more bytes next time.
284 shouldReadBytes = true;
285 // return -1 if zero
286 int cPos = charBuffer.position();
287 return cPos == 0 ? -1 : cPos;
288 }
289 } else {
290 // The result is in error. No need to test.
291 StringBuilder sb = new StringBuilder();
292 for (int i = 0; i < cr.length(); i++) {
293 if (i > 0) {
294 sb.append(", ");
295 }
296 sb.append('\u201C');
297 sb.append(Integer.toHexString(byteBuffer.get() & 0xFF));
298 bytesRead++;
299 sb.append('\u201D');
300 }
301 if (charBuffer.hasRemaining()) {
302 charBuffer.put('\uFFFD');
303 } else {
304 hasPendingReplacementCharacter = true;
305 }
306 calculateLineAndCol(charBuffer);
307 if (cr.isMalformed()) {
308 err("Malformed byte sequence: " + sb + ".");
309 } else if (cr.isUnmappable()) {
310 err("Unmappable byte sequence: " + sb + ".");
311 } else {
312 throw new RuntimeException(
313 "CoderResult was none of overflow, underflow, malformed or unmappable.");
314 }
315 if (finalDecode) {
316 // These were the last bytes of input. Return without
317 // relooping.
318 // return -1 if zero
319 int cPos = charBuffer.position();
320 return cPos == 0 ? -1 : cPos;
321 }
322 }
323 }
324 }
325 }
326
327 private void calculateLineAndCol(CharBuffer charBuffer) {
328 if (tokenizer != null) {
329 if (lineColPos == 0) {
330 line = tokenizer.getLine();
331 col = tokenizer.getCol();
332 nextCharOnNewLine = tokenizer.isNextCharOnNewLine();
333 prevWasCR = tokenizer.isPrevCR();
334 }
335
336 char[] charArray = charBuffer.array();
337 int i = lineColPos;
338 while (i < charBuffer.position()) {
339 char c;
340 if (nextCharOnNewLine) {
341 line++;
342 col = 1;
343 nextCharOnNewLine = false;
344 } else {
345 col++;
346 }
347
348 c = charArray[i];
349 switch (c) {
350 case '\r':
351 nextCharOnNewLine = true;
352 prevWasCR = true;
353 break;
354 case '\n':
355 if (prevWasCR) {
356 col--;
357 } else {
358 nextCharOnNewLine = true;
359 }
360 break;
361 }
362 i++;
363 }
364 lineColPos = i;
365 }
366 }
367
368 public int readByte() throws IOException {
369 if (!sniffing) {
370 throw new IllegalStateException(
371 "readByte() called when not in the sniffing state.");
372 }
373 if (position == SNIFFING_LIMIT) {
374 return -1;
375 } else if (position < limit) {
376 return byteArray[position++] & 0xFF;
377 } else {
378 int num = inputStream.read(byteArray, limit, SNIFFING_LIMIT - limit);
379 if (num == -1) {
380 return -1;
381 } else {
382 limit += num;
383 return byteArray[position++] & 0xFF;
384 }
385 }
386 }
387
388 public static void main(String[] args) {
389 CharsetDecoder dec = Charset.forName("UTF-8").newDecoder();
390 dec.onMalformedInput(CodingErrorAction.REPORT);
391 dec.onUnmappableCharacter(CodingErrorAction.REPORT);
392 byte[] bytes = { (byte) 0xF0, (byte) 0x9D, (byte) 0x80, (byte) 0x80 };
393 byte[] bytes2 = { (byte) 0xB8, (byte) 0x80, 0x63, 0x64, 0x65 };
394 ByteBuffer byteBuf = ByteBuffer.wrap(bytes);
395 ByteBuffer byteBuf2 = ByteBuffer.wrap(bytes2);
396 char[] chars = new char[1];
397 CharBuffer charBuf = CharBuffer.wrap(chars);
398
399 CoderResult cr = dec.decode(byteBuf, charBuf, false);
400 System.out.println(cr);
401 System.out.println(byteBuf);
402 // byteBuf.get();
403 cr = dec.decode(byteBuf2, charBuf, false);
404 System.out.println(cr);
405 System.out.println(byteBuf2);
406
407 }
408
409 public int getColumnNumber() {
410 if (tokenizer != null) {
411 return col;
412 }
413 return -1;
414 }
415
416 public int getLineNumber() {
417 if (tokenizer != null) {
418 return line;
419 }
420 return -1;
421 }
422
423 public String getPublicId() {
424 if (tokenizer != null) {
425 return tokenizer.getPublicId();
426 }
427 return null;
428 }
429
430 public String getSystemId() {
431 if (tokenizer != null) {
432 return tokenizer.getSystemId();
433 }
434 return null;
435 }
436
437 /**
438 * @param string
439 * @throws SAXException
440 */
441 private void err(String message) throws IOException {
442 // TODO remove wrapping when changing read() to take a CharBuffer
443 try {
444 if (errorHandler != null) {
445 SAXParseException spe = new SAXParseException(message, this);
446 errorHandler.error(spe);
447 }
448 } catch (SAXException e) {
449 throw (IOException) new IOException(e.getMessage()).initCause(e);
450 }
451 }
452
453 public Charset getCharset() {
454 return decoder.charset();
455 }
456
457 /**
458 * @see java.io.Reader#read()
459 */
460 @Override public int read() throws IOException {
461 throw new UnsupportedOperationException();
462 }
463
464 /**
465 * @see java.io.Reader#read(char[], int, int)
466 */
467 @Override public int read(char[] cbuf, int off, int len) throws IOException {
468 throw new UnsupportedOperationException();
469 }
470
471 /**
472 * @see java.io.Reader#read(java.nio.CharBuffer)
473 */
474 @Override public int read(CharBuffer target) throws IOException {
475 throw new UnsupportedOperationException();
476 }
477
478 public void switchEncoding(Encoding newEnc) {
479 this.decoder = newEnc.newDecoder();
480 initDecoder();
481 }
482 }