001 /*
002 * Copyright (c) 2007 Henri Sivonen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package nu.validator.htmlparser.impl;
024
025 import java.io.IOException;
026 import java.io.InputStream;
027 import java.io.Reader;
028 import java.nio.ByteBuffer;
029 import java.nio.CharBuffer;
030 import java.nio.charset.Charset;
031 import java.nio.charset.CharsetDecoder;
032 import java.nio.charset.CoderResult;
033 import java.nio.charset.CodingErrorAction;
034
035
036 import org.xml.sax.ErrorHandler;
037 import org.xml.sax.Locator;
038 import org.xml.sax.SAXException;
039 import org.xml.sax.SAXParseException;
040
041 /**
042 * Be very careful with this class. It is not a general-purpose subclass of
043 * <code>Reader</code>. Instead, it is the minimal implementation that does
044 * what <code>Tokenizer</code> needs while being an instance of
045 * <code>Reader</code>.
046 *
047 * The only reason why this is a public class is that it needs to be visible to
048 * test code in another package.
049 *
050 * @version $Id: HtmlInputStreamReader.java 150 2007-08-16 19:21:25Z hsivonen $
051 * @author hsivonen
052 */
053 public final class HtmlInputStreamReader extends Reader implements
054 ByteReadable, Locator {
055
056 private static final int SNIFFING_LIMIT = 512;
057
058 private final InputStream inputStream;
059
060 private final ErrorHandler errorHandler;
061
062 private final Locator locator;
063
064 private final Tokenizer tokenizer;
065
066 private CharsetDecoder decoder = null;
067
068 private boolean sniffing = true;
069
070 private int limit = 0;
071
072 private int position = 0;
073
074 private int bytesRead = 0;
075
076 private boolean eofSeen = false;
077
078 private boolean shouldReadBytes = false;
079
080 private boolean charsetBoundaryPassed = false;
081
082 private final byte[] byteArray = new byte[4096]; // Length must be >=
083
084 // SNIFFING_LIMIT
085
086 private final ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray);
087
088 private boolean needToNotifyTokenizer = false;
089
090 private boolean flushing = false;
091
092 private int line = -1;
093
094 private int col = -1;
095
096 private int lineColPos;
097
/**
 * Creates a reader that works out the character encoding on its own. A
 * <code>BomSniffer</code> is consulted first; if it yields no decoder, the
 * sniffing position is rewound and a <code>MetaSniffer</code> is tried; if
 * that yields nothing either, an error is reported and a Windows-1252
 * decoder is used.
 *
 * @param inputStream
 *            the byte stream to decode
 * @param errorHandler
 *            the handler for reported errors; also handed to the
 *            <code>MetaSniffer</code>
 * @param locator
 *            the source of base line/column and of the public and system
 *            ids; may be <code>null</code>
 * @param tokenizer
 *            told via <code>noEncodingDeclared()</code> when neither
 *            sniffer found an encoding; may be <code>null</code>
 * @throws SAXException
 *             declared for the sniffers
 * @throws IOException
 *             if reporting the missing encoding fails, or as declared for
 *             the sniffers
 */
public HtmlInputStreamReader(InputStream inputStream,
        ErrorHandler errorHandler, Locator locator, Tokenizer tokenizer)
        throws SAXException, IOException {
    this.inputStream = inputStream;
    this.errorHandler = errorHandler;
    this.locator = locator;
    this.tokenizer = tokenizer;
    this.sniffing = true;

    // Stage 1: byte order mark.
    decoder = new BomSniffer(this).sniff();

    // Stage 2: meta declaration, re-reading the sniffed bytes from the start.
    if (decoder == null) {
        position = 0;
        decoder = new MetaSniffer(this, errorHandler, this).sniff();
        sniffing = false;
    }

    // Stage 3: nothing declared (TODO chardet) -- fall back to Windows-1252.
    if (decoder == null) {
        if (tokenizer != null) {
            tokenizer.noEncodingDeclared();
        }
        err("Could not determine the character encoding of the document. Using \u201CWindows-1252\u201D.");
        decoder = Charset.forName("Windows-1252").newDecoder();
    }

    // Leave sniffing mode and expose the bytes buffered so far to the decoder.
    sniffing = false;
    position = 0;
    bytesRead = 0;
    byteBuffer.position(0);
    byteBuffer.limit(limit);
    initDecoder();
}
134
135 /**
136 *
137 */
138 private void initDecoder() {
139 if ("ISO-8859-1".equals(this.decoder.charset().name())) {
140 this.decoder = Charset.forName("Windows-1252").newDecoder();
141 }
142 this.decoder.onMalformedInput(CodingErrorAction.REPORT);
143 this.decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
144 }
145
/**
 * Creates a reader that uses the supplied decoder and performs no encoding
 * sniffing. The byte buffer starts out empty and the first
 * <code>read(char[])</code> call fetches bytes from the stream.
 *
 * @param inputStream
 *            the byte stream to decode
 * @param errorHandler
 *            the handler for reported errors
 * @param locator
 *            the source of base line/column and of the public and system
 *            ids; may be <code>null</code>
 * @param tokenizer
 *            the tokenizer to notify from <code>read(char[])</code>; may
 *            be <code>null</code>
 * @param decoder
 *            the decoder to use (subject to the adjustments made by
 *            <code>initDecoder()</code>)
 * @throws SAXException
 *             declared for signature compatibility
 * @throws IOException
 *             declared for signature compatibility
 */
public HtmlInputStreamReader(InputStream inputStream,
        ErrorHandler errorHandler, Locator locator, Tokenizer tokenizer,
        CharsetDecoder decoder) throws SAXException, IOException {
    this.inputStream = inputStream;
    this.errorHandler = errorHandler;
    this.locator = locator;
    this.tokenizer = tokenizer;
    this.decoder = decoder;

    // No sniffing phase: nothing is buffered yet, so ask for bytes right away.
    sniffing = false;
    shouldReadBytes = true;
    position = 0;
    bytesRead = 0;
    byteBuffer.position(0).limit(0);

    initDecoder();
}
162
163 @Override
164 public void close() throws IOException {
165 // TODO Auto-generated method stub
166 inputStream.close();
167 }
168
169 @Override
170 public int read(char[] charArray) throws IOException {
171 lineColPos = 0;
172 if (sniffing) {
173 throw new IllegalStateException(
174 "read() called when in the sniffing state.");
175 }
176 assert charArray.length >= 2;
177 if (needToNotifyTokenizer) {
178 if (tokenizer != null) {
179 tokenizer.notifyAboutMetaBoundary();
180 }
181 needToNotifyTokenizer = false;
182 }
183 CharBuffer charBuffer = CharBuffer.wrap(charArray);
184 charBuffer.limit(charArray.length);
185 charBuffer.position(0);
186 if (flushing) {
187 decoder.flush(charBuffer);
188 // return -1 if zero
189 int cPos = charBuffer.position();
190 return cPos == 0 ? -1 : cPos;
191 }
192 outer: for (;;) {
193 if (shouldReadBytes) {
194 int oldLimit = byteBuffer.limit();
195 int readLen;
196 if (charsetBoundaryPassed) {
197 readLen = byteArray.length - oldLimit;
198 } else {
199 readLen = SNIFFING_LIMIT - oldLimit;
200 }
201 int num = inputStream.read(byteArray, oldLimit, readLen);
202 if (num == -1) {
203 eofSeen = true;
204 inputStream.close();
205 } else {
206 byteBuffer.position(0);
207 byteBuffer.limit(oldLimit + num);
208 }
209 shouldReadBytes = false;
210 }
211 boolean finalDecode = false;
212 for (;;) {
213 int oldBytePos = byteBuffer.position();
214 CoderResult cr = decoder.decode(byteBuffer, charBuffer,
215 finalDecode);
216 bytesRead += byteBuffer.position() - oldBytePos;
217 if (cr == CoderResult.OVERFLOW) {
218 // Decoder will remember surrogates
219 return charBuffer.position();
220 } else if (cr == CoderResult.UNDERFLOW) {
221 int remaining = byteBuffer.remaining();
222 if (!charsetBoundaryPassed) {
223 if (bytesRead + remaining >= SNIFFING_LIMIT) {
224 needToNotifyTokenizer = true;
225 }
226 }
227
228 // XXX what happens if the entire byte buffer consists of
229 // a pathologically long malformed sequence?
230
231 // If the buffer was not fully consumed, there may be an
232 // incomplete byte sequence that needs to seed the next
233 // buffer.
234 if (remaining > 0) {
235 System.arraycopy(byteArray, byteBuffer.position(),
236 byteArray, 0, remaining);
237 }
238 byteBuffer.position(0);
239 byteBuffer.limit(remaining);
240 if (flushing) {
241 // The final decode was successful. Not sure if this
242 // ever happens.
243 // Let's get out in any case.
244 int cPos = charBuffer.position();
245 return cPos == 0 ? -1 : cPos;
246 } else if (eofSeen) {
247 // If there's something left, it isn't something that
248 // would be
249 // consumed in the middle of the stream. Rerun the loop
250 // once
251 // in the final mode.
252 shouldReadBytes = false;
253 finalDecode = true;
254 flushing = true;
255 continue;
256 } else {
257 // The usual stuff. Want more bytes next time.
258 shouldReadBytes = true;
259 return charBuffer.position();
260 }
261 } else {
262 // The result is in error. No need to test.
263 StringBuilder sb = new StringBuilder();
264 for (int i = 0; i < cr.length(); i++) {
265 if (i > 0) {
266 sb.append(", ");
267 }
268 sb.append('\u201C');
269 sb.append(Integer.toHexString(byteBuffer.get() & 0xFF));
270 bytesRead++;
271 sb.append('\u201D');
272 }
273 charBuffer.put('\uFFFD');
274 calculateLineAndCol(charBuffer);
275 if (cr.isMalformed()) {
276 err("Malformed byte sequence: " + sb + ".");
277 } else if (cr.isUnmappable()) {
278 err("Unmappable byte sequence: " + sb + ".");
279 } else {
280 throw new RuntimeException(
281 "CoderResult was none of overflow, underflow, malformed or unmappable.");
282 }
283 if (finalDecode) {
284 // These were the last bytes of input. Return without
285 // relooping.
286 return charBuffer.position();
287 }
288 }
289 }
290 }
291 }
292
293 private void calculateLineAndCol(CharBuffer charBuffer) {
294 if (locator != null) {
295 line = locator.getLineNumber();
296 col = locator.getColumnNumber();
297 char[] charArray = charBuffer.array();
298 boolean prevWasCR = false;
299 int i;
300 for (i = lineColPos; i < charBuffer.position(); i++) {
301 switch (charArray[i]) {
302 case '\n': // LF
303 if (!prevWasCR) {
304 line++;
305 col = 0;
306 }
307 prevWasCR = false;
308 break;
309 case '\r': // CR
310 line++;
311 col = 0;
312 prevWasCR = true;
313 break;
314 default:
315 col++;
316 prevWasCR = false;
317 break;
318 }
319 }
320 lineColPos = i;
321 }
322 }
323
324 public int readByte() throws IOException {
325 if (!sniffing) {
326 throw new IllegalStateException(
327 "readByte() called when not in the sniffing state.");
328 }
329 if (position == SNIFFING_LIMIT) {
330 return -1;
331 } else if (position < limit) {
332 return byteArray[position++] & 0xFF;
333 } else {
334 int num = inputStream.read(byteArray, limit, SNIFFING_LIMIT - limit);
335 if (num == -1) {
336 return -1;
337 } else {
338 limit += num;
339 return byteArray[position++] & 0xFF;
340 }
341 }
342 }
343
344 public static void main(String[] args) {
345 CharsetDecoder dec = Charset.forName("UTF-8").newDecoder();
346 dec.onMalformedInput(CodingErrorAction.REPORT);
347 dec.onUnmappableCharacter(CodingErrorAction.REPORT);
348 byte[] bytes = { (byte) 0xF0, (byte) 0x9D, (byte) 0x80, (byte) 0x80 };
349 byte[] bytes2 = { (byte) 0xB8, (byte) 0x80, 0x63, 0x64, 0x65 };
350 ByteBuffer byteBuf = ByteBuffer.wrap(bytes);
351 ByteBuffer byteBuf2 = ByteBuffer.wrap(bytes2);
352 char[] chars = new char[1];
353 CharBuffer charBuf = CharBuffer.wrap(chars);
354
355 CoderResult cr = dec.decode(byteBuf, charBuf, false);
356 System.out.println(cr);
357 System.out.println(byteBuf);
358 // byteBuf.get();
359 cr = dec.decode(byteBuf2, charBuf, false);
360 System.out.println(cr);
361 System.out.println(byteBuf2);
362
363 }
364
365 public int getColumnNumber() {
366 if (locator != null) {
367 return col;
368 }
369 return -1;
370 }
371
372 public int getLineNumber() {
373 if (locator != null) {
374 return line;
375 }
376 return -1;
377 }
378
379 public String getPublicId() {
380 if (locator != null) {
381 return locator.getPublicId();
382 }
383 return null;
384 }
385
386 public String getSystemId() {
387 if (locator != null) {
388 return locator.getSystemId();
389 }
390 return null;
391 }
392
393 /**
394 * @param string
395 * @throws SAXException
396 */
397 private void err(String message) throws IOException {
398 // TODO remove wrapping when changing read() to take a CharBuffer
399 try {
400 if (errorHandler != null) {
401 SAXParseException spe = new SAXParseException(message, this);
402 errorHandler.error(spe);
403 }
404 } catch (SAXException e) {
405 throw (IOException) new IOException(e.getMessage()).initCause(e);
406 }
407 }
408
409 /**
410 * @param string
411 * @throws SAXException
412 */
413 private void warn(String message) throws IOException {
414 try {
415 if (errorHandler != null) {
416 SAXParseException spe = new SAXParseException(message, this);
417 errorHandler.warning(spe);
418 }
419 } catch (SAXException e) {
420 throw (IOException) new IOException(e.getMessage()).initCause(e);
421 }
422 }
423
424 public Charset getCharset() {
425 return decoder.charset();
426 }
427
428 /**
429 * @see java.io.Reader#read()
430 */
431 @Override
432 public int read() throws IOException {
433 throw new UnsupportedOperationException();
434 }
435
436 /**
437 * @see java.io.Reader#read(char[], int, int)
438 */
439 @Override
440 public int read(char[] cbuf, int off, int len) throws IOException {
441 throw new UnsupportedOperationException();
442 }
443
444 /**
445 * @see java.io.Reader#read(java.nio.CharBuffer)
446 */
447 @Override
448 public int read(CharBuffer target) throws IOException {
449 throw new UnsupportedOperationException();
450 }
451
452 }