001    /*
002     * Copyright (c) 2007 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package nu.validator.htmlparser.impl;
024    
025    import java.io.IOException;
026    import java.io.InputStream;
027    import java.io.Reader;
028    import java.nio.ByteBuffer;
029    import java.nio.CharBuffer;
030    import java.nio.charset.Charset;
031    import java.nio.charset.CharsetDecoder;
032    import java.nio.charset.CoderResult;
033    import java.nio.charset.CodingErrorAction;
034    
035    
036    import org.xml.sax.ErrorHandler;
037    import org.xml.sax.Locator;
038    import org.xml.sax.SAXException;
039    import org.xml.sax.SAXParseException;
040    
041    /**
042     * Be very careful with this class. It is not a general-purpose subclass of of
043     * <code>Reader</code>. Instead, it is the minimal implementation that does
044     * what <code>Tokenizer</code> needs while being an instance of
045     * <code>Reader</code>.
046     * 
047     * The only reason why this is a public class is that it needs to be visible to
048     * test code in another package.
049     * 
050     * @version $Id: HtmlInputStreamReader.java 150 2007-08-16 19:21:25Z hsivonen $
051     * @author hsivonen
052     */
053    public final class HtmlInputStreamReader extends Reader implements
054            ByteReadable, Locator {
055    
056        private static final int SNIFFING_LIMIT = 512;
057    
058        private final InputStream inputStream;
059    
060        private final ErrorHandler errorHandler;
061    
062        private final Locator locator;
063    
064        private final Tokenizer tokenizer;
065    
066        private CharsetDecoder decoder = null;
067    
068        private boolean sniffing = true;
069    
070        private int limit = 0;
071    
072        private int position = 0;
073    
074        private int bytesRead = 0;
075    
076        private boolean eofSeen = false;
077    
078        private boolean shouldReadBytes = false;
079    
080        private boolean charsetBoundaryPassed = false;
081    
082        private final byte[] byteArray = new byte[4096]; // Length must be >=
083    
084        // SNIFFING_LIMIT
085    
086        private final ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray);
087    
088        private boolean needToNotifyTokenizer = false;
089    
090        private boolean flushing = false;
091    
092        private int line = -1;
093    
094        private int col = -1;
095    
096        private int lineColPos;
097    
098        /**
099         * @param inputStream
100         * @param errorHandler
101         * @param locator
102         * @throws IOException
103         * @throws SAXException
104         */
105        public HtmlInputStreamReader(InputStream inputStream,
106                ErrorHandler errorHandler, Locator locator, Tokenizer tokenizer)
107                throws SAXException, IOException {
108            this.inputStream = inputStream;
109            this.errorHandler = errorHandler;
110            this.locator = locator;
111            this.tokenizer = tokenizer;
112            this.sniffing = true;
113            this.decoder = (new BomSniffer(this)).sniff();
114            if (this.decoder == null) {
115                position = 0;
116                this.decoder = (new MetaSniffer(this, errorHandler, this)).sniff();
117                sniffing = false;
118                // TODO chardet
119                if (this.decoder == null) {
120                    if (tokenizer != null) {
121                        tokenizer.noEncodingDeclared();
122                    }
123                    err("Could not determine the character encoding of the document. Using \u201CWindows-1252\u201D.");
124                    this.decoder = Charset.forName("Windows-1252").newDecoder();
125                }
126            }
127            sniffing = false;
128            position = 0;
129            bytesRead = 0;
130            byteBuffer.position(position);
131            byteBuffer.limit(limit);
132            initDecoder();
133        }
134    
135        /**
136         * 
137         */
138        private void initDecoder() {
139            if ("ISO-8859-1".equals(this.decoder.charset().name())) {
140                this.decoder = Charset.forName("Windows-1252").newDecoder();
141            }
142            this.decoder.onMalformedInput(CodingErrorAction.REPORT);
143            this.decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
144        }
145    
146        public HtmlInputStreamReader(InputStream inputStream,
147                ErrorHandler errorHandler, Locator locator, Tokenizer tokenizer,
148                CharsetDecoder decoder) throws SAXException, IOException {
149            this.inputStream = inputStream;
150            this.errorHandler = errorHandler;
151            this.locator = locator;
152            this.tokenizer = tokenizer;
153            this.decoder = decoder;
154            this.sniffing = false;
155            position = 0;
156            bytesRead = 0;
157            byteBuffer.position(0);
158            byteBuffer.limit(0);
159            shouldReadBytes = true;
160            initDecoder();
161        }
162    
163        @Override
164        public void close() throws IOException {
165            // TODO Auto-generated method stub
166            inputStream.close();
167        }
168    
169        @Override
170        public int read(char[] charArray) throws IOException {
171            lineColPos = 0;
172            if (sniffing) {
173                throw new IllegalStateException(
174                        "read() called when in the sniffing state.");
175            }
176            assert charArray.length >= 2;
177            if (needToNotifyTokenizer) {
178                if (tokenizer != null) {
179                    tokenizer.notifyAboutMetaBoundary();
180                }
181                needToNotifyTokenizer = false;
182            }
183            CharBuffer charBuffer = CharBuffer.wrap(charArray);
184            charBuffer.limit(charArray.length);
185            charBuffer.position(0);
186            if (flushing) {
187                decoder.flush(charBuffer);
188                // return -1 if zero
189                int cPos = charBuffer.position();
190                return cPos == 0 ? -1 : cPos;
191            }
192            outer: for (;;) {
193                if (shouldReadBytes) {
194                    int oldLimit = byteBuffer.limit();
195                    int readLen;
196                    if (charsetBoundaryPassed) {
197                        readLen = byteArray.length - oldLimit;
198                    } else {
199                        readLen = SNIFFING_LIMIT - oldLimit;
200                    }
201                    int num = inputStream.read(byteArray, oldLimit, readLen);
202                    if (num == -1) {
203                        eofSeen = true;
204                        inputStream.close();
205                    } else {
206                        byteBuffer.position(0);
207                        byteBuffer.limit(oldLimit + num);
208                    }
209                    shouldReadBytes = false;
210                }
211                boolean finalDecode = false;
212                for (;;) {
213                    int oldBytePos = byteBuffer.position();
214                    CoderResult cr = decoder.decode(byteBuffer, charBuffer,
215                            finalDecode);
216                    bytesRead += byteBuffer.position() - oldBytePos;
217                    if (cr == CoderResult.OVERFLOW) {
218                        // Decoder will remember surrogates
219                        return charBuffer.position();
220                    } else if (cr == CoderResult.UNDERFLOW) {
221                        int remaining = byteBuffer.remaining();
222                        if (!charsetBoundaryPassed) {
223                            if (bytesRead + remaining >= SNIFFING_LIMIT) {
224                                needToNotifyTokenizer = true;
225                            }
226                        }
227    
228                        // XXX what happens if the entire byte buffer consists of 
229                        // a pathologically long malformed sequence?
230                        
231                        // If the buffer was not fully consumed, there may be an
232                        // incomplete byte sequence that needs to seed the next
233                        // buffer.
234                        if (remaining > 0) {
235                            System.arraycopy(byteArray, byteBuffer.position(),
236                                    byteArray, 0, remaining);
237                        }
238                        byteBuffer.position(0);
239                        byteBuffer.limit(remaining);
240                        if (flushing) {
241                            // The final decode was successful. Not sure if this
242                            // ever happens.
243                            // Let's get out in any case.
244                            int cPos = charBuffer.position();
245                            return cPos == 0 ? -1 : cPos;
246                        } else if (eofSeen) {
247                            // If there's something left, it isn't something that
248                            // would be
249                            // consumed in the middle of the stream. Rerun the loop
250                            // once
251                            // in the final mode.
252                            shouldReadBytes = false;
253                            finalDecode = true;
254                            flushing = true;
255                            continue;
256                        } else {
257                            // The usual stuff. Want more bytes next time.
258                            shouldReadBytes = true;
259                            return charBuffer.position();
260                        }
261                    } else {
262                        // The result is in error. No need to test.
263                        StringBuilder sb = new StringBuilder();
264                        for (int i = 0; i < cr.length(); i++) {
265                            if (i > 0) {
266                                sb.append(", ");
267                            }
268                            sb.append('\u201C');
269                            sb.append(Integer.toHexString(byteBuffer.get() & 0xFF));
270                            bytesRead++;
271                            sb.append('\u201D');
272                        }
273                        charBuffer.put('\uFFFD');
274                        calculateLineAndCol(charBuffer);
275                        if (cr.isMalformed()) {
276                            err("Malformed byte sequence: " + sb + ".");
277                        } else if (cr.isUnmappable()) {
278                            err("Unmappable byte sequence: " + sb + ".");
279                        } else {
280                            throw new RuntimeException(
281                                    "CoderResult was none of overflow, underflow, malformed or unmappable.");
282                        }
283                        if (finalDecode) {
284                            // These were the last bytes of input. Return without
285                            // relooping.
286                            return charBuffer.position();
287                        }
288                    }
289                }
290            }
291        }
292    
293        private void calculateLineAndCol(CharBuffer charBuffer) {
294            if (locator != null) {
295                line = locator.getLineNumber();
296                col = locator.getColumnNumber();
297                char[] charArray = charBuffer.array();
298                boolean prevWasCR = false;
299                int i;
300                for (i = lineColPos; i < charBuffer.position(); i++) {
301                    switch (charArray[i]) {
302                        case '\n': // LF
303                            if (!prevWasCR) {
304                                line++;
305                                col = 0;
306                            }
307                            prevWasCR = false;
308                            break;
309                        case '\r': // CR
310                            line++;
311                            col = 0;
312                            prevWasCR = true;
313                            break;
314                        default:
315                            col++;
316                            prevWasCR = false;
317                            break;
318                    }
319                }
320                lineColPos = i;
321            }
322        }
323    
324        public int readByte() throws IOException {
325            if (!sniffing) {
326                throw new IllegalStateException(
327                        "readByte() called when not in the sniffing state.");
328            }
329            if (position == SNIFFING_LIMIT) {
330                return -1;
331            } else if (position < limit) {
332                return byteArray[position++] & 0xFF;
333            } else {
334                int num = inputStream.read(byteArray, limit, SNIFFING_LIMIT - limit);
335                if (num == -1) {
336                    return -1;
337                } else {
338                    limit += num;
339                    return byteArray[position++] & 0xFF;
340                }
341            }
342        }
343    
344        public static void main(String[] args) {
345            CharsetDecoder dec = Charset.forName("UTF-8").newDecoder();
346            dec.onMalformedInput(CodingErrorAction.REPORT);
347            dec.onUnmappableCharacter(CodingErrorAction.REPORT);
348            byte[] bytes = { (byte) 0xF0, (byte) 0x9D, (byte) 0x80, (byte) 0x80 };
349            byte[] bytes2 = { (byte) 0xB8, (byte) 0x80, 0x63, 0x64, 0x65 };
350            ByteBuffer byteBuf = ByteBuffer.wrap(bytes);
351            ByteBuffer byteBuf2 = ByteBuffer.wrap(bytes2);
352            char[] chars = new char[1];
353            CharBuffer charBuf = CharBuffer.wrap(chars);
354    
355            CoderResult cr = dec.decode(byteBuf, charBuf, false);
356            System.out.println(cr);
357            System.out.println(byteBuf);
358            // byteBuf.get();
359            cr = dec.decode(byteBuf2, charBuf, false);
360            System.out.println(cr);
361            System.out.println(byteBuf2);
362    
363        }
364    
365        public int getColumnNumber() {
366            if (locator != null) {
367                return col;
368            }
369            return -1;
370        }
371    
372        public int getLineNumber() {
373            if (locator != null) {
374                return line;
375            }
376            return -1;
377        }
378    
379        public String getPublicId() {
380            if (locator != null) {
381                return locator.getPublicId();
382            }
383            return null;
384        }
385    
386        public String getSystemId() {
387            if (locator != null) {
388                return locator.getSystemId();
389            }
390            return null;
391        }
392    
393        /**
394         * @param string
395         * @throws SAXException
396         */
397        private void err(String message) throws IOException {
398            // TODO remove wrapping when changing read() to take a CharBuffer
399            try {
400                if (errorHandler != null) {
401                    SAXParseException spe = new SAXParseException(message, this);
402                    errorHandler.error(spe);
403                }
404            } catch (SAXException e) {
405                throw (IOException) new IOException(e.getMessage()).initCause(e);
406            }
407        }
408    
409        /**
410         * @param string
411         * @throws SAXException
412         */
413        private void warn(String message) throws IOException {
414            try {
415                if (errorHandler != null) {
416                    SAXParseException spe = new SAXParseException(message, this);
417                    errorHandler.warning(spe);
418                }
419            } catch (SAXException e) {
420                throw (IOException) new IOException(e.getMessage()).initCause(e);
421            }
422        }
423    
424        public Charset getCharset() {
425            return decoder.charset();
426        }
427    
428        /**
429         * @see java.io.Reader#read()
430         */
431        @Override
432        public int read() throws IOException {
433            throw new UnsupportedOperationException();
434        }
435    
436        /**
437         * @see java.io.Reader#read(char[], int, int)
438         */
439        @Override
440        public int read(char[] cbuf, int off, int len) throws IOException {
441            throw new UnsupportedOperationException();
442        }
443    
444        /**
445         * @see java.io.Reader#read(java.nio.CharBuffer)
446         */
447        @Override
448        public int read(CharBuffer target) throws IOException {
449            throw new UnsupportedOperationException();
450        }
451    
452    }