001    /*
002     * Copyright (c) 2007 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package nu.validator.htmlparser.io;
024    
025    import java.io.IOException;
026    import java.io.InputStream;
027    import java.io.Reader;
028    import java.nio.ByteBuffer;
029    import java.nio.CharBuffer;
030    import java.nio.charset.Charset;
031    import java.nio.charset.CharsetDecoder;
032    import java.nio.charset.CoderResult;
033    import java.nio.charset.CodingErrorAction;
034    
035    import nu.validator.htmlparser.common.ByteReadable;
036    import nu.validator.htmlparser.common.Heuristics;
037    import nu.validator.htmlparser.common.XmlViolationPolicy;
038    import nu.validator.htmlparser.extra.ChardetSniffer;
039    import nu.validator.htmlparser.extra.IcuDetectorSniffer;
040    import nu.validator.htmlparser.impl.Tokenizer;
041    
042    import org.xml.sax.ErrorHandler;
043    import org.xml.sax.Locator;
044    import org.xml.sax.SAXException;
045    import org.xml.sax.SAXParseException;
046    
047    /**
048     * Be very careful with this class. It is not a general-purpose subclass of of
049     * <code>Reader</code>. Instead, it is the minimal implementation that does
050     * what <code>Tokenizer</code> needs while being an instance of
051     * <code>Reader</code>.
052     * 
053     * The only reason why this is a public class is that it needs to be visible to
054     * test code in another package.
055     * 
056     * @version $Id$
057     * @author hsivonen
058     */
059    public final class HtmlInputStreamReader extends Reader implements
060            ByteReadable, Locator {
061    
062        private static final int SNIFFING_LIMIT = 1024;
063    
064        private final InputStream inputStream;
065    
066        private final ErrorHandler errorHandler;
067    
068        private final Tokenizer tokenizer;
069    
070        private final Driver driver;
071    
072        private CharsetDecoder decoder = null;
073    
074        private boolean sniffing = true;
075    
076        private int limit = 0;
077    
078        private int position = 0;
079    
080        private int bytesRead = 0;
081    
082        private boolean eofSeen = false;
083    
084        private boolean shouldReadBytes = false;
085    
086        private boolean charsetBoundaryPassed = false;
087    
088        private final byte[] byteArray = new byte[4096]; // Length must be >=
089    
090        // SNIFFING_LIMIT
091    
092        private final ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray);
093    
094        private boolean needToNotifyTokenizer = false;
095    
096        private boolean flushing = false;
097    
098        private int line = -1;
099    
100        private int col = -1;
101    
102        private int lineColPos;
103    
104        private boolean hasPendingReplacementCharacter = false;
105    
106        private boolean nextCharOnNewLine;
107    
108        private boolean prevWasCR;
109    
110        /**
111         * @param inputStream
112         * @param errorHandler
113         * @param locator
114         * @throws IOException
115         * @throws SAXException
116         */
117        public HtmlInputStreamReader(InputStream inputStream,
118                ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver,
119                Heuristics heuristics) throws SAXException, IOException {
120            this.inputStream = inputStream;
121            this.errorHandler = errorHandler;
122            this.tokenizer = tokenizer;
123            this.driver = driver;
124            this.sniffing = true;
125            Encoding encoding = (new BomSniffer(this)).sniff();
126            if (encoding == null) {
127                position = 0;
128                encoding = (new MetaSniffer(errorHandler, this)).sniff(this);
129                if (encoding == null
130                        && (heuristics == Heuristics.CHARDET || heuristics == Heuristics.ALL)) {
131                    encoding = (new ChardetSniffer(byteArray, limit)).sniff();
132                }
133                if (encoding == null
134                        && (heuristics == Heuristics.ICU || heuristics == Heuristics.ALL)) {
135                    position = 0;
136                    encoding = (new IcuDetectorSniffer(this)).sniff();
137                }
138                sniffing = false;
139                if (encoding == null) {
140                    encoding = Encoding.WINDOWS1252;
141                }
142                if (driver != null) {
143                    driver.setEncoding(encoding, Confidence.TENTATIVE);
144                }
145            } else {
146                if (encoding == Encoding.UTF8) {
147                    if (driver != null) {
148                        driver.setEncoding(Encoding.UTF8, Confidence.CERTAIN);
149                    }
150                } else {
151                    if (driver != null) {
152                        driver.setEncoding(Encoding.UTF16, Confidence.CERTAIN);
153                    }
154                }
155            }
156            this.decoder = encoding.newDecoder();
157            sniffing = false;
158            position = 0;
159            bytesRead = 0;
160            byteBuffer.position(position);
161            byteBuffer.limit(limit);
162            initDecoder();
163        }
164    
165        /**
166         * 
167         */
168        private void initDecoder() {
169            this.decoder.onMalformedInput(CodingErrorAction.REPORT);
170            this.decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
171        }
172    
173        public HtmlInputStreamReader(InputStream inputStream,
174                ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver,
175                Encoding encoding) throws SAXException, IOException {
176            this.inputStream = inputStream;
177            this.errorHandler = errorHandler;
178            this.tokenizer = tokenizer;
179            this.driver = driver;
180            this.decoder = encoding.newDecoder();
181            this.sniffing = false;
182            position = 0;
183            bytesRead = 0;
184            byteBuffer.position(0);
185            byteBuffer.limit(0);
186            shouldReadBytes = true;
187            initDecoder();
188        }
189    
190        @Override public void close() throws IOException {
191            inputStream.close();
192        }
193    
194        @Override public int read(char[] charArray) throws IOException {
195            lineColPos = 0;
196            assert !sniffing;
197            assert charArray.length >= 2;
198            if (needToNotifyTokenizer) {
199                if (driver != null) {
200                    driver.notifyAboutMetaBoundary();
201                }
202                needToNotifyTokenizer = false;
203            }
204            CharBuffer charBuffer = CharBuffer.wrap(charArray);
205            charBuffer.limit(charArray.length);
206            charBuffer.position(0);
207            if (flushing) {
208                decoder.flush(charBuffer);
209                // return -1 if zero
210                int cPos = charBuffer.position();
211                return cPos == 0 ? -1 : cPos;
212            }
213            if (hasPendingReplacementCharacter) {
214                charBuffer.put('\uFFFD');
215                hasPendingReplacementCharacter = false;
216            }
217            for (;;) {
218                if (shouldReadBytes) {
219                    int oldLimit = byteBuffer.limit();
220                    int readLen;
221                    if (charsetBoundaryPassed) {
222                        readLen = byteArray.length - oldLimit;
223                    } else {
224                        readLen = SNIFFING_LIMIT - oldLimit;
225                    }
226                    int num = inputStream.read(byteArray, oldLimit, readLen);
227                    if (num == -1) {
228                        eofSeen = true;
229                        inputStream.close();
230                    } else {
231                        byteBuffer.position(0);
232                        byteBuffer.limit(oldLimit + num);
233                    }
234                    shouldReadBytes = false;
235                }
236                boolean finalDecode = false;
237                for (;;) {
238                    int oldBytePos = byteBuffer.position();
239                    CoderResult cr = decoder.decode(byteBuffer, charBuffer,
240                            finalDecode);
241                    bytesRead += byteBuffer.position() - oldBytePos;
242                    if (cr == CoderResult.OVERFLOW) {
243                        // Decoder will remember surrogates
244                        return charBuffer.position();
245                    } else if (cr == CoderResult.UNDERFLOW) {
246                        int remaining = byteBuffer.remaining();
247                        if (!charsetBoundaryPassed) {
248                            if (bytesRead + remaining >= SNIFFING_LIMIT) {
249                                needToNotifyTokenizer = true;
250                                charsetBoundaryPassed = true;
251                            }
252                        }
253    
254                        // XXX what happens if the entire byte buffer consists of 
255                        // a pathologically long malformed sequence?
256    
257                        // If the buffer was not fully consumed, there may be an
258                        // incomplete byte sequence that needs to seed the next
259                        // buffer.
260                        if (remaining > 0) {
261                            System.arraycopy(byteArray, byteBuffer.position(),
262                                    byteArray, 0, remaining);
263                        }
264                        byteBuffer.position(0);
265                        byteBuffer.limit(remaining);
266                        if (flushing) {
267                            // The final decode was successful. Not sure if this
268                            // ever happens.
269                            // Let's get out in any case.
270                            int cPos = charBuffer.position();
271                            return cPos == 0 ? -1 : cPos;
272                        } else if (eofSeen) {
273                            // If there's something left, it isn't something that
274                            // would be
275                            // consumed in the middle of the stream. Rerun the loop
276                            // once
277                            // in the final mode.
278                            shouldReadBytes = false;
279                            finalDecode = true;
280                            flushing = true;
281                            continue;
282                        } else {
283                            // The usual stuff. Want more bytes next time.
284                            shouldReadBytes = true;
285                            // return -1 if zero
286                            int cPos = charBuffer.position();
287                            return cPos == 0 ? -1 : cPos;
288                        }
289                    } else {
290                        // The result is in error. No need to test.
291                        StringBuilder sb = new StringBuilder();
292                        for (int i = 0; i < cr.length(); i++) {
293                            if (i > 0) {
294                                sb.append(", ");
295                            }
296                            sb.append('\u201C');
297                            sb.append(Integer.toHexString(byteBuffer.get() & 0xFF));
298                            bytesRead++;
299                            sb.append('\u201D');
300                        }
301                        if (charBuffer.hasRemaining()) {
302                            charBuffer.put('\uFFFD');                     
303                        } else {
304                            hasPendingReplacementCharacter = true;
305                        }
306                        calculateLineAndCol(charBuffer);
307                        if (cr.isMalformed()) {
308                            err("Malformed byte sequence: " + sb + ".");
309                        } else if (cr.isUnmappable()) {
310                            err("Unmappable byte sequence: " + sb + ".");
311                        } else {
312                            throw new RuntimeException(
313                                    "CoderResult was none of overflow, underflow, malformed or unmappable.");
314                        }
315                        if (finalDecode) {
316                            // These were the last bytes of input. Return without
317                            // relooping.
318                            // return -1 if zero
319                            int cPos = charBuffer.position();
320                            return cPos == 0 ? -1 : cPos;
321                        }
322                    }
323                }
324            }
325        }
326    
327        private void calculateLineAndCol(CharBuffer charBuffer) {
328            if (tokenizer != null) {
329                if (lineColPos == 0) {
330                    line = tokenizer.getLine();
331                    col = tokenizer.getCol();
332                    nextCharOnNewLine = tokenizer.isNextCharOnNewLine();
333                    prevWasCR = tokenizer.isPrevCR();
334                }
335                
336                char[] charArray = charBuffer.array();
337                int i = lineColPos;
338                while (i < charBuffer.position()) {
339                    char c;
340                    if (nextCharOnNewLine) {
341                        line++;
342                        col = 1;
343                        nextCharOnNewLine = false;
344                    } else {
345                        col++;
346                    }
347    
348                    c = charArray[i];
349                    switch (c) {
350                        case '\r':
351                            nextCharOnNewLine = true;
352                            prevWasCR = true;
353                            break;
354                        case '\n':
355                            if (prevWasCR) {
356                                col--;
357                            } else {
358                                nextCharOnNewLine = true;
359                            }
360                            break;
361                    }
362                    i++;
363                }
364                lineColPos = i;
365            }
366        }
367    
368        public int readByte() throws IOException {
369            if (!sniffing) {
370                throw new IllegalStateException(
371                        "readByte() called when not in the sniffing state.");
372            }
373            if (position == SNIFFING_LIMIT) {
374                return -1;
375            } else if (position < limit) {
376                return byteArray[position++] & 0xFF;
377            } else {
378                int num = inputStream.read(byteArray, limit, SNIFFING_LIMIT - limit);
379                if (num == -1) {
380                    return -1;
381                } else {
382                    limit += num;
383                    return byteArray[position++] & 0xFF;
384                }
385            }
386        }
387    
388        public static void main(String[] args) {
389            CharsetDecoder dec = Charset.forName("UTF-8").newDecoder();
390            dec.onMalformedInput(CodingErrorAction.REPORT);
391            dec.onUnmappableCharacter(CodingErrorAction.REPORT);
392            byte[] bytes = { (byte) 0xF0, (byte) 0x9D, (byte) 0x80, (byte) 0x80 };
393            byte[] bytes2 = { (byte) 0xB8, (byte) 0x80, 0x63, 0x64, 0x65 };
394            ByteBuffer byteBuf = ByteBuffer.wrap(bytes);
395            ByteBuffer byteBuf2 = ByteBuffer.wrap(bytes2);
396            char[] chars = new char[1];
397            CharBuffer charBuf = CharBuffer.wrap(chars);
398    
399            CoderResult cr = dec.decode(byteBuf, charBuf, false);
400            System.out.println(cr);
401            System.out.println(byteBuf);
402            // byteBuf.get();
403            cr = dec.decode(byteBuf2, charBuf, false);
404            System.out.println(cr);
405            System.out.println(byteBuf2);
406    
407        }
408    
409        public int getColumnNumber() {
410            if (tokenizer != null) {
411                return col;
412            }
413            return -1;
414        }
415    
416        public int getLineNumber() {
417            if (tokenizer != null) {
418                return line;
419            }
420            return -1;
421        }
422    
423        public String getPublicId() {
424            if (tokenizer != null) {
425                return tokenizer.getPublicId();
426            }
427            return null;
428        }
429    
430        public String getSystemId() {
431            if (tokenizer != null) {
432                return tokenizer.getSystemId();
433            }
434            return null;
435        }
436    
437        /**
438         * @param string
439         * @throws SAXException
440         */
441        private void err(String message) throws IOException {
442            // TODO remove wrapping when changing read() to take a CharBuffer
443            try {
444                if (errorHandler != null) {
445                    SAXParseException spe = new SAXParseException(message, this);
446                    errorHandler.error(spe);
447                }
448            } catch (SAXException e) {
449                throw (IOException) new IOException(e.getMessage()).initCause(e);
450            }
451        }
452    
453        public Charset getCharset() {
454            return decoder.charset();
455        }
456    
457        /**
458         * @see java.io.Reader#read()
459         */
460        @Override public int read() throws IOException {
461            throw new UnsupportedOperationException();
462        }
463    
464        /**
465         * @see java.io.Reader#read(char[], int, int)
466         */
467        @Override public int read(char[] cbuf, int off, int len) throws IOException {
468            throw new UnsupportedOperationException();
469        }
470    
471        /**
472         * @see java.io.Reader#read(java.nio.CharBuffer)
473         */
474        @Override public int read(CharBuffer target) throws IOException {
475            throw new UnsupportedOperationException();
476        }
477    
478        public void switchEncoding(Encoding newEnc) {
479            this.decoder = newEnc.newDecoder();
480            initDecoder();
481        }
482    }