001 /* 002 * Copyright (c) 2007 Henri Sivonen 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package nu.validator.htmlparser.impl; 024 025 import java.io.IOException; 026 import java.nio.charset.Charset; 027 import java.nio.charset.CharsetDecoder; 028 029 /** 030 * The BOM sniffing part of the HTML5 encoding sniffing algorithm. 031 * 032 * @version $Id: BomSniffer.java 150 2007-08-16 19:21:25Z hsivonen $ 033 * @author hsivonen 034 */ 035 public final class BomSniffer { 036 037 private final ByteReadable source; 038 039 /** 040 * @param source 041 */ 042 public BomSniffer(final ByteReadable source) { 043 this.source = source; 044 } 045 046 CharsetDecoder sniff() throws IOException { 047 int b = source.readByte(); 048 if (b == 0xEF) { // UTF-8 049 b = source.readByte(); 050 if (b == 0xBB) { 051 b = source.readByte(); 052 if (b == 0xBF) { 053 // return new CharsetProviderICU().charsetForName("UTF-8").newDecoder(); 054 return Charset.forName("UTF-8").newDecoder(); 055 } else { 056 return null; 057 } 058 } else { 059 return null; 060 } 061 } else if (b == 0xFF) { // little-endian 062 b = source.readByte(); 063 if (b == 0xFE) { 064 return Charset.forName("UTF-16LE").newDecoder(); 065 } else { 066 return null; 067 } 068 } else if (b == 0xFE) { // big-endian UTF-16 069 b = source.readByte(); 070 if (b == 0xFF) { 071 return Charset.forName("UTF-16BE").newDecoder(); 072 } else { 073 return null; 074 } 075 } else { 076 return null; 077 } 078 } 079 080 }