001 /* 002 * Copyright (c) 2007 Henri Sivonen 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package nu.validator.htmlparser.io; 024 025 import java.io.IOException; 026 027 import nu.validator.htmlparser.common.ByteReadable; 028 029 /** 030 * The BOM sniffing part of the HTML5 encoding sniffing algorithm. 031 * 032 * @version $Id$ 033 * @author hsivonen 034 */ 035 public final class BomSniffer { 036 037 private final ByteReadable source; 038 039 /** 040 * @param source 041 */ 042 public BomSniffer(final ByteReadable source) { 043 this.source = source; 044 } 045 046 Encoding sniff() throws IOException { 047 int b = source.readByte(); 048 if (b == 0xEF) { // UTF-8 049 b = source.readByte(); 050 if (b == 0xBB) { 051 b = source.readByte(); 052 if (b == 0xBF) { 053 return Encoding.UTF8; 054 } else { 055 return null; 056 } 057 } else { 058 return null; 059 } 060 } else if (b == 0xFF) { // little-endian 061 b = source.readByte(); 062 if (b == 0xFE) { 063 return Encoding.UTF16LE; 064 } else { 065 return null; 066 } 067 } else if (b == 0xFE) { // big-endian UTF-16 068 b = source.readByte(); 069 if (b == 0xFF) { 070 return Encoding.UTF16BE; 071 } else { 072 return null; 073 } 074 } else { 075 return null; 076 } 077 } 078 079 }