/*
 * Copyright (c) 2009 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.htmlparser.io;

import java.io.IOException;
import java.nio.charset.UnsupportedCharsetException;

import nu.validator.htmlparser.common.ByteReadable;
import nu.validator.htmlparser.impl.MetaScanner;

import org.xml.sax.ErrorHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

/**
 * A <code>MetaScanner</code> that keeps track of the line and column of the
 * bytes it reads and reports problems with a declared character encoding to
 * an optional SAX <code>ErrorHandler</code>.
 *
 * <p>The instance acts as its own <code>Locator</code>: line and column come
 * from the bytes consumed by {@link #read()}, while the public and system
 * identifiers are delegated to the locator supplied at construction time.
 */
public class MetaSniffer extends MetaScanner implements Locator {

    /**
     * Lower-cased encoding labels that {@link #tryCharset(String)} replaces
     * with UTF-8 instead of looking them up.
     */
    private static final String[] UTF16_AND_UTF32_LABELS = { "utf-16",
            "utf-16be", "utf-16le", "utf-32", "utf-32be", "utf-32le" };

    /** Encoding chosen by the last successful {@link #tryCharset(String)}. */
    private Encoding characterEncoding;

    /** Receiver of errors and warnings; may be <code>null</code>. */
    private final ErrorHandler errorHandler;

    /** Source of the public and system identifiers; may be <code>null</code>. */
    private final Locator locator;

    /** Current line; starts at 1 and grows on every line break read. */
    private int line = 1;

    /** Reads counted since the last line break. */
    private int col = 0;

    /** Whether the most recently read byte was a carriage return. */
    private boolean prevWasCR = false;

    /**
     * Creates a sniffer.
     *
     * @param eh
     *            handler that receives errors and warnings, or
     *            <code>null</code> to discard them
     * @param locator
     *            locator consulted for the public and system identifiers, or
     *            <code>null</code> if none is available
     */
    public MetaSniffer(ErrorHandler eh, Locator locator) {
        this.errorHandler = eh;
        this.locator = locator;
        this.characterEncoding = null;
    }

    /**
     * Reads the next byte from the current byte source and updates the
     * line/column bookkeeping.
     *
     * <p>A CR always starts a new line; an LF starts a new line only when it
     * does not directly follow a CR, so a CRLF pair counts once. Every other
     * value, including the end marker, advances the column.
     *
     * @return the value returned by the byte source; -1 means end
     * @throws IOException
     *             if the byte source fails
     */
    protected int read() throws IOException {
        final int b = readable.readByte();
        // [NOCPP[
        final boolean isCR = (b == '\r');
        final boolean isLF = (b == '\n');
        if (isCR || (isLF && !prevWasCR)) {
            line++;
            col = 0;
        } else if (!isLF) {
            col++;
        }
        prevWasCR = isCR;
        // ]NOCPP]
        return b;
    }

    /**
     * Main loop: scans the given bytes and returns the encoding that was
     * recorded while scanning.
     *
     * <p>NOTE(review): <code>readable</code>, <code>stateLoop</code> and
     * <code>stateSave</code> are not declared in this class; presumably they
     * are inherited from <code>MetaScanner</code> - confirm there.
     *
     * @param readable
     *            the byte source to scan
     * @return the encoding stored by {@link #tryCharset(String)}, or
     *         <code>null</code> if none was stored
     * @throws SAXException
     *             if the error handler throws
     * @throws IOException
     *             if reading from the byte source fails
     */
    public Encoding sniff(ByteReadable readable) throws SAXException, IOException {
        this.readable = readable;
        stateLoop(stateSave);
        return characterEncoding;
    }

    /**
     * Reports an error located at the current position; does nothing when no
     * error handler was supplied.
     *
     * @param message
     *            the error message
     * @throws SAXException
     *             if the error handler throws
     */
    private void err(String message) throws SAXException {
        if (errorHandler == null) {
            return;
        }
        errorHandler.error(new SAXParseException(message, this));
    }

    /**
     * Reports a warning located at the current position; does nothing when no
     * error handler was supplied.
     *
     * @param message
     *            the warning message
     * @throws SAXException
     *             if the error handler throws
     */
    private void warn(String message) throws SAXException {
        if (errorHandler == null) {
            return;
        }
        errorHandler.warning(new SAXParseException(message, this));
    }

    /**
     * @return the number of reads counted since the last line break
     */
    public int getColumnNumber() {
        return col;
    }

    /**
     * @return the current line number, starting from 1
     */
    public int getLineNumber() {
        return line;
    }

    /**
     * @return the public identifier of the wrapped locator, or
     *         <code>null</code> when there is no wrapped locator
     */
    public String getPublicId() {
        return (locator == null) ? null : locator.getPublicId();
    }

    /**
     * @return the system identifier of the wrapped locator, or
     *         <code>null</code> when there is no wrapped locator
     */
    public String getSystemId() {
        return (locator == null) ? null : locator.getSystemId();
    }

    /**
     * Tries to adopt the encoding named by an internal encoding declaration.
     *
     * <p>The label is lower-cased first. UTF-16 and UTF-32 labels are
     * replaced by UTF-8 with an error. Any other label is looked up; unknown
     * labels and encodings that are not ASCII supersets are reported and
     * rejected. Accepted encodings may additionally draw naming errors and
     * interoperability warnings, and may be substituted by the encoding that
     * <code>getActualHtmlEncoding()</code> returns.
     *
     * @param encoding
     *            the encoding label found in the declaration
     * @return <code>true</code> if an encoding was recorded,
     *         <code>false</code> if sniffing should continue
     * @throws SAXException
     *             if the error handler throws
     */
    protected boolean tryCharset(String encoding) throws SAXException {
        final String label = Encoding.toAsciiLowerCase(encoding);
        try {
            // XXX spec says only UTF-16
            if (isUtf16OrUtf32(label)) {
                // The encoding is recorded before the error is reported.
                this.characterEncoding = Encoding.UTF8;
                err("The internal character encoding declaration specified \u201C" + label + "\u201D which is not a rough superset of ASCII. Using \u201CUTF-8\u201D instead.");
                return true;
            }

            final Encoding declared = Encoding.forName(label);
            final String canonName = declared.getCanonName();
            if (!declared.isAsciiSuperset()) {
                err("The encoding \u201C"
                        + label
                        + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm.");
                return false;
            }

            reportNameProblems(declared, canonName, label);
            reportInteroperability(declared, label);

            final Encoding actual = declared.getActualHtmlEncoding();
            if (actual != null) {
                // The substitution is announced before it is recorded.
                warn("Using \u201C" + actual.getCanonName() + "\u201D instead of the declared encoding \u201C" + label + "\u201D.");
                this.characterEncoding = actual;
            } else {
                this.characterEncoding = declared;
            }
            return true;
        } catch (UnsupportedCharsetException e) {
            err("Unsupported character encoding name: \u201C" + label + "\u201D. Will continue sniffing.");
            return false;
        }
    }

    /**
     * Tells whether the lower-cased label is one of the UTF-16 or UTF-32
     * labels that are replaced by UTF-8.
     */
    private static boolean isUtf16OrUtf32(String label) {
        for (int i = 0; i < UTF16_AND_UTF32_LABELS.length; i++) {
            if (UTF16_AND_UTF32_LABELS[i].equals(label)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reports an error when the label is unregistered, or registered but not
     * the preferred name of the encoding.
     */
    private void reportNameProblems(Encoding declared, String canonName,
            String label) throws SAXException {
        if (declared.isRegistered()) {
            if (!canonName.equals(label)) {
                err("The encoding \u201C" + label
                        + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
                        + canonName + "\u201D. (Charmod C024)");
            }
            return;
        }
        if (label.startsWith("x-")) {
            err("The encoding \u201C"
                    + label
                    + "\u201D is not an IANA-registered encoding. (Charmod C022)");
        } else {
            err("The encoding \u201C"
                    + label
                    + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)");
        }
    }

    /**
     * Warns when the encoding is flagged as one authors should not use, or
     * otherwise as obscure.
     */
    private void reportInteroperability(Encoding declared, String label)
            throws SAXException {
        if (declared.isShouldNot()) {
            warn("Authors should not use the character encoding \u201C"
                    + label
                    + "\u201D. It is recommended to use \u201CUTF-8\u201D.");
        } else if (declared.isObscure()) {
            warn("The character encoding \u201C" + label + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
        }
    }
}