/*
 * Copyright (c) 2003, 2004 Henri Sivonen and Taavi Hupponen
 * Copyright (c) 2006 Henri Sivonen
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.xml;

import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.Arrays;

import nu.validator.io.NcrEscapingWindows1252OutputStreamWriter;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

/**
 * Serializes a sequence of SAX events representing an XHTML document to an
 * <code>OutputStream</code> as an HTML document encoded as UTF-8 or
 * Windows-1252, optionally preceded by an HTML 4.01 Strict, HTML 4.01
 * Transitional or HTML5 doctype. The SAX events must represent a valid XHTML
 * document, except the namespace prefixes don't matter and there may be
 * <code>startElement</code> and <code>endElement</code> calls for elements
 * from other namespaces. The <code>startElement</code> and
 * <code>endElement</code> calls for non-XHTML elements are ignored. No
 * validity checking is performed. Hence, the emitter of the SAX events is
 * responsible for making sure the events represent a document that meets the
 * above requirements. The writer wrapping the <code>OutputStream</code> is
 * closed when <code>endDocument</code> is called.
 *
 * @version $Id: HtmlSerializer.java 9 2007-08-11 08:40:38Z hsivonen $
 * @author hsivonen
 * @author taavi
 */
public class HtmlSerializer implements ContentHandler {

    /** Doctype mode: do not write a doctype. */
    public final static int NO_DOCTYPE = 0;

    /** Doctype mode: write the HTML 4.01 Transitional doctype. */
    public final static int DOCTYPE_HTML401_TRANSITIONAL = 1;

    /** Doctype mode: write the HTML 4.01 Strict doctype. */
    public final static int DOCTYPE_HTML401_STRICT = 2;

    /** Doctype mode: write the HTML5 doctype. */
    public final static int DOCTYPE_HTML5 = 3;

    /**
     * The XHTML namespace URI
     */
    private final static String XHTML_NS = "http://www.w3.org/1999/xhtml";

    /**
     * The namespace URI of <code>xml:</code>-prefixed attributes
     */
    private final static String XML_NS = "http://www.w3.org/XML/1998/namespace";

    /**
     * Elements which don't have an end tag. Looked up with
     * <code>Arrays.binarySearch</code>, so the array MUST stay sorted.
     */
    private static final String[] emptyElements = { "area", "base", "basefont",
            "br", "col", "command", "frame", "hr", "img", "input", "isindex",
            "link", "meta", "param" };

    /**
     * Minimized "boolean" HTML attributes (written without a value). Looked
     * up with <code>Arrays.binarySearch</code>, so the array MUST stay
     * sorted.
     */
    private static final String[] booleanAttributes = { "active", "async",
            "autofocus", "autosubmit", "checked", "compact", "declare",
            "default", "defer", "disabled", "ismap", "multiple", "nohref",
            "noresize", "noshade", "nowrap", "readonly", "required", "selected" };

    /**
     * The writer used for output
     */
    protected Writer writer;

    /** One of the doctype mode constants of this class. */
    private final int doctype;

    /** Canonical name of the output encoding, as written into the meta. */
    private final String encoding;

    /** Whether a charset meta is written right after the head start tag. */
    private final boolean emitMeta;

    /**
     * Creates a new instance of HtmlSerializer in the HTML 4.01 Strict
     * doctype mode with the UTF-8 encoding and no charset meta.
     *
     * @param out
     *            the stream to which the output is written
     */
    public HtmlSerializer(OutputStream out) {
        this(out, DOCTYPE_HTML401_STRICT, false, "UTF-8");
    }

    /**
     * Creates a new instance of HtmlSerializer with the UTF-8 encoding.
     *
     * @param out
     *            the stream to which the output is written
     * @param doctype
     *            one of the doctype mode constants of this class
     * @param emitMeta
     *            whether to write a charset meta after the head start tag
     *
     * @throws IllegalArgumentException
     *             if <code>doctype</code> is not a known constant
     */
    public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta) {
        this(out, doctype, emitMeta, "UTF-8");
    }

    /**
     * Creates a new instance of HtmlSerializer.
     *
     * @param out
     *            the stream to which the output is written
     * @param doctype
     *            one of the doctype mode constants of this class
     * @param emitMeta
     *            whether to write a charset meta after the head start tag
     * @param enc
     *            the output encoding; "UTF-8" or "Windows-1252", compared
     *            case-insensitively
     *
     * @throws IllegalArgumentException
     *             if <code>doctype</code> is not a known constant or
     *             <code>enc</code> is not a supported encoding
     */
    public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta,
            String enc) {
        if (doctype < NO_DOCTYPE || doctype > DOCTYPE_HTML5) {
            throw new IllegalArgumentException("Bad doctype constant.");
        }
        this.doctype = doctype;
        this.emitMeta = emitMeta;
        if ("UTF-8".equalsIgnoreCase(enc)) {
            this.encoding = "UTF-8";
            try {
                this.writer = new OutputStreamWriter(out, "UTF-8");
            } catch (UnsupportedEncodingException uee) {
                throw new RuntimeException("UTF-8 not supported", uee);
            }
        } else if ("Windows-1252".equalsIgnoreCase(enc)) {
            this.encoding = "Windows-1252";
            this.writer = new NcrEscapingWindows1252OutputStreamWriter(out);
        } else {
            throw new IllegalArgumentException(
                    "Encoding must be UTF-8 or Windows-1252.");
        }
    }

    /**
     * Wraps an <code>IOException</code> in a <code>SAXException</code> that
     * reports it as its cause.
     *
     * @param ioe
     *            the IO failure to wrap
     * @return the wrapping exception, ready to be thrown
     */
    private static SAXException wrap(IOException ioe) {
        SAXException se = new SAXException(ioe);
        // Throwable.initCause throws IllegalStateException when the cause
        // has already been set, which is the case on JDKs where
        // SAXException(Exception) passes the exception to its superclass
        // constructor. Only fill in the cause when it is actually missing.
        if (se.getCause() == null) {
            se.initCause(ioe);
        }
        return se;
    }

    /**
     * Writes one character, replacing markup-significant characters with
     * character references.
     *
     * @param c
     *            the character to write
     * @param inAttributeValue
     *            <code>true</code> if the character is part of a
     *            double-quoted attribute value, in which case the double
     *            quote is escaped as well
     *
     * @throws IOException
     *             if the underlying writer fails
     */
    private void writeEscaped(char c, boolean inAttributeValue)
            throws IOException {
        switch (c) {
            case '<':
                this.writer.write("&lt;");
                break;
            case '>':
                this.writer.write("&gt;");
                break;
            case '&':
                this.writer.write("&amp;");
                break;
            case '"':
                if (inAttributeValue) {
                    this.writer.write("&quot;");
                } else {
                    this.writer.write(c);
                }
                break;
            default:
                this.writer.write(c);
        }
    }

    /**
     * Writes out characters, escaping <code>&lt;</code>, <code>&gt;</code>
     * and <code>&amp;</code>.
     *
     * @param ch
     *            the source array
     * @param start
     *            the index of the first character to be written
     * @param length
     *            the number of characters to write
     *
     * @throws SAXException
     *             if there are IO problems
     */
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        try {
            int end = start + length;
            for (int i = start; i < end; i++) {
                writeEscaped(ch[i], false);
            }
        } catch (IOException ioe) {
            throw wrap(ioe);
        }
    }

    /**
     * Must be called in the end. Closes the writer.
     *
     * @throws SAXException
     *             if there are IO problems
     */
    public void endDocument() throws SAXException {
        try {
            this.writer.close();
        } catch (IOException ioe) {
            throw wrap(ioe);
        }
    }

    /**
     * Writes an end tag if the element is an XHTML element and is not one of
     * the elements that have no end tag in HTML.
     *
     * @param namespaceURI
     *            the XML namespace
     * @param localName
     *            the element name in the namespace
     * @param qName
     *            ignored
     *
     * @throws SAXException
     *             if there are IO problems
     */
    public void endElement(String namespaceURI, String localName, String qName)
            throws SAXException {
        if (!XHTML_NS.equals(namespaceURI)
                || Arrays.binarySearch(emptyElements, localName) >= 0) {
            return;
        }
        try {
            this.writer.write("</");
            this.writer.write(localName);
            this.writer.write('>');
        } catch (IOException ioe) {
            throw wrap(ioe);
        }
    }

    /**
     * Must be called first. Writes the doctype selected at construction
     * time, or nothing in the <code>NO_DOCTYPE</code> mode.
     *
     * @throws SAXException
     *             if there are IO problems
     */
    public void startDocument() throws SAXException {
        try {
            switch (doctype) {
                case DOCTYPE_HTML5:
                    writer.write("<!DOCTYPE html>\n");
                    break;
                case DOCTYPE_HTML401_STRICT:
                    writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\n");
                    break;
                case DOCTYPE_HTML401_TRANSITIONAL:
                    writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n");
                    break;
                default:
                    // NO_DOCTYPE: nothing to write.
                    break;
            }
        } catch (IOException ioe) {
            throw wrap(ioe);
        }
    }

    /**
     * Writes a start tag if the element is an XHTML element. Attributes in
     * no namespace are written under their local name and
     * <code>xml:lang</code> is written as <code>lang</code>; other
     * namespaced attributes are dropped. <code>meta</code> elements carrying
     * an <code>http-equiv</code> (or <code>httpequiv</code>) attribute are
     * not written at all. If the charset meta was requested at construction
     * time, it is written immediately after the <code>head</code> start tag.
     *
     * @param namespaceURI
     *            the XML namespace
     * @param localName
     *            the element name in the namespace
     * @param qName
     *            ignored
     * @param atts
     *            the attribute list
     *
     * @throws SAXException
     *             if there are IO problems
     */
    public void startElement(String namespaceURI, String localName,
            String qName, Attributes atts) throws SAXException {
        if (!XHTML_NS.equals(namespaceURI)) {
            return;
        }
        // Drop http-equiv metas from the input. NOTE(review): presumably
        // because the declaration in the source document need not match the
        // output encoding -- confirm. No stray end tag results, since "meta"
        // is listed in emptyElements and endElement writes nothing for it.
        if ("meta".equals(localName)
                && (atts.getIndex("", "http-equiv") != -1
                        || atts.getIndex("", "httpequiv") != -1)) {
            return;
        }
        try {
            // start and element name
            this.writer.write('<');
            this.writer.write(localName);

            writeAttributes(atts);

            // close
            this.writer.write('>');
            if (emitMeta && "head".equals(localName)) {
                this.writer.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=");
                this.writer.write(encoding);
                this.writer.write("\">");
            }
        } catch (IOException ioe) {
            throw wrap(ioe);
        }
    }

    /**
     * Writes the attributes of a start tag. Boolean attributes are written
     * minimized (name only); all others are written with a double-quoted,
     * escaped value. At most one <code>lang</code> attribute is written even
     * if both <code>lang</code> and <code>xml:lang</code> are present; the
     * first one encountered wins.
     *
     * @param atts
     *            the attribute list
     *
     * @throws IOException
     *             if the underlying writer fails
     */
    private void writeAttributes(Attributes atts) throws IOException {
        boolean langPrinted = false;
        int count = atts.getLength();
        for (int i = 0; i < count; i++) {
            String ns = atts.getURI(i);
            String name;
            if ("".equals(ns)) {
                name = atts.getLocalName(i);
            } else if (XML_NS.equals(ns)
                    && "lang".equals(atts.getLocalName(i))) {
                name = "lang";
            } else {
                // attribute from a foreign namespace: not serialized
                continue;
            }
            if (name == null) {
                continue;
            }
            boolean isLang = "lang".equals(name);
            if (isLang && langPrinted) {
                continue;
            }
            this.writer.write(' ');
            this.writer.write(name);
            if (isLang) {
                langPrinted = true;
            }
            if (Arrays.binarySearch(booleanAttributes, name) < 0) {
                // write value, escape certain characters
                this.writer.write("=\"");
                String value = atts.getValue(i);
                for (int j = 0; j < value.length(); j++) {
                    writeEscaped(value.charAt(j), true);
                }
                this.writer.write('"');
            }
        }
    }

    /**
     * Used for testing. Pass a file:// URL as the command line argument.
     *
     * @param args
     *            the command line arguments; the first one is the URL of the
     *            document to parse and serialize to standard output
     *
     * @throws IllegalArgumentException
     *             if no argument is given
     * @throws RuntimeException
     *             wrapping any failure to parse or serialize
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            throw new IllegalArgumentException(
                    "Usage: HtmlSerializer <file:// URL>");
        }
        try {
            javax.xml.parsers.SAXParserFactory fac = javax.xml.parsers.SAXParserFactory.newInstance();
            fac.setNamespaceAware(true);
            fac.setValidating(false);
            XMLReader parser = fac.newSAXParser().getXMLReader();
            parser.setContentHandler(new HtmlSerializer(System.out));
            parser.parse(args[0]);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /** Does nothing. */
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    /** Does nothing. */
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
    }

    /** Does nothing. */
    public void processingInstruction(String target, String data)
            throws SAXException {
    }

    /** Does nothing. */
    public void setDocumentLocator(Locator locator) {
    }

    /** Does nothing. */
    public void skippedEntity(String name) throws SAXException {
    }

    /** Does nothing. */
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
    }
}