001 /* 002 * Copyright (c) 2007 Henri Sivonen 003 * Copyright (c) 2008-2011 Mozilla Foundation 004 * 005 * Permission is hereby granted, free of charge, to any person obtaining a 006 * copy of this software and associated documentation files (the "Software"), 007 * to deal in the Software without restriction, including without limitation 008 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 009 * and/or sell copies of the Software, and to permit persons to whom the 010 * Software is furnished to do so, subject to the following conditions: 011 * 012 * The above copyright notice and this permission notice shall be included in 013 * all copies or substantial portions of the Software. 014 * 015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 021 * DEALINGS IN THE SOFTWARE. 022 */ 023 024 package nu.validator.htmlparser.sax; 025 026 import java.io.IOException; 027 import java.io.OutputStream; 028 import java.io.OutputStreamWriter; 029 import java.io.UnsupportedEncodingException; 030 import java.io.Writer; 031 import java.util.Arrays; 032 033 import org.xml.sax.Attributes; 034 import org.xml.sax.ContentHandler; 035 import org.xml.sax.Locator; 036 import org.xml.sax.SAXException; 037 import org.xml.sax.ext.LexicalHandler; 038 039 public class HtmlSerializer implements ContentHandler, LexicalHandler { 040 041 private static final String[] VOID_ELEMENTS = { "area", "base", "basefont", 042 "bgsound", "br", "col", "command", "embed", "frame", "hr", "img", 043 "input", "keygen", "link", "meta", "param", "source", "track", 044 "wbr" }; 045 046 private static final String[] NON_ESCAPING = { "iframe", "noembed", 047 "noframes", "noscript", "plaintext", "script", "style", "xmp" }; 048 049 private static Writer wrap(OutputStream out) { 050 try { 051 return new OutputStreamWriter(out, "UTF-8"); 052 } catch (UnsupportedEncodingException e) { 053 throw new RuntimeException(e); 054 } 055 } 056 057 private int ignoreLevel = 0; 058 059 private int escapeLevel = 0; 060 061 private final Writer writer; 062 063 public HtmlSerializer(OutputStream out) { 064 this(wrap(out)); 065 } 066 067 public HtmlSerializer(Writer out) { 068 this.writer = out; 069 } 070 071 public void characters(char[] ch, int start, int length) 072 throws SAXException { 073 try { 074 if (escapeLevel > 0) { 075 writer.write(ch, start, length); 076 } else { 077 for (int i = start; i < start + length; i++) { 078 char c = ch[i]; 079 switch (c) { 080 case '<': 081 writer.write("<"); 082 break; 083 case '>': 084 writer.write(">"); 085 break; 086 case '&': 087 writer.write("&"); 088 break; 089 case '\u00A0': 090 writer.write(" "); 091 break; 092 default: 093 writer.write(c); 094 break; 095 } 096 } 097 } 098 } catch (IOException e) { 099 throw new SAXException(e); 100 } 101 } 102 103 public void endDocument() throws SAXException { 104 try { 105 writer.flush(); 106 writer.close(); 107 } catch (IOException e) { 108 throw new SAXException(e); 109 } 110 } 111 112 public void endElement(String uri, String localName, String qName) 113 throws SAXException { 114 if (escapeLevel > 0) { 115 escapeLevel--; 116 } 117 if (ignoreLevel > 0) { 118 ignoreLevel--; 119 } else { 120 try { 121 writer.write('<'); 122 writer.write('/'); 123 writer.write(localName); 124 writer.write('>'); 125 } catch (IOException e) { 126 throw new SAXException(e); 127 } 128 } 129 } 130 131 public void ignorableWhitespace(char[] ch, int start, int length) 132 throws SAXException { 133 characters(ch, start, length); 134 } 135 136 public void processingInstruction(String target, String data) 137 throws SAXException { 138 } 139 140 public void setDocumentLocator(Locator locator) { 141 } 142 143 public void startDocument() throws SAXException { 144 try { 145 writer.write("<!DOCTYPE html>\n"); 146 } catch (IOException e) { 147 throw new SAXException(e); 148 } 149 } 150 151 public void startElement(String uri, String localName, String qName, 152 Attributes atts) throws SAXException { 153 if (escapeLevel > 0) { 154 escapeLevel++; 155 } 156 boolean xhtml = "http://www.w3.org/1999/xhtml".equals(uri); 157 if (ignoreLevel > 0 158 || !(xhtml || "http://www.w3.org/2000/svg".equals(uri) || "http://www.w3.org/1998/Math/MathML".equals(uri))) { 159 ignoreLevel++; 160 return; 161 } 162 try { 163 writer.write('<'); 164 writer.write(localName); 165 for (int i = 0; i < atts.getLength(); i++) { 166 String attUri = atts.getURI(i); 167 String attLocal = atts.getLocalName(i); 168 if (attUri.length() == 0) { 169 writer.write(' '); 170 } else if (!xhtml 171 && "http://www.w3.org/1999/xlink".equals(attUri)) { 172 writer.write(" xlink:"); 173 } else if ("http://www.w3.org/XML/1998/namespace".equals(attUri)) { 174 if (xhtml) { 175 if ("lang".equals(attLocal)) { 176 writer.write(' '); 177 } else { 178 continue; 179 } 180 } else { 181 writer.write(" xml:"); 182 } 183 } else { 184 continue; 185 } 186 writer.write(atts.getLocalName(i)); 187 writer.write('='); 188 writer.write('"'); 189 String val = atts.getValue(i); 190 for (int j = 0; j < val.length(); j++) { 191 char c = val.charAt(j); 192 switch (c) { 193 case '"': 194 writer.write("""); 195 break; 196 case '&': 197 writer.write("&"); 198 break; 199 case '\u00A0': 200 writer.write(" "); 201 break; 202 default: 203 writer.write(c); 204 break; 205 } 206 } 207 writer.write('"'); 208 } 209 writer.write('>'); 210 if (Arrays.binarySearch(VOID_ELEMENTS, localName) > -1) { 211 ignoreLevel++; 212 return; 213 } 214 if ("pre".equals(localName) || "textarea".equals(localName) 215 || "listing".equals(localName)) { 216 writer.write('\n'); 217 } 218 if (escapeLevel == 0 219 && Arrays.binarySearch(NON_ESCAPING, localName) > -1) { 220 escapeLevel = 1; 221 } 222 } catch (IOException e) { 223 throw new SAXException(e); 224 } 225 } 226 227 public void comment(char[] ch, int start, int length) throws SAXException { 228 if (ignoreLevel > 0 || escapeLevel > 0) { 229 return; 230 } 231 try { 232 writer.write("<!--"); 233 writer.write(ch, start, length); 234 writer.write("-->"); 235 } catch (IOException e) { 236 throw new SAXException(e); 237 } 238 } 239 240 public void endCDATA() throws SAXException { 241 } 242 243 public void endDTD() throws SAXException { 244 } 245 246 public void endEntity(String name) throws SAXException { 247 } 248 249 public void startCDATA() throws SAXException { 250 } 251 252 public void startDTD(String name, String publicId, String systemId) 253 throws SAXException { 254 } 255 256 public void startEntity(String name) throws SAXException { 257 } 258 259 public void startPrefixMapping(String prefix, String uri) 260 throws SAXException { 261 } 262 263 public void endPrefixMapping(String prefix) throws SAXException { 264 } 265 266 public void skippedEntity(String name) throws SAXException { 267 } 268 269 }