001 /*
002 * Copyright (c) 2007 Henri Sivonen
003 * Copyright (c) 2008-2011 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.htmlparser.sax;
025
026 import java.io.IOException;
027 import java.io.OutputStream;
028 import java.io.OutputStreamWriter;
029 import java.io.UnsupportedEncodingException;
030 import java.io.Writer;
031 import java.util.Arrays;
032
033 import org.xml.sax.Attributes;
034 import org.xml.sax.ContentHandler;
035 import org.xml.sax.Locator;
036 import org.xml.sax.SAXException;
037 import org.xml.sax.ext.LexicalHandler;
038
/**
 * SAX {@link ContentHandler} / {@link LexicalHandler} that writes the events
 * it receives to a {@link Writer} as HTML markup.
 *
 * <p>Behaviour visible in this class:
 * <ul>
 * <li>{@link #startDocument()} emits {@code <!DOCTYPE html>}.</li>
 * <li>Only elements in the XHTML, SVG and MathML namespaces are written;
 * the tags of any other element, and of everything nested inside it, are
 * skipped.</li>
 * <li>Void elements get a start tag only.</li>
 * <li>Text inside the elements listed in {@code NON_ESCAPING} is written
 * verbatim; all other text has {@code <}, {@code >}, {@code &} and U+00A0
 * replaced by character references.</li>
 * <li>{@link #endDocument()} flushes <em>and closes</em> the writer.</li>
 * </ul>
 *
 * <p>The serializer keeps mutable nesting counters and is therefore meant to
 * be driven by a single event stream at a time.
 */
public class HtmlSerializer implements ContentHandler, LexicalHandler {

    /**
     * Local names of elements that are written without an end tag.
     * Must stay sorted: it is probed with {@link Arrays#binarySearch}.
     */
    private static final String[] VOID_ELEMENTS = { "area", "base", "basefont",
            "bgsound", "br", "col", "command", "embed", "frame", "hr", "img",
            "input", "keygen", "link", "meta", "param", "source", "track",
            "wbr" };

    /**
     * Local names of elements whose text content is written without
     * escaping. Must stay sorted: it is probed with
     * {@link Arrays#binarySearch}.
     */
    private static final String[] NON_ESCAPING = { "iframe", "noembed",
            "noframes", "noscript", "plaintext", "script", "style", "xmp" };

    private static final String NS_XHTML = "http://www.w3.org/1999/xhtml";

    private static final String NS_SVG = "http://www.w3.org/2000/svg";

    private static final String NS_MATHML = "http://www.w3.org/1998/Math/MathML";

    private static final String NS_XLINK = "http://www.w3.org/1999/xlink";

    private static final String NS_XML = "http://www.w3.org/XML/1998/namespace";

    /**
     * Wraps a byte stream in a UTF-8 encoding writer.
     *
     * @param out the stream to write encoded output to
     * @return a writer that encodes to {@code out} as UTF-8
     */
    private static Writer wrap(OutputStream out) {
        try {
            return new OutputStreamWriter(out, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new RuntimeException(e);
        }
    }

    /**
     * Nesting depth inside an element whose tags are being suppressed
     * (unsupported namespace, or the "inside" of a void element).
     */
    private int ignoreLevel = 0;

    /**
     * Nesting depth inside a {@code NON_ESCAPING} element; while it is
     * positive, text is written verbatim and comments are dropped.
     */
    private int escapeLevel = 0;

    private final Writer writer;

    /**
     * Creates a serializer that writes UTF-8 encoded output.
     *
     * @param out the destination stream
     */
    public HtmlSerializer(OutputStream out) {
        this(wrap(out));
    }

    /**
     * Creates a serializer that writes to the given writer.
     *
     * @param out the destination writer; closed by {@link #endDocument()}
     */
    public HtmlSerializer(Writer out) {
        this.writer = out;
    }

    /**
     * Writes a run of text, escaping it unless the current element is one of
     * the {@code NON_ESCAPING} elements.
     *
     * <p>Note: this method does not consult {@code ignoreLevel}, so text that
     * occurs inside a skipped element is still written.
     *
     * @param ch buffer holding the characters
     * @param start index of the first character to write
     * @param length number of characters to write
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        try {
            if (escapeLevel > 0) {
                writer.write(ch, start, length);
            } else {
                writeEscapedText(ch, start, length);
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes text with the characters that are significant in HTML text
     * content replaced by character references.
     */
    private void writeEscapedText(char[] ch, int start, int length)
            throws IOException {
        int end = start + length;
        for (int i = start; i < end; i++) {
            char c = ch[i];
            switch (c) {
                case '<':
                    writer.write("&lt;");
                    break;
                case '>':
                    writer.write("&gt;");
                    break;
                case '&':
                    writer.write("&amp;");
                    break;
                case '\u00A0':
                    // Written as a reference so the no-break space stays
                    // visible and distinguishable from an ordinary space.
                    writer.write("&nbsp;");
                    break;
                default:
                    writer.write(c);
                    break;
            }
        }
    }

    /**
     * Writes an attribute value (without the surrounding quotes), escaping
     * the characters that are significant inside a double-quoted attribute.
     */
    private void writeEscapedAttributeValue(String val) throws IOException {
        for (int j = 0; j < val.length(); j++) {
            char c = val.charAt(j);
            switch (c) {
                case '"':
                    writer.write("&quot;");
                    break;
                case '&':
                    writer.write("&amp;");
                    break;
                case '\u00A0':
                    writer.write("&nbsp;");
                    break;
                default:
                    writer.write(c);
                    break;
            }
        }
    }

    /**
     * Flushes and closes the underlying writer.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void endDocument() throws SAXException {
        try {
            writer.flush();
            writer.close();
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes the end tag for an element unless the element is being skipped
     * (or is a void element, whose "content" level is unwound here).
     *
     * @param uri namespace URI of the element (unused)
     * @param localName local name, written as the tag name
     * @param qName qualified name (unused)
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        if (escapeLevel > 0) {
            escapeLevel--;
        }
        if (ignoreLevel > 0) {
            ignoreLevel--;
        } else {
            try {
                writer.write('<');
                writer.write('/');
                writer.write(localName);
                writer.write('>');
            } catch (IOException e) {
                throw new SAXException(e);
            }
        }
    }

    /** Treats ignorable whitespace exactly like ordinary character data. */
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        characters(ch, start, length);
    }

    /** Processing instructions are not serialized. */
    public void processingInstruction(String target, String data)
            throws SAXException {
    }

    /** The locator is not used. */
    public void setDocumentLocator(Locator locator) {
    }

    /**
     * Writes the HTML doctype followed by a line feed.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void startDocument() throws SAXException {
        try {
            writer.write("<!DOCTYPE html>\n");
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes the start tag of an element together with its serializable
     * attributes.
     *
     * <p>Elements outside the XHTML, SVG and MathML namespaces, and elements
     * nested inside an already skipped element, only bump
     * {@code ignoreLevel}. Attributes are kept when they are in no namespace,
     * in the XLink namespace on a non-XHTML element (written with an
     * {@code xlink:} prefix), or in the XML namespace (written with an
     * {@code xml:} prefix on non-XHTML elements; on XHTML elements only
     * {@code lang} is kept, unprefixed). All other attributes are dropped.
     *
     * @param uri namespace URI of the element
     * @param localName local name, written as the tag name
     * @param qName qualified name (unused)
     * @param atts the element's attributes
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void startElement(String uri, String localName, String qName,
            Attributes atts) throws SAXException {
        if (escapeLevel > 0) {
            // Track nesting so the matching endElement calls unwind correctly.
            escapeLevel++;
        }
        boolean xhtml = NS_XHTML.equals(uri);
        if (ignoreLevel > 0
                || !(xhtml || NS_SVG.equals(uri) || NS_MATHML.equals(uri))) {
            ignoreLevel++;
            return;
        }
        try {
            writer.write('<');
            writer.write(localName);
            for (int i = 0; i < atts.getLength(); i++) {
                String attUri = atts.getURI(i);
                String attLocal = atts.getLocalName(i);
                if (attUri.length() == 0) {
                    writer.write(' ');
                } else if (!xhtml && NS_XLINK.equals(attUri)) {
                    writer.write(" xlink:");
                } else if (NS_XML.equals(attUri)) {
                    if (xhtml) {
                        if ("lang".equals(attLocal)) {
                            writer.write(' ');
                        } else {
                            continue;
                        }
                    } else {
                        writer.write(" xml:");
                    }
                } else {
                    continue;
                }
                writer.write(attLocal);
                writer.write('=');
                writer.write('"');
                writeEscapedAttributeValue(atts.getValue(i));
                writer.write('"');
            }
            writer.write('>');
            if (Arrays.binarySearch(VOID_ELEMENTS, localName) > -1) {
                // No end tag for void elements: the pending endElement call
                // just decrements ignoreLevel again.
                ignoreLevel++;
                return;
            }
            if ("pre".equals(localName) || "textarea".equals(localName)
                    || "listing".equals(localName)) {
                // HTML parsers drop one line feed directly after these start
                // tags, so emit one that can be dropped safely.
                writer.write('\n');
            }
            if (escapeLevel == 0
                    && Arrays.binarySearch(NON_ESCAPING, localName) > -1) {
                escapeLevel = 1;
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes a comment, unless it occurs inside a skipped element or inside
     * an element whose content is written verbatim.
     *
     * @param ch buffer holding the comment text
     * @param start index of the first character
     * @param length number of characters
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void comment(char[] ch, int start, int length) throws SAXException {
        if (ignoreLevel > 0 || escapeLevel > 0) {
            return;
        }
        try {
            writer.write("<!--");
            writer.write(ch, start, length);
            writer.write("-->");
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /** CDATA section boundaries are not serialized. */
    public void endCDATA() throws SAXException {
    }

    /** DTD events are ignored; the doctype is written by startDocument. */
    public void endDTD() throws SAXException {
    }

    /** Entity boundaries are not serialized. */
    public void endEntity(String name) throws SAXException {
    }

    /** CDATA section boundaries are not serialized. */
    public void startCDATA() throws SAXException {
    }

    /** DTD events are ignored; the doctype is written by startDocument. */
    public void startDTD(String name, String publicId, String systemId)
            throws SAXException {
    }

    /** Entity boundaries are not serialized. */
    public void startEntity(String name) throws SAXException {
    }

    /** Prefix mappings are not serialized. */
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
    }

    /** Prefix mappings are not serialized. */
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    /** Skipped entities are not serialized. */
    public void skippedEntity(String name) throws SAXException {
    }

}