001 /*
002 * Copyright (c) 2007 Henri Sivonen
003 * Copyright (c) 2008 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.htmlparser.sax;
025
026 import java.io.IOException;
027 import java.io.OutputStream;
028 import java.io.OutputStreamWriter;
029 import java.io.UnsupportedEncodingException;
030 import java.io.Writer;
031 import java.util.Arrays;
032
033 import org.xml.sax.Attributes;
034 import org.xml.sax.ContentHandler;
035 import org.xml.sax.Locator;
036 import org.xml.sax.SAXException;
037 import org.xml.sax.ext.LexicalHandler;
038
/**
 * Serializes a stream of SAX events as an HTML document.
 *
 * <p>The serializer writes {@code <!DOCTYPE html>} at the start of the
 * document, then start tags, attributes, character data, comments and end
 * tags as the corresponding events arrive. Only elements in the XHTML, SVG
 * and MathML namespaces are written; the tags of any other element, and of
 * every element nested inside it, are suppressed (character data is still
 * passed through). Void elements get no end tag, and the content of raw-text
 * style elements such as {@code script} and {@code style} is written without
 * escaping.
 *
 * <p>The instance keeps mutable nesting counters and writes to a single
 * {@link Writer}; it is meant to be driven by one SAX event stream.
 */
public class HtmlSerializer implements ContentHandler, LexicalHandler {

    /**
     * Local names of elements that are written as a start tag only.
     * Must stay sorted: it is probed with {@link Arrays#binarySearch}.
     */
    private static final String[] VOID_ELEMENTS = { "area", "base", "basefont",
            "bgsound", "br", "col", "command", "embed", "event-source",
            "frame", "hr", "img", "input", "link", "meta", "param", "source",
            "spacer", "wbr" };

    /**
     * Local names of elements whose character content is written verbatim.
     * Must stay sorted: it is probed with {@link Arrays#binarySearch}.
     */
    private static final String[] NON_ESCAPING = { "iframe", "noembed",
            "noframes", "noscript", "plaintext", "script", "style", "xmp" };

    private static final String XHTML_NS = "http://www.w3.org/1999/xhtml";

    private static final String SVG_NS = "http://www.w3.org/2000/svg";

    private static final String MATHML_NS = "http://www.w3.org/1998/Math/MathML";

    private static final String XLINK_NS = "http://www.w3.org/1999/xlink";

    private static final String XML_NS = "http://www.w3.org/XML/1998/namespace";

    /**
     * Wraps a byte stream in a UTF-8 encoding writer.
     *
     * @param out the stream to write encoded bytes to
     * @return a writer that encodes its output as UTF-8
     */
    private static Writer wrap(OutputStream out) {
        try {
            return new OutputStreamWriter(out, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Depth of currently open elements whose tags are being suppressed
     * (unsupported namespace, nested in such an element, or a void element
     * waiting for its end event).
     */
    private int ignoreLevel = 0;

    /**
     * Element depth counted from the innermost enclosing non-escaping
     * element; zero when character data must be escaped.
     */
    private int escapeLevel = 0;

    private final Writer writer;

    /**
     * Creates a serializer that writes UTF-8 encoded bytes.
     *
     * @param out the stream that receives the serialized document
     */
    public HtmlSerializer(OutputStream out) {
        this(wrap(out));
    }

    /**
     * Creates a serializer that writes characters to the given writer.
     *
     * @param out the writer that receives the serialized document; it is
     *            closed by {@link #endDocument()}
     */
    public HtmlSerializer(Writer out) {
        this.writer = out;
    }

    /**
     * Writes character data, escaping it unless the current position is
     * inside a non-escaping element.
     *
     * @param ch     the buffer holding the characters
     * @param start  index of the first character to write
     * @param length number of characters to write
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        try {
            if (escapeLevel > 0) {
                writer.write(ch, start, length);
            } else {
                writeEscapedText(ch, start, length);
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Flushes and closes the underlying writer.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void endDocument() throws SAXException {
        try {
            writer.flush();
            writer.close();
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes an end tag unless the element's tags are being suppressed.
     *
     * @param uri       the element's namespace URI (not consulted here)
     * @param localName the local name written into the end tag
     * @param qName     the qualified name (not consulted here)
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        if (escapeLevel > 0) {
            escapeLevel--;
        }
        if (ignoreLevel > 0) {
            // Closes a suppressed or void element: nothing to write.
            ignoreLevel--;
            return;
        }
        try {
            writer.write('<');
            writer.write('/');
            writer.write(localName);
            writer.write('>');
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Treats ignorable whitespace exactly like ordinary character data.
     *
     * @param ch     the buffer holding the characters
     * @param start  index of the first character to write
     * @param length number of characters to write
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        characters(ch, start, length);
    }

    /** Processing instructions are not serialized. */
    public void processingInstruction(String target, String data)
            throws SAXException {
    }

    /** The document locator is not used. */
    public void setDocumentLocator(Locator locator) {
    }

    /**
     * Writes the HTML doctype followed by a newline.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void startDocument() throws SAXException {
        try {
            writer.write("<!DOCTYPE html>\n");
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes a start tag with its serializable attributes.
     *
     * <p>Elements outside the XHTML, SVG and MathML namespaces, and elements
     * nested inside an already suppressed element, produce no output.
     * Attributes are written when they are in no namespace, in the XLink
     * namespace on a non-XHTML element (as {@code xlink:name}), or in the XML
     * namespace (as {@code xml:name} on non-XHTML elements; on XHTML elements
     * only {@code lang} is kept, without a prefix). All other attributes are
     * dropped.
     *
     * @param uri       the element's namespace URI
     * @param localName the local name written into the start tag
     * @param qName     the qualified name (not consulted here)
     * @param atts      the element's attributes
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void startElement(String uri, String localName, String qName,
            Attributes atts) throws SAXException {
        if (escapeLevel > 0) {
            escapeLevel++;
        }
        boolean xhtml = XHTML_NS.equals(uri);
        if (ignoreLevel > 0
                || !(xhtml || SVG_NS.equals(uri) || MATHML_NS.equals(uri))) {
            ignoreLevel++;
            return;
        }
        try {
            writer.write('<');
            writer.write(localName);
            for (int i = 0; i < atts.getLength(); i++) {
                String attUri = atts.getURI(i);
                String attLocal = atts.getLocalName(i);
                if (attUri.length() == 0) {
                    writer.write(' ');
                } else if (!xhtml && XLINK_NS.equals(attUri)) {
                    writer.write(" xlink:");
                } else if (XML_NS.equals(attUri)) {
                    if (xhtml) {
                        if ("lang".equals(attLocal)) {
                            writer.write(' ');
                        } else {
                            continue;
                        }
                    } else {
                        writer.write(" xml:");
                    }
                } else {
                    continue;
                }
                writer.write(attLocal);
                writer.write('=');
                writer.write('"');
                writeEscapedAttributeValue(atts.getValue(i));
                writer.write('"');
            }
            writer.write('>');
            if (Arrays.binarySearch(VOID_ELEMENTS, localName) >= 0) {
                // No end tag for void elements: make the matching
                // endElement() call a no-op.
                ignoreLevel++;
                return;
            }
            if ("pre".equals(localName) || "textarea".equals(localName)
                    || "listing".equals(localName)) {
                // NOTE(review): presumably compensates for HTML parsers
                // dropping a newline that directly follows these start tags.
                writer.write('\n');
            }
            if (escapeLevel == 0
                    && Arrays.binarySearch(NON_ESCAPING, localName) >= 0) {
                escapeLevel = 1;
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes a comment verbatim unless inside a suppressed element.
     *
     * @param ch     the buffer holding the comment text
     * @param start  index of the first character of the comment
     * @param length number of characters in the comment
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void comment(char[] ch, int start, int length) throws SAXException {
        if (ignoreLevel > 0) {
            return;
        }
        try {
            writer.write("<!--");
            writer.write(ch, start, length);
            writer.write("-->");
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /** CDATA section boundaries are not serialized. */
    public void endCDATA() throws SAXException {
    }

    /** DTD events are not serialized. */
    public void endDTD() throws SAXException {
    }

    /** Entity boundaries are not serialized. */
    public void endEntity(String name) throws SAXException {
    }

    /** CDATA section boundaries are not serialized. */
    public void startCDATA() throws SAXException {
    }

    /** DTD events are not serialized. */
    public void startDTD(String name, String publicId, String systemId)
            throws SAXException {
    }

    /** Entity boundaries are not serialized. */
    public void startEntity(String name) throws SAXException {
    }

    /** Prefix mappings are not serialized. */
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
    }

    /** Prefix mappings are not serialized. */
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    /** Skipped entities are not serialized. */
    public void skippedEntity(String name) throws SAXException {
    }

    /**
     * Writes text content, replacing {@code <}, {@code >}, {@code &} and
     * U+00A0 with {@code &lt;}, {@code &gt;}, {@code &amp;} and
     * {@code &nbsp;}.
     */
    private void writeEscapedText(char[] ch, int start, int length)
            throws IOException {
        int end = start + length;
        for (int i = start; i < end; i++) {
            char c = ch[i];
            switch (c) {
                case '<':
                    writer.write("&lt;");
                    break;
                case '>':
                    writer.write("&gt;");
                    break;
                case '&':
                    writer.write("&amp;");
                    break;
                case '\u00A0':
                    writer.write("&nbsp;");
                    break;
                default:
                    writer.write(c);
                    break;
            }
        }
    }

    /**
     * Writes a double-quoted attribute value's content, replacing {@code "},
     * {@code &} and U+00A0 with {@code &quot;}, {@code &amp;} and
     * {@code &nbsp;}.
     */
    private void writeEscapedAttributeValue(String val) throws IOException {
        for (int j = 0; j < val.length(); j++) {
            char c = val.charAt(j);
            switch (c) {
                case '"':
                    writer.write("&quot;");
                    break;
                case '&':
                    writer.write("&amp;");
                    break;
                case '\u00A0':
                    writer.write("&nbsp;");
                    break;
                default:
                    writer.write(c);
                    break;
            }
        }
    }

}