001 /*
002 * Copyright (c) 2003, 2004 Henri Sivonen and Taavi Hupponen
003 * Copyright (c) 2006 Henri Sivonen
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.xml;
025
026 import java.io.IOException;
027 import java.io.OutputStream;
028 import java.io.OutputStreamWriter;
029 import java.io.UnsupportedEncodingException;
030 import java.io.Writer;
031 import java.util.Arrays;
032
033 import nu.validator.io.NcrEscapingWindows1252OutputStreamWriter;
034
035 import org.xml.sax.Attributes;
036 import org.xml.sax.ContentHandler;
037 import org.xml.sax.Locator;
038 import org.xml.sax.SAXException;
039 import org.xml.sax.XMLReader;
040
041
042 /**
043 * Serializes a sequence of SAX events representing an XHTML 1.0 Strict document
044 * to an <code>OutputStream</code> as a UTF-8-encoded HTML 4.01 Strict
045 * document. The SAX events must represent a valid XHTML 1.0 document, except
046 * the namespace prefixes don't matter and there may be
047 * <code>startElement</code> and <code>endElement</code> calls for elements
048 * from other namespaces. The <code>startElement</code> and
049 * <code>endElement</code> calls for non-XHTML elements are ignored. No
050 * validity checking is performed. Hence, the emitter of the SAX events is
051 * responsible for making sure the events represent a document that meets the
052 * above requirements. The <code>OutputStream</code> is closed when the end of
053 * the document is seen.
054 *
055 * @version $Id: HtmlSerializer.java 9 2007-08-11 08:40:38Z hsivonen $
056 * @author hsivonen
057 * @author taavi
058 */
059 public class HtmlSerializer implements ContentHandler {
060
061 public final static int NO_DOCTYPE = 0;
062
063 public final static int DOCTYPE_HTML401_TRANSITIONAL = 1;
064
065 public final static int DOCTYPE_HTML401_STRICT = 2;
066
067 public final static int DOCTYPE_HTML5 = 3;
068
069 /**
070 * The XHTML namespace URI
071 */
072 private final static String XHTML_NS = "http://www.w3.org/1999/xhtml";
073
074 /**
075 * HTML 4.01 elements which don't have an end tag
076 */
077 private static final String[] emptyElements = { "area", "base", "basefont",
078 "br", "col", "command", "frame", "hr", "img", "input", "isindex",
079 "link", "meta", "param" };
080
081 /**
082 * Minimized "boolean" HTML attributes
083 */
084 private static final String[] booleanAttributes = { "active", "async",
085 "autofocus", "autosubmit", "checked", "compact", "declare",
086 "default", "defer", "disabled", "ismap", "multiple", "nohref",
087 "noresize", "noshade", "nowrap", "readonly", "required", "selected" };
088
089 /**
090 * The writer used for output
091 */
092 protected Writer writer;
093
094 private int doctype;
095
096 private String encoding;
097
098 private boolean emitMeta;
099
100 /**
101 * Creates a new instance of HtmlSerializer in the HTML 4.01 doctype mode
102 * with the UTF-8 encoding and no charset meta.
103 *
104 * @param out
105 * the stream to which the output is written
106 */
107 public HtmlSerializer(OutputStream out) {
108 this(out, DOCTYPE_HTML401_STRICT, false, "UTF-8");
109 }
110
111 public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta) {
112 this(out, doctype, emitMeta, "UTF-8");
113 }
114
115 public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta,
116 String enc) {
117 this.emitMeta = emitMeta;
118 if (doctype < 0 || doctype > 3) {
119 throw new IllegalArgumentException("Bad doctype constant.");
120 }
121 this.doctype = doctype;
122 if ("UTF-8".equalsIgnoreCase(enc)) {
123 try {
124 this.encoding = "UTF-8";
125 this.writer = new OutputStreamWriter(out, "UTF-8");
126 } catch (UnsupportedEncodingException uee) {
127 throw new RuntimeException("UTF-8 not supported", uee);
128 }
129 } else if ("Windows-1252".equalsIgnoreCase(enc)) {
130 this.encoding = "Windows-1252";
131 this.writer = new NcrEscapingWindows1252OutputStreamWriter(out);
132 } else {
133 throw new IllegalArgumentException(
134 "Encoding must be UTF-8 or Windows-1252.");
135 }
136 }
137
138 /**
139 * Writes out characters.
140 *
141 * @param ch
142 * the source array
143 * @param start
144 * the index of the first character to be written
145 * @param length
146 * the number of characters to write
147 *
148 * @throws SAXException
149 * if there are IO problems
150 */
151 public void characters(char[] ch, int start, int length)
152 throws SAXException {
153 try {
154 for (int j = 0; j < length; j++) {
155 char c = ch[start + j];
156 switch (c) {
157 case '<':
158 this.writer.write("<");
159 break;
160 case '>':
161 this.writer.write(">");
162 break;
163 case '&':
164 this.writer.write("&");
165 break;
166 default:
167 this.writer.write(c);
168 }
169 }
170 } catch (IOException ioe) {
171 throw (SAXException)new SAXException(ioe).initCause(ioe);
172 }
173 }
174
175 /**
176 * Must be called in the end.
177 *
178 * @throws SAXException
179 * if there are IO problems
180 */
181 public void endDocument() throws SAXException {
182 try {
183 this.writer.close();
184 } catch (IOException ioe) {
185 throw (SAXException)new SAXException(ioe).initCause(ioe);
186 }
187 }
188
189 /**
190 * Writes an end tag if the element is an XHTML element and is not an empty
191 * element in HTML 4.01 Strict.
192 *
193 * @param namespaceURI
194 * the XML namespace
195 * @param localName
196 * the element name in the namespace
197 * @param qName
198 * ignored
199 *
200 * @throws SAXException
201 * if there are IO problems
202 */
203 public void endElement(String namespaceURI, String localName, String qName)
204 throws SAXException {
205 try {
206 if (XHTML_NS.equals(namespaceURI)
207 && Arrays.binarySearch(emptyElements, localName) < 0) {
208 this.writer.write("</");
209 this.writer.write(localName);
210 this.writer.write('>');
211 }
212 } catch (IOException ioe) {
213 throw (SAXException)new SAXException(ioe).initCause(ioe);
214 }
215 }
216
217 /**
218 * Must be called first.
219 */
220 public void startDocument() throws SAXException {
221 try {
222 switch (doctype) {
223 case NO_DOCTYPE:
224 return;
225 case DOCTYPE_HTML5:
226 writer.write("<!DOCTYPE html>\n");
227 return;
228 case DOCTYPE_HTML401_STRICT:
229 writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\n");
230 return;
231 case DOCTYPE_HTML401_TRANSITIONAL:
232 writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n");
233 return;
234 }
235 } catch (IOException ioe) {
236 throw (SAXException)new SAXException(ioe).initCause(ioe);
237 }
238 }
239
240 /**
241 * Writes a start tag if the element is an XHTML element.
242 *
243 * @param namespaceURI
244 * the XML namespace
245 * @param localName
246 * the element name in the namespace
247 * @param qName
248 * ignored
249 * @param atts
250 * the attribute list
251 *
252 * @throws SAXException
253 * if there are IO problems
254 */
255 public void startElement(String namespaceURI, String localName,
256 String qName, Attributes atts) throws SAXException {
257 try {
258 if (XHTML_NS.equals(namespaceURI)) {
259
260 if ("meta".equals(localName)
261 && ((atts.getIndex("", "http-equiv") != -1) || (atts.getIndex(
262 "", "httpequiv") != -1))) {
263 return;
264 }
265
266 // start and element name
267 this.writer.write('<');
268 this.writer.write(localName);
269
270 // attributes
271 int length = atts.getLength();
272 boolean langPrinted = false;
273 for (int i = 0; i < length; i++) {
274 String ns = atts.getURI(i);
275 String name = null;
276 if ("".equals(ns)) {
277 name = atts.getLocalName(i);
278 } else if ("http://www.w3.org/XML/1998/namespace".equals(ns)
279 && "lang".equals(atts.getLocalName(i))) {
280 name = "lang";
281 }
282 if (name != null && !(langPrinted && "lang".equals(name))) {
283 this.writer.write(' ');
284 this.writer.write(name);
285 if ("lang".equals(name)) {
286 langPrinted = true;
287 }
288 if (Arrays.binarySearch(booleanAttributes, name) < 0) {
289 // write value, escape certain characters
290 this.writer.write("=\"");
291 String value = atts.getValue(i);
292 for (int j = 0; j < value.length(); j++) {
293 char c = value.charAt(j);
294 switch (c) {
295 case '<':
296 this.writer.write("<");
297 break;
298 case '>':
299 this.writer.write(">");
300 break;
301 case '&':
302 this.writer.write("&");
303 break;
304 case '"':
305 this.writer.write(""");
306 break;
307 default:
308 this.writer.write(c);
309 }
310 }
311
312 this.writer.write('"');
313 }
314 }
315 }
316
317 // close
318 this.writer.write('>');
319 if (emitMeta && "head".equals(localName)) {
320 this.writer.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=");
321 this.writer.write(encoding);
322 this.writer.write("\">");
323 }
324 }
325 } catch (IOException ioe) {
326 throw (SAXException)new SAXException(ioe).initCause(ioe);
327 }
328 }
329
330 /**
331 * Used for testing. Pass a file:// URL as the command line argument.
332 */
333 public static void main(String[] args) {
334 try {
335 javax.xml.parsers.SAXParserFactory fac = javax.xml.parsers.SAXParserFactory.newInstance();
336 fac.setNamespaceAware(true);
337 fac.setValidating(false);
338 XMLReader parser = fac.newSAXParser().getXMLReader();
339 parser.setContentHandler(new HtmlSerializer(System.out));
340 parser.parse(args[0]);
341 } catch (Exception e) {
342 throw new RuntimeException(e);
343 }
344 }
345
346 /** Does nothing. */
347 public void endPrefixMapping(String str) throws SAXException {
348 }
349
350 /** Does nothing. */
351 public void ignorableWhitespace(char[] values, int param, int param2)
352 throws SAXException {
353 }
354
355 /** Does nothing. */
356 public void processingInstruction(String str, String str1)
357 throws SAXException {
358 }
359
360 /** Does nothing. */
361 public void setDocumentLocator(Locator locator) {
362 }
363
364 /** Does nothing. */
365 public void skippedEntity(String str) throws SAXException {
366 }
367
368 /** Does nothing. */
369 public void startPrefixMapping(String str, String str1) throws SAXException {
370 }
371 }