001    /*
002     * Copyright (c) 2003, 2004 Henri Sivonen and Taavi Hupponen
003     * Copyright (c) 2006 Henri Sivonen
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.xml;
025    
026    import java.io.IOException;
027    import java.io.OutputStream;
028    import java.io.OutputStreamWriter;
029    import java.io.UnsupportedEncodingException;
030    import java.io.Writer;
031    import java.util.Arrays;
032    
033    import nu.validator.io.NcrEscapingWindows1252OutputStreamWriter;
034    
035    import org.xml.sax.Attributes;
036    import org.xml.sax.ContentHandler;
037    import org.xml.sax.Locator;
038    import org.xml.sax.SAXException;
039    import org.xml.sax.XMLReader;
040    
041    
042    /**
043     * Serializes a sequence of SAX events representing an XHTML 1.0 Strict document
044     * to an <code>OutputStream</code> as a UTF-8-encoded HTML 4.01 Strict
045     * document. The SAX events must represent a valid XHTML 1.0 document, except
046     * the namespace prefixes don't matter and there may be
047     * <code>startElement</code> and <code>endElement</code> calls for elements
048     * from other namespaces. The <code>startElement</code> and
049     * <code>endElement</code> calls for non-XHTML elements are ignored. No
050     * validity checking is performed. Hence, the emitter of the SAX events is
051     * responsible for making sure the events represent a document that meets the
052     * above requirements. The <code>OutputStream</code> is closed when the end of
053     * the document is seen.
054     * 
055     * @version $Id: HtmlSerializer.java 9 2007-08-11 08:40:38Z hsivonen $
056     * @author hsivonen
057     * @author taavi
058     */
059    public class HtmlSerializer implements ContentHandler {
060    
061        public final static int NO_DOCTYPE = 0;
062    
063        public final static int DOCTYPE_HTML401_TRANSITIONAL = 1;
064    
065        public final static int DOCTYPE_HTML401_STRICT = 2;
066    
067        public final static int DOCTYPE_HTML5 = 3;
068    
069        /**
070         * The XHTML namespace URI
071         */
072        private final static String XHTML_NS = "http://www.w3.org/1999/xhtml";
073    
074        /**
075         * HTML 4.01 elements which don't have an end tag
076         */
077        private static final String[] emptyElements = { "area", "base", "basefont",
078                "br", "col", "command", "frame", "hr", "img", "input", "isindex",
079                "link", "meta", "param" };
080    
081        /**
082         * Minimized "boolean" HTML attributes
083         */
084        private static final String[] booleanAttributes = { "active", "async",
085                "autofocus", "autosubmit", "checked", "compact", "declare",
086                "default", "defer", "disabled", "ismap", "multiple", "nohref",
087                "noresize", "noshade", "nowrap", "readonly", "required", "selected" };
088    
089        /**
090         * The writer used for output
091         */
092        protected Writer writer;
093    
094        private int doctype;
095    
096        private String encoding;
097    
098        private boolean emitMeta;
099    
100        /**
101         * Creates a new instance of HtmlSerializer in the HTML 4.01 doctype mode
102         * with the UTF-8 encoding and no charset meta.
103         * 
104         * @param out
105         *            the stream to which the output is written
106         */
107        public HtmlSerializer(OutputStream out) {
108            this(out, DOCTYPE_HTML401_STRICT, false, "UTF-8");
109        }
110    
111        public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta) {
112            this(out, doctype, emitMeta, "UTF-8");
113        }
114    
115        public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta,
116                String enc) {
117            this.emitMeta = emitMeta;
118            if (doctype < 0 || doctype > 3) {
119                throw new IllegalArgumentException("Bad doctype constant.");
120            }
121            this.doctype = doctype;
122            if ("UTF-8".equalsIgnoreCase(enc)) {
123                try {
124                    this.encoding = "UTF-8";
125                    this.writer = new OutputStreamWriter(out, "UTF-8");
126                } catch (UnsupportedEncodingException uee) {
127                    throw new RuntimeException("UTF-8 not supported", uee);
128                }
129            } else if ("Windows-1252".equalsIgnoreCase(enc)) {
130                this.encoding = "Windows-1252";
131                this.writer = new NcrEscapingWindows1252OutputStreamWriter(out);
132            } else {
133                throw new IllegalArgumentException(
134                        "Encoding must be UTF-8 or Windows-1252.");
135            }
136        }
137    
138        /**
139         * Writes out characters.
140         * 
141         * @param ch
142         *            the source array
143         * @param start
144         *            the index of the first character to be written
145         * @param length
146         *            the number of characters to write
147         * 
148         * @throws SAXException
149         *             if there are IO problems
150         */
151        public void characters(char[] ch, int start, int length)
152                throws SAXException {
153            try {
154                for (int j = 0; j < length; j++) {
155                    char c = ch[start + j];
156                    switch (c) {
157                        case '<':
158                            this.writer.write("&lt;");
159                            break;
160                        case '>':
161                            this.writer.write("&gt;");
162                            break;
163                        case '&':
164                            this.writer.write("&amp;");
165                            break;
166                        default:
167                            this.writer.write(c);
168                    }
169                }
170            } catch (IOException ioe) {
171                throw (SAXException)new SAXException(ioe).initCause(ioe);
172            }
173        }
174    
175        /**
176         * Must be called in the end.
177         * 
178         * @throws SAXException
179         *             if there are IO problems
180         */
181        public void endDocument() throws SAXException {
182            try {
183                this.writer.close();
184            } catch (IOException ioe) {
185                throw (SAXException)new SAXException(ioe).initCause(ioe);
186            }
187        }
188    
189        /**
190         * Writes an end tag if the element is an XHTML element and is not an empty
191         * element in HTML 4.01 Strict.
192         * 
193         * @param namespaceURI
194         *            the XML namespace
195         * @param localName
196         *            the element name in the namespace
197         * @param qName
198         *            ignored
199         * 
200         * @throws SAXException
201         *             if there are IO problems
202         */
203        public void endElement(String namespaceURI, String localName, String qName)
204                throws SAXException {
205            try {
206                if (XHTML_NS.equals(namespaceURI)
207                        && Arrays.binarySearch(emptyElements, localName) < 0) {
208                    this.writer.write("</");
209                    this.writer.write(localName);
210                    this.writer.write('>');
211                }
212            } catch (IOException ioe) {
213                throw (SAXException)new SAXException(ioe).initCause(ioe);
214            }
215        }
216    
217        /**
218         * Must be called first.
219         */
220        public void startDocument() throws SAXException {
221            try {
222                switch (doctype) {
223                    case NO_DOCTYPE:
224                        return;
225                    case DOCTYPE_HTML5:
226                        writer.write("<!DOCTYPE html>\n");
227                        return;
228                    case DOCTYPE_HTML401_STRICT:
229                        writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\n");
230                        return;
231                    case DOCTYPE_HTML401_TRANSITIONAL:
232                        writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n");
233                        return;
234                }
235            } catch (IOException ioe) {
236                throw (SAXException)new SAXException(ioe).initCause(ioe);
237            }
238        }
239    
240        /**
241         * Writes a start tag if the element is an XHTML element.
242         * 
243         * @param namespaceURI
244         *            the XML namespace
245         * @param localName
246         *            the element name in the namespace
247         * @param qName
248         *            ignored
249         * @param atts
250         *            the attribute list
251         * 
252         * @throws SAXException
253         *             if there are IO problems
254         */
255        public void startElement(String namespaceURI, String localName,
256                String qName, Attributes atts) throws SAXException {
257            try {
258                if (XHTML_NS.equals(namespaceURI)) {
259    
260                    if ("meta".equals(localName)
261                            && ((atts.getIndex("", "http-equiv") != -1) || (atts.getIndex(
262                                    "", "httpequiv") != -1))) {
263                        return;
264                    }
265    
266                    // start and element name
267                    this.writer.write('<');
268                    this.writer.write(localName);
269    
270                    // attributes
271                    int length = atts.getLength();
272                    boolean langPrinted = false;
273                    for (int i = 0; i < length; i++) {
274                        String ns = atts.getURI(i);
275                        String name = null;
276                        if ("".equals(ns)) {
277                            name = atts.getLocalName(i);
278                        } else if ("http://www.w3.org/XML/1998/namespace".equals(ns)
279                                && "lang".equals(atts.getLocalName(i))) {
280                            name = "lang";
281                        }
282                        if (name != null && !(langPrinted && "lang".equals(name))) {
283                            this.writer.write(' ');
284                            this.writer.write(name);
285                            if ("lang".equals(name)) {
286                                langPrinted = true;
287                            }
288                            if (Arrays.binarySearch(booleanAttributes, name) < 0) {
289                                // write value, escape certain characters
290                                this.writer.write("=\"");
291                                String value = atts.getValue(i);
292                                for (int j = 0; j < value.length(); j++) {
293                                    char c = value.charAt(j);
294                                    switch (c) {
295                                        case '<':
296                                            this.writer.write("&lt;");
297                                            break;
298                                        case '>':
299                                            this.writer.write("&gt;");
300                                            break;
301                                        case '&':
302                                            this.writer.write("&amp;");
303                                            break;
304                                        case '"':
305                                            this.writer.write("&quot;");
306                                            break;
307                                        default:
308                                            this.writer.write(c);
309                                    }
310                                }
311    
312                                this.writer.write('"');
313                            }
314                        }
315                    }
316    
317                    // close
318                    this.writer.write('>');
319                    if (emitMeta && "head".equals(localName)) {
320                        this.writer.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=");
321                        this.writer.write(encoding);
322                        this.writer.write("\">");
323                    }
324                }
325            } catch (IOException ioe) {
326                throw (SAXException)new SAXException(ioe).initCause(ioe);
327            }
328        }
329    
330        /**
331         * Used for testing. Pass a file:// URL as the command line argument.
332         */
333        public static void main(String[] args) {
334            try {
335                javax.xml.parsers.SAXParserFactory fac = javax.xml.parsers.SAXParserFactory.newInstance();
336                fac.setNamespaceAware(true);
337                fac.setValidating(false);
338                XMLReader parser = fac.newSAXParser().getXMLReader();
339                parser.setContentHandler(new HtmlSerializer(System.out));
340                parser.parse(args[0]);
341            } catch (Exception e) {
342                throw new RuntimeException(e);
343            }
344        }
345    
346        /** Does nothing. */
347        public void endPrefixMapping(String str) throws SAXException {
348        }
349    
350        /** Does nothing. */
351        public void ignorableWhitespace(char[] values, int param, int param2)
352                throws SAXException {
353        }
354    
355        /** Does nothing. */
356        public void processingInstruction(String str, String str1)
357                throws SAXException {
358        }
359    
360        /** Does nothing. */
361        public void setDocumentLocator(Locator locator) {
362        }
363    
364        /** Does nothing. */
365        public void skippedEntity(String str) throws SAXException {
366        }
367    
368        /** Does nothing. */
369        public void startPrefixMapping(String str, String str1) throws SAXException {
370        }
371    }