001    /*
002     * Copyright (c) 2007 Henri Sivonen
003     * Copyright (c) 2008-2011 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.htmlparser.sax;
025    
026    import java.io.IOException;
027    import java.io.OutputStream;
028    import java.io.OutputStreamWriter;
029    import java.io.UnsupportedEncodingException;
030    import java.io.Writer;
031    import java.util.Arrays;
032    
033    import org.xml.sax.Attributes;
034    import org.xml.sax.ContentHandler;
035    import org.xml.sax.Locator;
036    import org.xml.sax.SAXException;
037    import org.xml.sax.ext.LexicalHandler;
038    
/**
 * Serializes a stream of SAX events as HTML text.
 *
 * <p>Elements in the XHTML, SVG and MathML namespaces are written using
 * their local names; the tags of elements in any other namespace (and of
 * their descendants) are skipped. Text is escaped except inside the HTML
 * raw-text elements listed in {@link #NON_ESCAPING}. HTML void elements get
 * no end tag.</p>
 *
 * <p>The serializer keeps per-document nesting counters, so a single
 * instance handles one document; {@link #endDocument()} closes the
 * underlying writer.</p>
 */
public class HtmlSerializer implements ContentHandler, LexicalHandler {

    private static final String XHTML_NS = "http://www.w3.org/1999/xhtml";

    private static final String SVG_NS = "http://www.w3.org/2000/svg";

    private static final String MATHML_NS = "http://www.w3.org/1998/Math/MathML";

    private static final String XLINK_NS = "http://www.w3.org/1999/xlink";

    private static final String XML_NS = "http://www.w3.org/XML/1998/namespace";

    /**
     * Local names of HTML elements that are written without an end tag.
     * Must stay sorted: it is probed with {@link Arrays#binarySearch}.
     */
    private static final String[] VOID_ELEMENTS = { "area", "base", "basefont",
            "bgsound", "br", "col", "command", "embed", "frame", "hr", "img",
            "input", "keygen", "link", "meta", "param", "source", "track",
            "wbr" };

    /**
     * Local names of HTML elements whose text content is written verbatim
     * (no entity escaping). Must stay sorted: it is probed with
     * {@link Arrays#binarySearch}.
     */
    private static final String[] NON_ESCAPING = { "iframe", "noembed",
            "noframes", "noscript", "plaintext", "script", "style", "xmp" };

    /**
     * Wraps a byte stream in a UTF-8 encoding writer.
     *
     * @param out the stream to write encoded bytes to
     * @return a writer encoding to {@code out} as UTF-8
     */
    private static Writer wrap(OutputStream out) {
        try {
            return new OutputStreamWriter(out, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Tells whether {@code name} occurs in a sorted array of names.
     */
    private static boolean isListed(String[] sortedNames, String name) {
        return Arrays.binarySearch(sortedNames, name) >= 0;
    }

    /**
     * Depth inside a subtree whose tags are being suppressed. It is raised
     * for elements in unsupported namespaces and for every element nested
     * in them, and also for a void element so that its matching
     * {@code endElement} writes no end tag.
     */
    private int ignoreLevel = 0;

    /**
     * Depth inside a raw-text element. While this is greater than zero,
     * character data is written verbatim instead of being escaped and
     * comments are dropped.
     */
    private int escapeLevel = 0;

    private final Writer writer;

    /**
     * Creates a serializer that writes UTF-8 encoded output to a stream.
     *
     * @param out the destination stream
     */
    public HtmlSerializer(OutputStream out) {
        this(wrap(out));
    }

    /**
     * Creates a serializer that writes to the given writer.
     *
     * @param out the destination writer
     */
    public HtmlSerializer(Writer out) {
        this.writer = out;
    }

    /**
     * Writes character data. Inside a raw-text element the characters are
     * written as-is; elsewhere {@code <}, {@code >}, {@code &} and U+00A0
     * are replaced by entity references.
     *
     * <p>Note that this method does not consult {@code ignoreLevel}: text
     * inside an element whose tags were skipped is still written.</p>
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        try {
            if (escapeLevel > 0) {
                writer.write(ch, start, length);
                return;
            }
            int end = start + length;
            for (int i = start; i < end; i++) {
                char c = ch[i];
                switch (c) {
                    case '<':
                        writer.write("&lt;");
                        break;
                    case '>':
                        writer.write("&gt;");
                        break;
                    case '&':
                        writer.write("&amp;");
                        break;
                    case '\u00A0':
                        writer.write("&nbsp;");
                        break;
                    default:
                        writer.write(c);
                        break;
                }
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Flushes and closes the underlying writer.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void endDocument() throws SAXException {
        try {
            writer.flush();
            writer.close();
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes an end tag using the local name, unless the element is being
     * ignored (unsupported namespace, nested in an ignored element, or a
     * void element), in which case only the nesting counter is unwound.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        if (escapeLevel > 0) {
            escapeLevel--;
        }
        if (ignoreLevel > 0) {
            ignoreLevel--;
            return;
        }
        try {
            writer.write('<');
            writer.write('/');
            writer.write(localName);
            writer.write('>');
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Treats ignorable whitespace exactly like ordinary character data.
     */
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        characters(ch, start, length);
    }

    /** Processing instructions are not serialized. */
    public void processingInstruction(String target, String data)
            throws SAXException {
    }

    /** The locator is not used. */
    public void setDocumentLocator(Locator locator) {
    }

    /**
     * Writes the {@code <!DOCTYPE html>} line that starts the output.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void startDocument() throws SAXException {
        try {
            writer.write("<!DOCTYPE html>\n");
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes a start tag with its attributes.
     *
     * <p>Elements outside the XHTML, SVG and MathML namespaces, and all
     * elements nested inside an ignored element, produce no tag. For
     * elements in the XHTML namespace only, three extra rules apply: void
     * elements are marked so that no end tag is written, {@code pre},
     * {@code textarea} and {@code listing} get a newline after the start
     * tag, and raw-text elements switch off text escaping for their
     * content. These rules are deliberately not applied to SVG or MathML
     * elements that merely share a local name with an HTML element;
     * otherwise such an element would lose its end tag and child tags or
     * have its text written unescaped.</p>
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void startElement(String uri, String localName, String qName,
            Attributes atts) throws SAXException {
        if (escapeLevel > 0) {
            // Keep the counter balanced with the decrement in endElement.
            escapeLevel++;
        }
        boolean xhtml = XHTML_NS.equals(uri);
        if (ignoreLevel > 0
                || !(xhtml || SVG_NS.equals(uri) || MATHML_NS.equals(uri))) {
            ignoreLevel++;
            return;
        }
        try {
            writer.write('<');
            writer.write(localName);
            writeAttributes(atts, xhtml);
            writer.write('>');
            if (!xhtml) {
                // Void, leading-newline and raw-text handling are HTML-only.
                return;
            }
            if (isListed(VOID_ELEMENTS, localName)) {
                // Makes the matching endElement call write nothing.
                ignoreLevel++;
                return;
            }
            if ("pre".equals(localName) || "textarea".equals(localName)
                    || "listing".equals(localName)) {
                writer.write('\n');
            }
            if (escapeLevel == 0 && isListed(NON_ESCAPING, localName)) {
                escapeLevel = 1;
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes every serializable attribute as {@code  name="value"}.
     *
     * @param atts the attributes of the element being written
     * @param xhtml whether the owning element is in the XHTML namespace
     */
    private void writeAttributes(Attributes atts, boolean xhtml)
            throws IOException {
        int count = atts.getLength();
        for (int i = 0; i < count; i++) {
            String attLocal = atts.getLocalName(i);
            String lead = attributeLead(xhtml, atts.getURI(i), attLocal);
            if (lead == null) {
                continue;
            }
            writer.write(lead);
            writer.write(attLocal);
            writer.write('=');
            writer.write('"');
            writeAttributeValue(atts.getValue(i));
            writer.write('"');
        }
    }

    /**
     * Chooses the text written before an attribute's local name: a space,
     * optionally followed by an {@code xlink:} or {@code xml:} prefix.
     *
     * @return the leading text, or {@code null} if the attribute is not
     *         serialized (XLink or non-{@code lang} XML-namespace
     *         attributes on XHTML elements, and attributes in any other
     *         namespace)
     */
    private static String attributeLead(boolean xhtml, String attUri,
            String attLocal) {
        if (attUri.length() == 0) {
            return " ";
        }
        if (!xhtml && XLINK_NS.equals(attUri)) {
            return " xlink:";
        }
        if (XML_NS.equals(attUri)) {
            if (!xhtml) {
                return " xml:";
            }
            // On XHTML elements xml:lang is written as plain "lang".
            return "lang".equals(attLocal) ? " " : null;
        }
        return null;
    }

    /**
     * Writes an attribute value, replacing {@code "}, {@code &} and U+00A0
     * by entity references.
     */
    private void writeAttributeValue(String val) throws IOException {
        int len = val.length();
        for (int j = 0; j < len; j++) {
            char c = val.charAt(j);
            switch (c) {
                case '"':
                    writer.write("&quot;");
                    break;
                case '&':
                    writer.write("&amp;");
                    break;
                case '\u00A0':
                    writer.write("&nbsp;");
                    break;
                default:
                    writer.write(c);
                    break;
            }
        }
    }

    /**
     * Writes a comment as {@code <!--...-->}. Comments inside ignored or
     * raw-text elements are dropped. The comment text is written as-is.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void comment(char[] ch, int start, int length) throws SAXException {
        if (ignoreLevel > 0 || escapeLevel > 0) {
            return;
        }
        try {
            writer.write("<!--");
            writer.write(ch, start, length);
            writer.write("-->");
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /** CDATA section boundaries are not serialized. */
    public void endCDATA() throws SAXException {
    }

    /** DTD events are not serialized. */
    public void endDTD() throws SAXException {
    }

    /** Entity boundaries are not serialized. */
    public void endEntity(String name) throws SAXException {
    }

    /** CDATA section boundaries are not serialized. */
    public void startCDATA() throws SAXException {
    }

    /**
     * DTD events are not serialized; the doctype line is always the fixed
     * one written by {@link #startDocument()}.
     */
    public void startDTD(String name, String publicId, String systemId)
            throws SAXException {
    }

    /** Entity boundaries are not serialized. */
    public void startEntity(String name) throws SAXException {
    }

    /** Prefix mappings are not serialized. */
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
    }

    /** Prefix mappings are not serialized. */
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    /** Skipped entities are not serialized. */
    public void skippedEntity(String name) throws SAXException {
    }

}