001    /*
002     * Copyright (c) 2007 Henri Sivonen
003     * Copyright (c) 2008 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.htmlparser.sax;
025    
026    import java.io.IOException;
027    import java.io.OutputStream;
028    import java.io.OutputStreamWriter;
029    import java.io.UnsupportedEncodingException;
030    import java.io.Writer;
031    import java.util.Arrays;
032    
033    import org.xml.sax.Attributes;
034    import org.xml.sax.ContentHandler;
035    import org.xml.sax.Locator;
036    import org.xml.sax.SAXException;
037    import org.xml.sax.ext.LexicalHandler;
038    
/**
 * Serializes a stream of SAX events as HTML text.
 *
 * <p>Elements in the XHTML, SVG and MathML namespaces are written using their
 * local names; start and end tags of elements in any other namespace (and of
 * everything nested inside them) are dropped. Character data is still written
 * while tags are being dropped, because {@link #characters} does not consult
 * the ignore counter.
 *
 * <p>The HTML-specific serialization rules (void elements without end tags,
 * raw-text elements whose content is not escaped, and the extra line feed
 * after {@code pre}, {@code textarea} and {@code listing} start tags) are
 * applied only to elements in the XHTML namespace. A same-named SVG or MathML
 * element is serialized like any other foreign element, so its text is
 * escaped and its children and end tag are kept.
 *
 * <p>The instance keeps mutable nesting counters and is meant to be driven by
 * a single well-formed event stream.
 */
public class HtmlSerializer implements ContentHandler, LexicalHandler {

    private static final String XHTML_NS = "http://www.w3.org/1999/xhtml";

    private static final String SVG_NS = "http://www.w3.org/2000/svg";

    private static final String MATHML_NS = "http://www.w3.org/1998/Math/MathML";

    private static final String XLINK_NS = "http://www.w3.org/1999/xlink";

    private static final String XML_NS = "http://www.w3.org/XML/1998/namespace";

    /**
     * HTML elements that have no end tag. Must stay sorted: it is searched
     * with {@link Arrays#binarySearch(Object[], Object)}.
     */
    private static final String[] VOID_ELEMENTS = { "area", "base", "basefont",
            "bgsound", "br", "col", "command", "embed", "event-source",
            "frame", "hr", "img", "input", "link", "meta", "param", "source",
            "spacer", "wbr" };

    /**
     * HTML elements whose text content is written without escaping. Must stay
     * sorted: it is searched with
     * {@link Arrays#binarySearch(Object[], Object)}.
     */
    private static final String[] NON_ESCAPING = { "iframe", "noembed",
            "noframes", "noscript", "plaintext", "script", "style", "xmp" };

    /**
     * Wraps a byte stream in a UTF-8 encoding writer.
     *
     * @param out the stream to wrap
     * @return a writer encoding to {@code out} as UTF-8
     */
    private static Writer wrap(OutputStream out) {
        try {
            return new OutputStreamWriter(out, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Number of currently open elements whose tags are being dropped: an
     * element in an unsupported namespace, a void element, or anything nested
     * inside one of those.
     */
    private int ignoreLevel = 0;

    /**
     * Zero while text is escaped. Set to 1 when a raw-text HTML element is
     * opened and then incremented/decremented for every nested element, so it
     * returns to zero exactly when that raw-text element is closed.
     */
    private int escapeLevel = 0;

    private final Writer writer;

    /**
     * Creates a serializer that writes UTF-8 encoded output.
     *
     * @param out the stream receiving the serialized bytes
     */
    public HtmlSerializer(OutputStream out) {
        this(wrap(out));
    }

    /**
     * Creates a serializer that writes to the given writer.
     *
     * @param out the writer receiving the serialized document; it is flushed
     *            and closed by {@link #endDocument()}
     */
    public HtmlSerializer(Writer out) {
        this.writer = out;
    }

    /**
     * Writes character data. Inside a raw-text element the characters are
     * copied verbatim; otherwise {@code <}, {@code >}, {@code &} and U+00A0
     * are replaced by character references.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        try {
            if (escapeLevel > 0) {
                writer.write(ch, start, length);
            } else {
                writeEscapedText(ch, start, length);
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Flushes and closes the underlying writer.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void endDocument() throws SAXException {
        try {
            writer.flush();
            writer.close();
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes the end tag for an element unless its tags are being dropped,
     * and unwinds the nesting counters updated by {@link #startElement}.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        if (escapeLevel > 0) {
            escapeLevel--;
        }
        if (ignoreLevel > 0) {
            // Closes an ignored or void element: no end tag is written.
            ignoreLevel--;
            return;
        }
        try {
            writer.write('<');
            writer.write('/');
            writer.write(localName);
            writer.write('>');
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /** Treats ignorable whitespace exactly like ordinary character data. */
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        characters(ch, start, length);
    }

    /** Processing instructions are not serialized. */
    public void processingInstruction(String target, String data)
            throws SAXException {
    }

    /** The locator is not used. */
    public void setDocumentLocator(Locator locator) {
    }

    /**
     * Writes the {@code <!DOCTYPE html>} line that starts every document.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void startDocument() throws SAXException {
        try {
            writer.write("<!DOCTYPE html>\n");
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes a start tag with its attributes for elements in the XHTML, SVG
     * or MathML namespace; elements in other namespaces, and everything
     * nested in an ignored element, only bump the ignore counter.
     *
     * <p>For XHTML elements only: a void element suppresses its own end tag
     * and any nested tags, {@code pre}/{@code textarea}/{@code listing} get a
     * line feed after the start tag, and a raw-text element switches
     * {@link #characters} to verbatim output until it is closed.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void startElement(String uri, String localName, String qName,
            Attributes atts) throws SAXException {
        if (escapeLevel > 0) {
            // Track depth below the raw-text element so that its own end tag
            // is the one that brings the counter back to zero.
            escapeLevel++;
        }
        boolean xhtml = XHTML_NS.equals(uri);
        if (ignoreLevel > 0
                || !(xhtml || SVG_NS.equals(uri) || MATHML_NS.equals(uri))) {
            ignoreLevel++;
            return;
        }
        try {
            writer.write('<');
            writer.write(localName);
            writeAttributes(atts, xhtml);
            writer.write('>');
            if (!xhtml) {
                // Void, raw-text and leading-newline rules belong to HTML
                // elements; foreign elements that merely share a local name
                // (e.g. SVG style/script) must keep escaping and end tags.
                return;
            }
            if (Arrays.binarySearch(VOID_ELEMENTS, localName) > -1) {
                // Makes the matching endElement (and any children) silent.
                ignoreLevel++;
                return;
            }
            if ("pre".equals(localName) || "textarea".equals(localName)
                    || "listing".equals(localName)) {
                writer.write('\n');
            }
            if (escapeLevel == 0
                    && Arrays.binarySearch(NON_ESCAPING, localName) > -1) {
                escapeLevel = 1;
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes a comment as {@code <!--...-->} unless tags are currently being
     * dropped.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void comment(char[] ch, int start, int length) throws SAXException {
        if (ignoreLevel > 0) {
            return;
        }
        try {
            writer.write("<!--");
            writer.write(ch, start, length);
            writer.write("-->");
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /** CDATA section boundaries are not serialized. */
    public void endCDATA() throws SAXException {
    }

    /** DTD events are ignored; the doctype is written by startDocument. */
    public void endDTD() throws SAXException {
    }

    /** Entity boundaries are not serialized. */
    public void endEntity(String name) throws SAXException {
    }

    /** CDATA section boundaries are not serialized. */
    public void startCDATA() throws SAXException {
    }

    /** DTD events are ignored; the doctype is written by startDocument. */
    public void startDTD(String name, String publicId, String systemId)
            throws SAXException {
    }

    /** Entity boundaries are not serialized. */
    public void startEntity(String name) throws SAXException {
    }

    /** Prefix mappings are not serialized. */
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
    }

    /** Prefix mappings are not serialized. */
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    /** Skipped entities are not serialized. */
    public void skippedEntity(String name) throws SAXException {
    }

    /** Writes text with {@code <}, {@code >}, {@code &} and U+00A0 escaped. */
    private void writeEscapedText(char[] ch, int start, int length)
            throws IOException {
        int end = start + length;
        for (int i = start; i < end; i++) {
            char c = ch[i];
            switch (c) {
                case '<':
                    writer.write("&lt;");
                    break;
                case '>':
                    writer.write("&gt;");
                    break;
                case '&':
                    writer.write("&amp;");
                    break;
                case '\u00A0':
                    writer.write("&nbsp;");
                    break;
                default:
                    writer.write(c);
                    break;
            }
        }
    }

    /**
     * Writes the serializable attributes of an element. No-namespace
     * attributes are written by local name. On non-XHTML elements XLink
     * attributes get an {@code xlink:} prefix and XML-namespace attributes an
     * {@code xml:} prefix. On XHTML elements the only XML-namespace attribute
     * kept is {@code lang}, written without a prefix. Every other attribute
     * is skipped.
     */
    private void writeAttributes(Attributes atts, boolean xhtml)
            throws IOException {
        for (int i = 0; i < atts.getLength(); i++) {
            String attUri = atts.getURI(i);
            String attLocal = atts.getLocalName(i);
            if (attUri.length() == 0) {
                writer.write(' ');
            } else if (!xhtml && XLINK_NS.equals(attUri)) {
                writer.write(" xlink:");
            } else if (XML_NS.equals(attUri)) {
                if (!xhtml) {
                    writer.write(" xml:");
                } else if ("lang".equals(attLocal)) {
                    writer.write(' ');
                } else {
                    continue;
                }
            } else {
                continue;
            }
            writer.write(attLocal);
            writer.write('=');
            writer.write('"');
            writeEscapedAttributeValue(atts.getValue(i));
            writer.write('"');
        }
    }

    /**
     * Writes an attribute value with {@code "}, {@code &} and U+00A0
     * escaped, suitable for a double-quoted attribute.
     */
    private void writeEscapedAttributeValue(String val) throws IOException {
        for (int j = 0; j < val.length(); j++) {
            char c = val.charAt(j);
            switch (c) {
                case '"':
                    writer.write("&quot;");
                    break;
                case '&':
                    writer.write("&amp;");
                    break;
                case '\u00A0':
                    writer.write("&nbsp;");
                    break;
                default:
                    writer.write(c);
                    break;
            }
        }
    }

}