001    /*
002     * Copyright (c) 2007 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package nu.validator.htmlparser.sax;
024    
025    import java.io.IOException;
026    import java.io.OutputStream;
027    import java.io.OutputStreamWriter;
028    import java.io.UnsupportedEncodingException;
029    import java.io.Writer;
030    import java.util.Arrays;
031    
032    import org.xml.sax.Attributes;
033    import org.xml.sax.ContentHandler;
034    import org.xml.sax.Locator;
035    import org.xml.sax.SAXException;
036    import org.xml.sax.ext.LexicalHandler;
037    
/**
 * Serializes a stream of SAX events as an HTML document.
 *
 * <p>Only elements in the XHTML namespace are written as tags. The tags of
 * elements in any other namespace, and of every element nested inside them,
 * are suppressed; character data is still written regardless. Attributes are
 * written using their local names only.
 *
 * <p>The target {@link Writer} is flushed and closed by
 * {@link #endDocument()}. Instances keep mutable per-document state and
 * perform no synchronization.
 */
public class HtmlSerializer implements ContentHandler, LexicalHandler {

    /** Namespace URI an element must have for its tags to be serialized. */
    private static final String XHTML_NS = "http://www.w3.org/1999/xhtml";

    /**
     * Elements that are written without an end tag. MUST stay sorted in
     * natural String order because it is probed with a binary search.
     */
    private static final String[] VOID_ELEMENTS = { "area", "base", "basefont",
            "bgsound", "br", "col", "embed", "frame", "hr", "img", "input",
            "keygen", "link", "meta", "param", "source", "spacer", "track",
            "wbr" };

    /**
     * Elements whose text content is written verbatim (no entity escaping).
     * MUST stay sorted in natural String order because it is probed with a
     * binary search.
     */
    private static final String[] NON_ESCAPING = { "iframe", "noembed",
            "noframes", "noscript", "plaintext", "script", "style", "xmp" };

    /**
     * Wraps a byte stream in a UTF-8 encoding writer.
     *
     * @param out the stream to wrap
     * @return a writer encoding to {@code out} as UTF-8
     */
    private static Writer wrap(OutputStream out) {
        try {
            return new OutputStreamWriter(out, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Tells whether {@code name} occurs in a sorted array of names.
     *
     * @param sortedNames array sorted in natural String order
     * @param name the name to look up
     * @return {@code true} if the name is present
     */
    private static boolean contains(String[] sortedNames, String name) {
        return Arrays.binarySearch(sortedNames, name) >= 0;
    }

    /**
     * Number of currently open elements whose tags are being suppressed:
     * non-XHTML elements, anything nested in a suppressed element, and void
     * elements (so that their end tag is not written).
     */
    private int ignoreLevel = 0;

    /**
     * Zero outside raw-text elements; otherwise the element nesting depth
     * counted from the raw-text element that switched escaping off, so that
     * its matching end tag brings the counter back to zero.
     */
    private int escapeLevel = 0;

    private final Writer writer;

    /**
     * Creates a serializer that writes UTF-8 encoded output to a stream.
     *
     * @param out the destination stream
     */
    public HtmlSerializer(OutputStream out) {
        this(wrap(out));
    }

    /**
     * Creates a serializer that writes to the given writer.
     *
     * @param out the destination writer; closed by {@link #endDocument()}
     */
    public HtmlSerializer(Writer out) {
        this.writer = out;
    }

    /**
     * Writes character data, escaping {@code <}, {@code >} and {@code &}
     * unless the data is inside a raw-text element.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void characters(char[] ch, int start, int length) throws SAXException {
        try {
            if (escapeLevel > 0) {
                // Inside script, style etc.: content is emitted verbatim.
                writer.write(ch, start, length);
                return;
            }
            int end = start + length;
            for (int i = start; i < end; i++) {
                char c = ch[i];
                switch (c) {
                    case '<':
                        writer.write("&lt;");
                        break;
                    case '>':
                        writer.write("&gt;");
                        break;
                    case '&':
                        writer.write("&amp;");
                        break;
                    default:
                        writer.write(c);
                        break;
                }
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Flushes and closes the underlying writer. The writer is closed even
     * when flushing fails.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void endDocument() throws SAXException {
        try {
            try {
                writer.flush();
            } finally {
                writer.close();
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes the end tag of an element unless the element's tags are being
     * suppressed (non-XHTML element, nested in one, or a void element).
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (escapeLevel > 0) {
            escapeLevel--;
        }
        if (ignoreLevel > 0) {
            ignoreLevel--;
            return;
        }
        try {
            writer.write("</");
            writer.write(localName);
            writer.write('>');
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /** Treats ignorable whitespace exactly like ordinary character data. */
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
        characters(ch, start, length);
    }

    /** Processing instructions are not serialized. */
    public void processingInstruction(String target, String data) throws SAXException {
    }

    /** The locator is not used. */
    public void setDocumentLocator(Locator locator) {
    }

    /**
     * Writes the {@code <!DOCTYPE html>} line.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void startDocument() throws SAXException {
        try {
            writer.write("<!DOCTYPE html>\n");
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes the start tag of an XHTML-namespace element together with its
     * attributes. Elements in other namespaces, and elements nested in a
     * suppressed element, produce no output.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
        if (escapeLevel > 0) {
            escapeLevel++;
        }
        if (ignoreLevel > 0 || !XHTML_NS.equals(uri)) {
            ignoreLevel++;
            return;
        }
        try {
            writer.write('<');
            writer.write(localName);
            int attCount = atts.getLength();
            for (int i = 0; i < attCount; i++) {
                writer.write(' ');
                writer.write(atts.getLocalName(i)); // XXX xml:lang
                writer.write("=\"");
                writeAttributeValue(atts.getValue(i));
                writer.write('"');
            }
            writer.write('>');
            if (contains(VOID_ELEMENTS, localName)) {
                // Count the void element as suppressed so that the matching
                // endElement() call does not emit an end tag.
                ignoreLevel++;
                return;
            }
            if ("pre".equals(localName) || "textarea".equals(localName)
                    || "listing".equals(localName)) {
                // An HTML parser drops a newline directly after these start
                // tags; writing one here keeps a leading newline in the
                // content from being lost on re-parse.
                writer.write('\n');
            }
            if (escapeLevel == 0 && contains(NON_ESCAPING, localName)) {
                escapeLevel = 1;
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes an attribute value with {@code "}, {@code <}, {@code >} and
     * {@code &} replaced by character references.
     *
     * @param value the raw attribute value
     * @throws IOException if the writer fails
     */
    private void writeAttributeValue(String value) throws IOException {
        int len = value.length();
        for (int j = 0; j < len; j++) {
            char c = value.charAt(j);
            switch (c) {
                case '"':
                    writer.write("&quot;");
                    break;
                case '<':
                    writer.write("&lt;");
                    break;
                case '>':
                    writer.write("&gt;");
                    break;
                case '&':
                    writer.write("&amp;");
                    break;
                default:
                    writer.write(c);
                    break;
            }
        }
    }

    /**
     * Writes a comment. Comments inside suppressed elements are dropped, and
     * so are comments inside raw-text elements, where comment markup would be
     * read back as literal text content rather than as a comment.
     *
     * @throws SAXException wrapping any {@link IOException} from the writer
     */
    public void comment(char[] ch, int start, int length) throws SAXException {
        if (ignoreLevel > 0 || escapeLevel > 0) {
            return;
        }
        try {
            writer.write("<!--");
            writer.write(ch, start, length);
            writer.write("-->");
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /** CDATA section boundaries are not serialized. */
    public void endCDATA() throws SAXException {
    }

    /** DTD events are not serialized. */
    public void endDTD() throws SAXException {
    }

    /** Entity boundaries are not serialized. */
    public void endEntity(String name) throws SAXException {
    }

    /** CDATA section boundaries are not serialized. */
    public void startCDATA() throws SAXException {
    }

    /** DTD events are not serialized; startDocument() writes a fixed doctype. */
    public void startDTD(String name, String publicId, String systemId) throws SAXException {
    }

    /** Entity boundaries are not serialized. */
    public void startEntity(String name) throws SAXException {
    }

    /** Prefix mappings are not serialized. */
    public void startPrefixMapping(String prefix, String uri) throws SAXException {
    }

    /** Prefix mappings are not serialized. */
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    /** Skipped entities are not serialized. */
    public void skippedEntity(String name) throws SAXException {
    }

}