001 package nu.validator.servlet; 002 003 import java.io.IOException; 004 import java.io.OutputStreamWriter; 005 import java.io.Writer; 006 import java.net.MalformedURLException; 007 008 import javax.servlet.http.HttpServletRequest; 009 import javax.servlet.http.HttpServletResponse; 010 011 import nu.validator.gnu.xml.aelfred2.SAXDriver; 012 import nu.validator.htmlparser.test.ListErrorHandler; 013 import nu.validator.htmlparser.test.TreeDumpContentHandler; 014 import nu.validator.xml.NullEntityResolver; 015 import nu.validator.xml.PrudentHttpEntityResolver; 016 import nu.validator.xml.TypedInputSource; 017 018 import org.xml.sax.SAXException; 019 import org.xml.sax.XMLReader; 020 021 import com.hp.hpl.jena.iri.IRIException; 022 import com.hp.hpl.jena.iri.IRIFactory; 023 024 025 public class ParseTreePrinter { 026 027 private static final String FORM_HTML = "<!DOCTYPE html><title>Parse Tree Dump</title><form><p><input type='url' name='doc' id='doc' pattern='(?:https?://.+)?'> <input name='submit' value='Print Tree' type='submit' id='submit'></form>"; 028 029 private final HttpServletRequest request; 030 031 private final HttpServletResponse response; 032 033 /** 034 * @param request 035 * @param response 036 */ 037 public ParseTreePrinter(final HttpServletRequest request, 038 final HttpServletResponse response) { 039 this.request = request; 040 this.response = response; 041 } 042 043 private String scrubUrl(String urlStr) { 044 if (urlStr == null) { 045 return null; 046 } 047 try { 048 return IRIFactory.iriImplementation().construct(urlStr).toASCIIString(); 049 } catch (IRIException e) { 050 return null; 051 } catch (MalformedURLException e) { 052 return null; 053 } 054 } 055 056 public void service() throws IOException { 057 String document = scrubUrl(request.getParameter("doc")); 058 document = ("".equals(document)) ? null : document; 059 Writer writer = new OutputStreamWriter(response.getOutputStream(), "UTF-8"); 060 if (document == null) { 061 response.setContentType("text/html; charset=utf-8"); 062 writer.write(FORM_HTML); 063 writer.flush(); 064 writer.close(); 065 return; 066 } else { 067 response.setContentType("text/plain; charset=utf-8"); 068 try { 069 PrudentHttpEntityResolver httpRes = new PrudentHttpEntityResolver( 070 2048 * 1024, false, null); 071 httpRes.setAllowGenericXml(false); 072 httpRes.setAcceptAllKnownXmlTypes(false); 073 httpRes.setAllowHtml(true); 074 httpRes.setAllowXhtml(true); 075 TypedInputSource documentInput = (TypedInputSource) httpRes.resolveEntity( 076 null, document); 077 String type = documentInput.getType(); 078 XMLReader parser; 079 if ("text/html".equals(type)) { 080 writer.write("HTML parser\n\n#document\n"); 081 parser = new nu.validator.htmlparser.sax.HtmlParser(); 082 } else if ("application/xhtml+xml".equals(type)) { 083 writer.write("XML parser\n\n#document\n"); 084 parser = new SAXDriver(); 085 parser.setFeature( 086 "http://xml.org/sax/features/external-general-entities", 087 false); 088 parser.setFeature( 089 "http://xml.org/sax/features/external-parameter-entities", 090 false); 091 parser.setEntityResolver(new NullEntityResolver()); 092 } else { 093 writer.write("Unsupported content type.\n"); 094 writer.flush(); 095 writer.close(); 096 return; 097 } 098 TreeDumpContentHandler treeDumpContentHandler = new TreeDumpContentHandler(writer, false); 099 ListErrorHandler listErrorHandler = new ListErrorHandler(); 100 parser.setContentHandler(treeDumpContentHandler); 101 parser.setProperty("http://xml.org/sax/properties/lexical-handler", treeDumpContentHandler); 102 parser.setErrorHandler(listErrorHandler); 103 parser.parse(documentInput); 104 writer.write("#errors\n"); 105 for (String err : listErrorHandler.getErrors()) { 106 writer.write(err); 107 writer.write('\n'); 108 } 109 } catch (SAXException e) { 110 writer.write("Exception:\n"); 111 writer.write(e.getMessage()); 112 writer.write("\n"); 113 } catch (IOException e) { 114 writer.write("Exception:\n"); 115 writer.write(e.getMessage()); 116 writer.write("\n"); 117 } finally { 118 writer.flush(); 119 writer.close(); 120 } 121 } 122 } 123 124 }