001    /*
002     * Copyright (c) 2005 Henri Sivonen
003     * Copyright (c) 2007 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.xml;
025    
026    import java.io.IOException;
027    import java.io.InputStream;
028    import java.net.MalformedURLException;
029    import java.util.Iterator;
030    import java.util.Set;
031    import java.util.TreeSet;
032    import java.util.regex.Matcher;
033    import java.util.regex.Pattern;
034    
035    import nu.validator.io.BoundednputStream;
036    import nu.validator.io.ObservableInputStream;
037    import nu.validator.io.StreamObserver;
038    
039    import org.apache.commons.httpclient.Header;
040    import org.apache.commons.httpclient.HostConfiguration;
041    import org.apache.commons.httpclient.HttpClient;
042    import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
043    import org.apache.commons.httpclient.cookie.CookiePolicy;
044    import org.apache.commons.httpclient.methods.GetMethod;
045    import org.apache.commons.httpclient.params.HttpClientParams;
046    import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
047    import org.apache.log4j.Logger;
048    import org.xml.sax.EntityResolver;
049    import org.xml.sax.ErrorHandler;
050    import org.xml.sax.InputSource;
051    import org.xml.sax.SAXException;
052    import org.xml.sax.SAXParseException;
053    
054    import com.hp.hpl.jena.iri.IRI;
055    import com.hp.hpl.jena.iri.IRIException;
056    import com.hp.hpl.jena.iri.IRIFactory;
057    
058    
059    /**
060     * @version $Id: PrudentHttpEntityResolver.java,v 1.1 2005/01/08 08:11:26
061     *          hsivonen Exp $
062     * @author hsivonen
063     */
064    public class PrudentHttpEntityResolver implements EntityResolver {
065    
066        private static final Logger log4j = Logger.getLogger(PrudentHttpEntityResolver.class);
067    
068        private static final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
069    
070        private static final HttpClient client = new HttpClient(manager);
071    
072        private static int maxRequests;
073    
074        private int sizeLimit;
075    
076        private final ErrorHandler errorHandler;
077    
078        private int requestsLeft;
079    
080        private boolean laxContentType;
081    
082        private boolean allowRnc = false;
083    
084        private boolean allowHtml = false;
085    
086        private boolean allowXhtml = false;
087    
088        private boolean acceptAllKnownXmlTypes = false;
089    
090        private boolean allowGenericXml = true;
091    
092        private final IRIFactory iriFactory;
093        
094        private final ContentTypeParser contentTypeParser;
095    
096        /**
097         * Sets the timeouts of the HTTP client.
098         * 
099         * @param connectionTimeout
100         *            timeout until connection established in milliseconds. Zero
101         *            means no timeout.
102         * @param socketTimeout
103         *            timeout for waiting for data in milliseconds. Zero means no
104         *            timeout.
105         */
106        public static void setParams(int connectionTimeout, int socketTimeout,
107                int maxRequests) {
108            HttpConnectionManagerParams hcmp = client.getHttpConnectionManager().getParams();
109            hcmp.setConnectionTimeout(connectionTimeout);
110            hcmp.setSoTimeout(socketTimeout);
111            hcmp.setMaxConnectionsPerHost(HostConfiguration.ANY_HOST_CONFIGURATION,
112                    maxRequests);
113            hcmp.setMaxTotalConnections(maxRequests * 2);
114            PrudentHttpEntityResolver.maxRequests = maxRequests;
115            HttpClientParams hcp = client.getParams();
116            hcp.setBooleanParameter(HttpClientParams.ALLOW_CIRCULAR_REDIRECTS, true);
117            hcp.setIntParameter(HttpClientParams.MAX_REDIRECTS, 20); // Gecko default 
118        }
119    
120        public static void setUserAgent(String ua) {
121            client.getParams().setParameter("http.useragent", ua);
122        }
123    
124        /**
125         * @param connectionTimeout
126         * @param socketTimeout
127         * @param sizeLimit
128         */
129        public PrudentHttpEntityResolver(int sizeLimit, boolean laxContentType,
130                ErrorHandler errorHandler) {
131            this.sizeLimit = sizeLimit;
132            this.requestsLeft = maxRequests;
133            this.laxContentType = laxContentType;
134            this.errorHandler = errorHandler;
135            this.iriFactory = new IRIFactory();
136            this.iriFactory.useSpecificationXMLSystemID(true);
137            this.iriFactory.useSchemeSpecificRules("http", true);
138            this.iriFactory.useSchemeSpecificRules("https", true);
139            this.contentTypeParser = new ContentTypeParser(errorHandler, laxContentType, this.allowRnc, this.allowHtml, this.allowXhtml, this.acceptAllKnownXmlTypes, this.allowGenericXml);
140        }
141    
142        /**
143         * @see org.xml.sax.EntityResolver#resolveEntity(java.lang.String,
144         *      java.lang.String)
145         */
146        public InputSource resolveEntity(String publicId, String systemId)
147                throws SAXException, IOException {
148            if (requestsLeft > -1) {
149                if (requestsLeft == 0) {
150                    throw new IOException(
151                            "Number of permitted HTTP requests exceeded.");
152                } else {
153                    requestsLeft--;
154                }
155            }
156            GetMethod m = null;
157            try {
158                IRI iri;
159                try {
160                    iri = iriFactory.construct(systemId);
161                } catch (IRIException e) {
162                    IOException ioe = (IOException) new IOException(e.getMessage()).initCause(e);
163                    SAXParseException spe = new SAXParseException(e.getMessage(),
164                            publicId, systemId, -1, -1, ioe);
165                    if (errorHandler != null) {
166                        errorHandler.fatalError(spe);
167                    }
168                    throw spe;
169                }
170                if (!iri.isAbsolute()) {
171                    SAXParseException spe = new SAXParseException(
172                            "Not an absolute URI.", publicId, systemId, -1, -1,
173                            new IOException("Not an absolute URI."));
174                    if (errorHandler != null) {
175                        errorHandler.fatalError(spe);
176                    }
177                    throw spe;
178                }
179                String scheme = iri.getScheme();
180                if (!("http".equals(scheme) || "https".equals(scheme))) {
181                    String msg = "Unsupported URI scheme: \u201C" + scheme + "\u201D.";
182                    SAXParseException spe = new SAXParseException(
183                            msg, publicId,
184                            systemId, -1, -1, new IOException(msg));
185                    if (errorHandler != null) {
186                        errorHandler.fatalError(spe);
187                    }
188                    throw spe;
189                }
190                String host = iri.getHost();
191                if ("127.0.0.1".equals(host) || "localhost".equals(host)) {
192                    SAXParseException spe = new SAXParseException(
193                            "Attempted to connect to localhost.", publicId,
194                            systemId, -1, -1, new IOException("Attempted to connect to localhost."));
195                    if (errorHandler != null) {
196                        errorHandler.fatalError(spe);
197                    }
198                    throw spe;
199                }
200                try {
201                    systemId = iri.toASCIIString();
202                } catch (MalformedURLException e) {
203                    IOException ioe = (IOException) new IOException(e.getMessage()).initCause(e);
204                    SAXParseException spe = new SAXParseException(e.getMessage(),
205                            publicId, systemId, -1, -1, ioe);
206                    if (errorHandler != null) {
207                        errorHandler.fatalError(spe);
208                    }
209                    throw spe;
210                }
211                try {
212                    m = new GetMethod(systemId);
213                } catch (IllegalArgumentException e) {
214                    SAXParseException spe = new SAXParseException(
215                            e.getMessage(),
216                            publicId,
217                            systemId,
218                            -1,
219                            -1,
220                            (IOException) new IOException(e.getMessage()).initCause(e));
221                    if (errorHandler != null) {
222                        errorHandler.fatalError(spe);
223                    }
224                    throw spe;
225                }
226                m.setFollowRedirects(true);
227                m.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
228                m.addRequestHeader("Accept", buildAccept());
229                log4j.info(systemId);
230                client.executeMethod(m);
231                int statusCode = m.getStatusCode();
232                if (statusCode != 200) {
233                    String msg = "HTTP resource not retrievable. The HTTP status from the remote server was: " + statusCode + ".";
234                    SAXParseException spe = new SAXParseException(
235                            msg, publicId,
236                            m.getURI().toString(), -1, -1, new IOException(msg));
237                    if (errorHandler != null) {
238                        errorHandler.fatalError(spe);
239                    }
240                    throw spe;
241                }
242                long len = m.getResponseContentLength();
243                if (sizeLimit > -1 && len > sizeLimit) {
244                    SAXParseException spe = new SAXParseException(
245                            "Resource size exceeds limit.", publicId,
246                            m.getURI().toString(), -1, -1, new IOException("Resource size exceeds limit."));
247                    if (errorHandler != null) {
248                        errorHandler.fatalError(spe);
249                    }
250                    throw spe;
251                }
252                TypedInputSource is;
253                Header ct = m.getResponseHeader("Content-Type");
254                String contentType = null;
255                String baseUri = m.getURI().toString();
256                if (ct != null) {
257                    contentType = ct.getValue();
258                }
259                is = contentTypeParser.buildTypedInputSource(baseUri, publicId, contentType);
260                final GetMethod meth = m;
261                InputStream stream = m.getResponseBodyAsStream();
262                if (sizeLimit > -1) {
263                    stream = new BoundednputStream(stream, sizeLimit);
264                }
265                is.setByteStream(new ObservableInputStream(stream,
266                        new StreamObserver() {
267                            private final Logger log4j = Logger.getLogger("nu.validator.xml.PrudentEntityResolver.StreamObserver");
268    
269                            private boolean released = false;
270    
271                            public void closeCalled() {
272                                log4j.debug("closeCalled");
273                                if (!released) {
274                                    log4j.debug("closeCalled, not yet released");
275                                    released = true;
276                                    try {
277                                        meth.releaseConnection();
278                                    } catch (Exception e) {
279                                        log4j.debug(
280                                                "closeCalled, releaseConnection", e);
281                                    }
282                                }
283                            }
284    
285                            public void exceptionOccurred(Exception ex) {
286                                if (!released) {
287                                    released = true;
288                                    try {
289                                        meth.abort();
290                                    } catch (Exception e) {
291                                        log4j.debug("exceptionOccurred, abort", e);
292                                    } finally {
293                                        try {
294                                            meth.releaseConnection();
295                                        } catch (Exception e) {
296                                            log4j.debug(
297                                                    "exceptionOccurred, releaseConnection",
298                                                    e);
299                                        }
300                                    }
301                                }
302                            }
303    
304                            public void finalizerCalled() {
305                                if (!released) {
306                                    released = true;
307                                    try {
308                                        meth.abort();
309                                    } catch (Exception e) {
310                                        log4j.debug("finalizerCalled, abort", e);
311                                    } finally {
312                                        try {
313                                            meth.releaseConnection();
314                                        } catch (Exception e) {
315                                            log4j.debug(
316                                                    "finalizerCalled, releaseConnection",
317                                                    e);
318                                        }
319                                    }
320                                }
321                            }
322    
323                        }));
324                return is;
325            } catch (IOException e) {
326                if (m != null) {
327                    try {
328                        m.abort();
329                    } catch (Exception ex) {
330                        log4j.debug("abort", ex);
331                    } finally {
332                        try {
333                            m.releaseConnection();
334                        } catch (Exception ex) {
335                            log4j.debug("releaseConnection", ex);
336                        }
337                    }
338                }
339                throw e;
340            } catch (SAXException e) {
341                if (m != null) {
342                    try {
343                        m.abort();
344                    } catch (Exception ex) {
345                        log4j.debug("abort", ex);
346                    } finally {
347                        try {
348                            m.releaseConnection();
349                        } catch (Exception ex) {
350                            log4j.debug("releaseConnection", ex);
351                        }
352                    }
353                }
354                throw e;
355            } catch (RuntimeException e) {
356                if (m != null) {
357                    try {
358                        m.abort();
359                    } catch (Exception ex) {
360                        log4j.debug("abort", ex);
361                    } finally {
362                        try {
363                            m.releaseConnection();
364                        } catch (Exception ex) {
365                            log4j.debug("releaseConnection", ex);
366                        }
367                    }
368                }
369                throw e;
370            }
371        }
372    
373        /**
374         * @return Returns the allowRnc.
375         */
376        public boolean isAllowRnc() {
377            return allowRnc;
378        }
379    
380        /**
381         * @param allowRnc
382         *            The allowRnc to set.
383         */
384        public void setAllowRnc(boolean allowRnc) {
385            this.allowRnc = allowRnc;
386            this.contentTypeParser.setAllowRnc(allowRnc);
387        }
388    
389        /**
390         * @param b
391         */
392        public void setAllowHtml(boolean allowHtml) {
393            this.allowHtml = allowHtml;
394            this.contentTypeParser.setAllowHtml(allowHtml);
395        }
396    
397        /**
398         * Returns the acceptAllKnownXmlTypes.
399         * 
400         * @return the acceptAllKnownXmlTypes
401         */
402        public boolean isAcceptAllKnownXmlTypes() {
403            return acceptAllKnownXmlTypes;
404        }
405    
406        /**
407         * Sets the acceptAllKnownXmlTypes.
408         * 
409         * @param acceptAllKnownXmlTypes
410         *            the acceptAllKnownXmlTypes to set
411         */
412        public void setAcceptAllKnownXmlTypes(boolean acceptAllKnownXmlTypes) {
413            this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes;
414            this.contentTypeParser.setAcceptAllKnownXmlTypes(acceptAllKnownXmlTypes);
415        }
416    
417        /**
418         * Returns the allowGenericXml.
419         * 
420         * @return the allowGenericXml
421         */
422        public boolean isAllowGenericXml() {
423            return allowGenericXml;
424        }
425    
426        /**
427         * Sets the allowGenericXml.
428         * 
429         * @param allowGenericXml
430         *            the allowGenericXml to set
431         */
432        public void setAllowGenericXml(boolean allowGenericXml) {
433            this.allowGenericXml = allowGenericXml;
434            this.contentTypeParser.setAllowGenericXml(allowGenericXml);
435        }
436    
437        /**
438         * Returns the allowXhtml.
439         * 
440         * @return the allowXhtml
441         */
442        public boolean isAllowXhtml() {
443            return allowXhtml;
444        }
445    
446        /**
447         * Sets the allowXhtml.
448         * 
449         * @param allowXhtml
450         *            the allowXhtml to set
451         */
452        public void setAllowXhtml(boolean allowXhtml) {
453            this.allowXhtml = allowXhtml;
454            this.contentTypeParser.setAllowXhtml(allowXhtml);
455        }
456    
457        private String buildAccept() {
458            Set<String> types = new TreeSet<String>();
459            if (isAllowRnc()) {
460                types.add("application/relax-ng-compact-syntax");
461            }
462            if (isAllowHtml()) {
463                types.add("text/html; q=0.9");
464            }
465            if (isAllowXhtml()) {
466                types.add("application/xhtml+xml");
467                types.add("application/xml; q=0.5");
468            }
469            if (isAcceptAllKnownXmlTypes()) {
470                types.add("application/xhtml+xml");
471    //            types.add("application/atom+xml");
472                types.add("image/svg+xml");
473                types.add("application/docbook+xml");
474                types.add("application/xml; q=0.5");
475                types.add("text/xml; q=0.3");
476                types.add("*/*; q=0.1");
477            }
478            if (isAllowGenericXml()) {
479                types.add("application/xml; q=0.5");
480                types.add("text/xml; q=0.3");
481                types.add("*/*; q=0.1");
482            }
483            StringBuilder buf = new StringBuilder();
484            for (Iterator<String> iter = types.iterator(); iter.hasNext();) {
485                String str = iter.next();
486                buf.append(str);
487                buf.append(", ");
488            }
489            for (int i = 0; i < 2; i++) {
490                int len = buf.length();
491                if (len > 0) {
492                    buf.deleteCharAt(len - 1);
493                }
494            }
495            return buf.toString();
496        }
497    
498        /**
499         * Returns the allowHtml.
500         * 
501         * @return the allowHtml
502         */
503        public boolean isAllowHtml() {
504            return allowHtml;
505        }
506    
507        public boolean isOnlyHtmlAllowed() {
508            return !isAllowGenericXml() && !isAllowRnc() && !isAllowXhtml();
509        }
510    }