001 /*
002 * Copyright (c) 2005 Henri Sivonen
003 * Copyright (c) 2007 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.xml;
025
026 import java.io.IOException;
027 import java.io.InputStream;
028 import java.net.MalformedURLException;
029 import java.util.Iterator;
030 import java.util.Set;
031 import java.util.TreeSet;
032 import java.util.regex.Matcher;
033 import java.util.regex.Pattern;
034
035 import nu.validator.io.BoundednputStream;
036 import nu.validator.io.ObservableInputStream;
037 import nu.validator.io.StreamObserver;
038
039 import org.apache.commons.httpclient.Header;
040 import org.apache.commons.httpclient.HostConfiguration;
041 import org.apache.commons.httpclient.HttpClient;
042 import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
043 import org.apache.commons.httpclient.cookie.CookiePolicy;
044 import org.apache.commons.httpclient.methods.GetMethod;
045 import org.apache.commons.httpclient.params.HttpClientParams;
046 import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
047 import org.apache.log4j.Logger;
048 import org.xml.sax.EntityResolver;
049 import org.xml.sax.ErrorHandler;
050 import org.xml.sax.InputSource;
051 import org.xml.sax.SAXException;
052 import org.xml.sax.SAXParseException;
053
054 import com.hp.hpl.jena.iri.IRI;
055 import com.hp.hpl.jena.iri.IRIException;
056 import com.hp.hpl.jena.iri.IRIFactory;
057
058
059 /**
060 * @version $Id: PrudentHttpEntityResolver.java,v 1.1 2005/01/08 08:11:26
061 * hsivonen Exp $
062 * @author hsivonen
063 */
064 public class PrudentHttpEntityResolver implements EntityResolver {
065
066 private static final Logger log4j = Logger.getLogger(PrudentHttpEntityResolver.class);
067
068 private static final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
069
070 private static final HttpClient client = new HttpClient(manager);
071
072 private static int maxRequests;
073
074 private int sizeLimit;
075
076 private final ErrorHandler errorHandler;
077
078 private int requestsLeft;
079
080 private boolean laxContentType;
081
082 private boolean allowRnc = false;
083
084 private boolean allowHtml = false;
085
086 private boolean allowXhtml = false;
087
088 private boolean acceptAllKnownXmlTypes = false;
089
090 private boolean allowGenericXml = true;
091
092 private final IRIFactory iriFactory;
093
094 private final ContentTypeParser contentTypeParser;
095
096 /**
097 * Sets the timeouts of the HTTP client.
098 *
099 * @param connectionTimeout
100 * timeout until connection established in milliseconds. Zero
101 * means no timeout.
102 * @param socketTimeout
103 * timeout for waiting for data in milliseconds. Zero means no
104 * timeout.
105 */
106 public static void setParams(int connectionTimeout, int socketTimeout,
107 int maxRequests) {
108 HttpConnectionManagerParams hcmp = client.getHttpConnectionManager().getParams();
109 hcmp.setConnectionTimeout(connectionTimeout);
110 hcmp.setSoTimeout(socketTimeout);
111 hcmp.setMaxConnectionsPerHost(HostConfiguration.ANY_HOST_CONFIGURATION,
112 maxRequests);
113 hcmp.setMaxTotalConnections(maxRequests * 2);
114 PrudentHttpEntityResolver.maxRequests = maxRequests;
115 HttpClientParams hcp = client.getParams();
116 hcp.setBooleanParameter(HttpClientParams.ALLOW_CIRCULAR_REDIRECTS, true);
117 hcp.setIntParameter(HttpClientParams.MAX_REDIRECTS, 20); // Gecko default
118 }
119
120 public static void setUserAgent(String ua) {
121 client.getParams().setParameter("http.useragent", ua);
122 }
123
124 /**
125 * @param connectionTimeout
126 * @param socketTimeout
127 * @param sizeLimit
128 */
129 public PrudentHttpEntityResolver(int sizeLimit, boolean laxContentType,
130 ErrorHandler errorHandler) {
131 this.sizeLimit = sizeLimit;
132 this.requestsLeft = maxRequests;
133 this.laxContentType = laxContentType;
134 this.errorHandler = errorHandler;
135 this.iriFactory = new IRIFactory();
136 this.iriFactory.useSpecificationXMLSystemID(true);
137 this.iriFactory.useSchemeSpecificRules("http", true);
138 this.iriFactory.useSchemeSpecificRules("https", true);
139 this.contentTypeParser = new ContentTypeParser(errorHandler, laxContentType, this.allowRnc, this.allowHtml, this.allowXhtml, this.acceptAllKnownXmlTypes, this.allowGenericXml);
140 }
141
142 /**
143 * @see org.xml.sax.EntityResolver#resolveEntity(java.lang.String,
144 * java.lang.String)
145 */
146 public InputSource resolveEntity(String publicId, String systemId)
147 throws SAXException, IOException {
148 if (requestsLeft > -1) {
149 if (requestsLeft == 0) {
150 throw new IOException(
151 "Number of permitted HTTP requests exceeded.");
152 } else {
153 requestsLeft--;
154 }
155 }
156 GetMethod m = null;
157 try {
158 IRI iri;
159 try {
160 iri = iriFactory.construct(systemId);
161 } catch (IRIException e) {
162 IOException ioe = (IOException) new IOException(e.getMessage()).initCause(e);
163 SAXParseException spe = new SAXParseException(e.getMessage(),
164 publicId, systemId, -1, -1, ioe);
165 if (errorHandler != null) {
166 errorHandler.fatalError(spe);
167 }
168 throw spe;
169 }
170 if (!iri.isAbsolute()) {
171 SAXParseException spe = new SAXParseException(
172 "Not an absolute URI.", publicId, systemId, -1, -1,
173 new IOException("Not an absolute URI."));
174 if (errorHandler != null) {
175 errorHandler.fatalError(spe);
176 }
177 throw spe;
178 }
179 String scheme = iri.getScheme();
180 if (!("http".equals(scheme) || "https".equals(scheme))) {
181 String msg = "Unsupported URI scheme: \u201C" + scheme + "\u201D.";
182 SAXParseException spe = new SAXParseException(
183 msg, publicId,
184 systemId, -1, -1, new IOException(msg));
185 if (errorHandler != null) {
186 errorHandler.fatalError(spe);
187 }
188 throw spe;
189 }
190 String host = iri.getHost();
191 if ("127.0.0.1".equals(host) || "localhost".equals(host)) {
192 SAXParseException spe = new SAXParseException(
193 "Attempted to connect to localhost.", publicId,
194 systemId, -1, -1, new IOException("Attempted to connect to localhost."));
195 if (errorHandler != null) {
196 errorHandler.fatalError(spe);
197 }
198 throw spe;
199 }
200 try {
201 systemId = iri.toASCIIString();
202 } catch (MalformedURLException e) {
203 IOException ioe = (IOException) new IOException(e.getMessage()).initCause(e);
204 SAXParseException spe = new SAXParseException(e.getMessage(),
205 publicId, systemId, -1, -1, ioe);
206 if (errorHandler != null) {
207 errorHandler.fatalError(spe);
208 }
209 throw spe;
210 }
211 try {
212 m = new GetMethod(systemId);
213 } catch (IllegalArgumentException e) {
214 SAXParseException spe = new SAXParseException(
215 e.getMessage(),
216 publicId,
217 systemId,
218 -1,
219 -1,
220 (IOException) new IOException(e.getMessage()).initCause(e));
221 if (errorHandler != null) {
222 errorHandler.fatalError(spe);
223 }
224 throw spe;
225 }
226 m.setFollowRedirects(true);
227 m.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
228 m.addRequestHeader("Accept", buildAccept());
229 log4j.info(systemId);
230 client.executeMethod(m);
231 int statusCode = m.getStatusCode();
232 if (statusCode != 200) {
233 String msg = "HTTP resource not retrievable. The HTTP status from the remote server was: " + statusCode + ".";
234 SAXParseException spe = new SAXParseException(
235 msg, publicId,
236 m.getURI().toString(), -1, -1, new IOException(msg));
237 if (errorHandler != null) {
238 errorHandler.fatalError(spe);
239 }
240 throw spe;
241 }
242 long len = m.getResponseContentLength();
243 if (sizeLimit > -1 && len > sizeLimit) {
244 SAXParseException spe = new SAXParseException(
245 "Resource size exceeds limit.", publicId,
246 m.getURI().toString(), -1, -1, new IOException("Resource size exceeds limit."));
247 if (errorHandler != null) {
248 errorHandler.fatalError(spe);
249 }
250 throw spe;
251 }
252 TypedInputSource is;
253 Header ct = m.getResponseHeader("Content-Type");
254 String contentType = null;
255 String baseUri = m.getURI().toString();
256 if (ct != null) {
257 contentType = ct.getValue();
258 }
259 is = contentTypeParser.buildTypedInputSource(baseUri, publicId, contentType);
260 final GetMethod meth = m;
261 InputStream stream = m.getResponseBodyAsStream();
262 if (sizeLimit > -1) {
263 stream = new BoundednputStream(stream, sizeLimit);
264 }
265 is.setByteStream(new ObservableInputStream(stream,
266 new StreamObserver() {
267 private final Logger log4j = Logger.getLogger("nu.validator.xml.PrudentEntityResolver.StreamObserver");
268
269 private boolean released = false;
270
271 public void closeCalled() {
272 log4j.debug("closeCalled");
273 if (!released) {
274 log4j.debug("closeCalled, not yet released");
275 released = true;
276 try {
277 meth.releaseConnection();
278 } catch (Exception e) {
279 log4j.debug(
280 "closeCalled, releaseConnection", e);
281 }
282 }
283 }
284
285 public void exceptionOccurred(Exception ex) {
286 if (!released) {
287 released = true;
288 try {
289 meth.abort();
290 } catch (Exception e) {
291 log4j.debug("exceptionOccurred, abort", e);
292 } finally {
293 try {
294 meth.releaseConnection();
295 } catch (Exception e) {
296 log4j.debug(
297 "exceptionOccurred, releaseConnection",
298 e);
299 }
300 }
301 }
302 }
303
304 public void finalizerCalled() {
305 if (!released) {
306 released = true;
307 try {
308 meth.abort();
309 } catch (Exception e) {
310 log4j.debug("finalizerCalled, abort", e);
311 } finally {
312 try {
313 meth.releaseConnection();
314 } catch (Exception e) {
315 log4j.debug(
316 "finalizerCalled, releaseConnection",
317 e);
318 }
319 }
320 }
321 }
322
323 }));
324 return is;
325 } catch (IOException e) {
326 if (m != null) {
327 try {
328 m.abort();
329 } catch (Exception ex) {
330 log4j.debug("abort", ex);
331 } finally {
332 try {
333 m.releaseConnection();
334 } catch (Exception ex) {
335 log4j.debug("releaseConnection", ex);
336 }
337 }
338 }
339 throw e;
340 } catch (SAXException e) {
341 if (m != null) {
342 try {
343 m.abort();
344 } catch (Exception ex) {
345 log4j.debug("abort", ex);
346 } finally {
347 try {
348 m.releaseConnection();
349 } catch (Exception ex) {
350 log4j.debug("releaseConnection", ex);
351 }
352 }
353 }
354 throw e;
355 } catch (RuntimeException e) {
356 if (m != null) {
357 try {
358 m.abort();
359 } catch (Exception ex) {
360 log4j.debug("abort", ex);
361 } finally {
362 try {
363 m.releaseConnection();
364 } catch (Exception ex) {
365 log4j.debug("releaseConnection", ex);
366 }
367 }
368 }
369 throw e;
370 }
371 }
372
373 /**
374 * @return Returns the allowRnc.
375 */
376 public boolean isAllowRnc() {
377 return allowRnc;
378 }
379
380 /**
381 * @param allowRnc
382 * The allowRnc to set.
383 */
384 public void setAllowRnc(boolean allowRnc) {
385 this.allowRnc = allowRnc;
386 this.contentTypeParser.setAllowRnc(allowRnc);
387 }
388
389 /**
390 * @param b
391 */
392 public void setAllowHtml(boolean allowHtml) {
393 this.allowHtml = allowHtml;
394 this.contentTypeParser.setAllowHtml(allowHtml);
395 }
396
397 /**
398 * Returns the acceptAllKnownXmlTypes.
399 *
400 * @return the acceptAllKnownXmlTypes
401 */
402 public boolean isAcceptAllKnownXmlTypes() {
403 return acceptAllKnownXmlTypes;
404 }
405
406 /**
407 * Sets the acceptAllKnownXmlTypes.
408 *
409 * @param acceptAllKnownXmlTypes
410 * the acceptAllKnownXmlTypes to set
411 */
412 public void setAcceptAllKnownXmlTypes(boolean acceptAllKnownXmlTypes) {
413 this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes;
414 this.contentTypeParser.setAcceptAllKnownXmlTypes(acceptAllKnownXmlTypes);
415 }
416
417 /**
418 * Returns the allowGenericXml.
419 *
420 * @return the allowGenericXml
421 */
422 public boolean isAllowGenericXml() {
423 return allowGenericXml;
424 }
425
426 /**
427 * Sets the allowGenericXml.
428 *
429 * @param allowGenericXml
430 * the allowGenericXml to set
431 */
432 public void setAllowGenericXml(boolean allowGenericXml) {
433 this.allowGenericXml = allowGenericXml;
434 this.contentTypeParser.setAllowGenericXml(allowGenericXml);
435 }
436
437 /**
438 * Returns the allowXhtml.
439 *
440 * @return the allowXhtml
441 */
442 public boolean isAllowXhtml() {
443 return allowXhtml;
444 }
445
446 /**
447 * Sets the allowXhtml.
448 *
449 * @param allowXhtml
450 * the allowXhtml to set
451 */
452 public void setAllowXhtml(boolean allowXhtml) {
453 this.allowXhtml = allowXhtml;
454 this.contentTypeParser.setAllowXhtml(allowXhtml);
455 }
456
457 private String buildAccept() {
458 Set<String> types = new TreeSet<String>();
459 if (isAllowRnc()) {
460 types.add("application/relax-ng-compact-syntax");
461 }
462 if (isAllowHtml()) {
463 types.add("text/html; q=0.9");
464 }
465 if (isAllowXhtml()) {
466 types.add("application/xhtml+xml");
467 types.add("application/xml; q=0.5");
468 }
469 if (isAcceptAllKnownXmlTypes()) {
470 types.add("application/xhtml+xml");
471 // types.add("application/atom+xml");
472 types.add("image/svg+xml");
473 types.add("application/docbook+xml");
474 types.add("application/xml; q=0.5");
475 types.add("text/xml; q=0.3");
476 types.add("*/*; q=0.1");
477 }
478 if (isAllowGenericXml()) {
479 types.add("application/xml; q=0.5");
480 types.add("text/xml; q=0.3");
481 types.add("*/*; q=0.1");
482 }
483 StringBuilder buf = new StringBuilder();
484 for (Iterator<String> iter = types.iterator(); iter.hasNext();) {
485 String str = iter.next();
486 buf.append(str);
487 buf.append(", ");
488 }
489 for (int i = 0; i < 2; i++) {
490 int len = buf.length();
491 if (len > 0) {
492 buf.deleteCharAt(len - 1);
493 }
494 }
495 return buf.toString();
496 }
497
498 /**
499 * Returns the allowHtml.
500 *
501 * @return the allowHtml
502 */
503 public boolean isAllowHtml() {
504 return allowHtml;
505 }
506
507 public boolean isOnlyHtmlAllowed() {
508 return !isAllowGenericXml() && !isAllowRnc() && !isAllowXhtml();
509 }
510 }