001 /* 002 * Copyright (c) 2005 Henri Sivonen 003 * Copyright (c) 2007 Mozilla Foundation 004 * 005 * Permission is hereby granted, free of charge, to any person obtaining a 006 * copy of this software and associated documentation files (the "Software"), 007 * to deal in the Software without restriction, including without limitation 008 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 009 * and/or sell copies of the Software, and to permit persons to whom the 010 * Software is furnished to do so, subject to the following conditions: 011 * 012 * The above copyright notice and this permission notice shall be included in 013 * all copies or substantial portions of the Software. 014 * 015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 021 * DEALINGS IN THE SOFTWARE. 022 */ 023 024 package nu.validator.xml; 025 026 import java.io.IOException; 027 import java.io.InputStream; 028 import java.net.MalformedURLException; 029 import java.util.Iterator; 030 import java.util.Set; 031 import java.util.TreeSet; 032 import java.util.regex.Matcher; 033 import java.util.regex.Pattern; 034 035 import nu.validator.io.BoundednputStream; 036 import nu.validator.io.ObservableInputStream; 037 import nu.validator.io.StreamObserver; 038 039 import org.apache.commons.httpclient.Header; 040 import org.apache.commons.httpclient.HostConfiguration; 041 import org.apache.commons.httpclient.HttpClient; 042 import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; 043 import org.apache.commons.httpclient.cookie.CookiePolicy; 044 import org.apache.commons.httpclient.methods.GetMethod; 045 import org.apache.commons.httpclient.params.HttpClientParams; 046 import org.apache.commons.httpclient.params.HttpConnectionManagerParams; 047 import org.apache.log4j.Logger; 048 import org.xml.sax.EntityResolver; 049 import org.xml.sax.ErrorHandler; 050 import org.xml.sax.InputSource; 051 import org.xml.sax.SAXException; 052 import org.xml.sax.SAXParseException; 053 054 import com.hp.hpl.jena.iri.IRI; 055 import com.hp.hpl.jena.iri.IRIException; 056 import com.hp.hpl.jena.iri.IRIFactory; 057 058 059 /** 060 * @version $Id: PrudentHttpEntityResolver.java,v 1.1 2005/01/08 08:11:26 061 * hsivonen Exp $ 062 * @author hsivonen 063 */ 064 public class PrudentHttpEntityResolver implements EntityResolver { 065 066 private static final Logger log4j = Logger.getLogger(PrudentHttpEntityResolver.class); 067 068 private static final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager(); 069 070 private static final HttpClient client = new HttpClient(manager); 071 072 private static int maxRequests; 073 074 private int sizeLimit; 075 076 private final ErrorHandler errorHandler; 077 078 private int requestsLeft; 079 080 private boolean laxContentType; 081 082 private boolean allowRnc = false; 083 084 private boolean allowHtml = false; 085 086 private boolean allowXhtml = false; 087 088 private boolean acceptAllKnownXmlTypes = false; 089 090 private boolean allowGenericXml = true; 091 092 private final IRIFactory iriFactory; 093 094 private final ContentTypeParser contentTypeParser; 095 096 /** 097 * Sets the timeouts of the HTTP client. 098 * 099 * @param connectionTimeout 100 * timeout until connection established in milliseconds. Zero 101 * means no timeout. 102 * @param socketTimeout 103 * timeout for waiting for data in milliseconds. Zero means no 104 * timeout. 105 */ 106 public static void setParams(int connectionTimeout, int socketTimeout, 107 int maxRequests) { 108 HttpConnectionManagerParams hcmp = client.getHttpConnectionManager().getParams(); 109 hcmp.setConnectionTimeout(connectionTimeout); 110 hcmp.setSoTimeout(socketTimeout); 111 hcmp.setMaxConnectionsPerHost(HostConfiguration.ANY_HOST_CONFIGURATION, 112 maxRequests); 113 hcmp.setMaxTotalConnections(maxRequests * 2); 114 PrudentHttpEntityResolver.maxRequests = maxRequests; 115 HttpClientParams hcp = client.getParams(); 116 hcp.setBooleanParameter(HttpClientParams.ALLOW_CIRCULAR_REDIRECTS, true); 117 hcp.setIntParameter(HttpClientParams.MAX_REDIRECTS, 20); // Gecko default 118 } 119 120 public static void setUserAgent(String ua) { 121 client.getParams().setParameter("http.useragent", ua); 122 } 123 124 /** 125 * @param connectionTimeout 126 * @param socketTimeout 127 * @param sizeLimit 128 */ 129 public PrudentHttpEntityResolver(int sizeLimit, boolean laxContentType, 130 ErrorHandler errorHandler) { 131 this.sizeLimit = sizeLimit; 132 this.requestsLeft = maxRequests; 133 this.laxContentType = laxContentType; 134 this.errorHandler = errorHandler; 135 this.iriFactory = new IRIFactory(); 136 this.iriFactory.useSpecificationXMLSystemID(true); 137 this.iriFactory.useSchemeSpecificRules("http", true); 138 this.iriFactory.useSchemeSpecificRules("https", true); 139 this.contentTypeParser = new ContentTypeParser(errorHandler, laxContentType, this.allowRnc, this.allowHtml, this.allowXhtml, this.acceptAllKnownXmlTypes, this.allowGenericXml); 140 } 141 142 /** 143 * @see org.xml.sax.EntityResolver#resolveEntity(java.lang.String, 144 * java.lang.String) 145 */ 146 public InputSource resolveEntity(String publicId, String systemId) 147 throws SAXException, IOException { 148 if (requestsLeft > -1) { 149 if (requestsLeft == 0) { 150 throw new IOException( 151 "Number of permitted HTTP requests exceeded."); 152 } else { 153 requestsLeft--; 154 } 155 } 156 GetMethod m = null; 157 try { 158 IRI iri; 159 try { 160 iri = iriFactory.construct(systemId); 161 } catch (IRIException e) { 162 IOException ioe = (IOException) new IOException(e.getMessage()).initCause(e); 163 SAXParseException spe = new SAXParseException(e.getMessage(), 164 publicId, systemId, -1, -1, ioe); 165 if (errorHandler != null) { 166 errorHandler.fatalError(spe); 167 } 168 throw spe; 169 } 170 if (!iri.isAbsolute()) { 171 SAXParseException spe = new SAXParseException( 172 "Not an absolute URI.", publicId, systemId, -1, -1, 173 new IOException("Not an absolute URI.")); 174 if (errorHandler != null) { 175 errorHandler.fatalError(spe); 176 } 177 throw spe; 178 } 179 String scheme = iri.getScheme(); 180 if (!("http".equals(scheme) || "https".equals(scheme))) { 181 String msg = "Unsupported URI scheme: \u201C" + scheme + "\u201D."; 182 SAXParseException spe = new SAXParseException( 183 msg, publicId, 184 systemId, -1, -1, new IOException(msg)); 185 if (errorHandler != null) { 186 errorHandler.fatalError(spe); 187 } 188 throw spe; 189 } 190 String host = iri.getHost(); 191 if ("127.0.0.1".equals(host) || "localhost".equals(host)) { 192 SAXParseException spe = new SAXParseException( 193 "Attempted to connect to localhost.", publicId, 194 systemId, -1, -1, new IOException("Attempted to connect to localhost.")); 195 if (errorHandler != null) { 196 errorHandler.fatalError(spe); 197 } 198 throw spe; 199 } 200 try { 201 systemId = iri.toASCIIString(); 202 } catch (MalformedURLException e) { 203 IOException ioe = (IOException) new IOException(e.getMessage()).initCause(e); 204 SAXParseException spe = new SAXParseException(e.getMessage(), 205 publicId, systemId, -1, -1, ioe); 206 if (errorHandler != null) { 207 errorHandler.fatalError(spe); 208 } 209 throw spe; 210 } 211 try { 212 m = new GetMethod(systemId); 213 } catch (IllegalArgumentException e) { 214 SAXParseException spe = new SAXParseException( 215 e.getMessage(), 216 publicId, 217 systemId, 218 -1, 219 -1, 220 (IOException) new IOException(e.getMessage()).initCause(e)); 221 if (errorHandler != null) { 222 errorHandler.fatalError(spe); 223 } 224 throw spe; 225 } 226 m.setFollowRedirects(true); 227 m.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES); 228 m.addRequestHeader("Accept", buildAccept()); 229 log4j.info(systemId); 230 client.executeMethod(m); 231 int statusCode = m.getStatusCode(); 232 if (statusCode != 200) { 233 String msg = "HTTP resource not retrievable. The HTTP status from the remote server was: " + statusCode + "."; 234 SAXParseException spe = new SAXParseException( 235 msg, publicId, 236 m.getURI().toString(), -1, -1, new IOException(msg)); 237 if (errorHandler != null) { 238 errorHandler.fatalError(spe); 239 } 240 throw spe; 241 } 242 long len = m.getResponseContentLength(); 243 if (sizeLimit > -1 && len > sizeLimit) { 244 SAXParseException spe = new SAXParseException( 245 "Resource size exceeds limit.", publicId, 246 m.getURI().toString(), -1, -1, new IOException("Resource size exceeds limit.")); 247 if (errorHandler != null) { 248 errorHandler.fatalError(spe); 249 } 250 throw spe; 251 } 252 TypedInputSource is; 253 Header ct = m.getResponseHeader("Content-Type"); 254 String contentType = null; 255 String baseUri = m.getURI().toString(); 256 if (ct != null) { 257 contentType = ct.getValue(); 258 } 259 is = contentTypeParser.buildTypedInputSource(baseUri, publicId, contentType); 260 final GetMethod meth = m; 261 InputStream stream = m.getResponseBodyAsStream(); 262 if (sizeLimit > -1) { 263 stream = new BoundednputStream(stream, sizeLimit); 264 } 265 is.setByteStream(new ObservableInputStream(stream, 266 new StreamObserver() { 267 private final Logger log4j = Logger.getLogger("nu.validator.xml.PrudentEntityResolver.StreamObserver"); 268 269 private boolean released = false; 270 271 public void closeCalled() { 272 log4j.debug("closeCalled"); 273 if (!released) { 274 log4j.debug("closeCalled, not yet released"); 275 released = true; 276 try { 277 meth.releaseConnection(); 278 } catch (Exception e) { 279 log4j.debug( 280 "closeCalled, releaseConnection", e); 281 } 282 } 283 } 284 285 public void exceptionOccurred(Exception ex) { 286 if (!released) { 287 released = true; 288 try { 289 meth.abort(); 290 } catch (Exception e) { 291 log4j.debug("exceptionOccurred, abort", e); 292 } finally { 293 try { 294 meth.releaseConnection(); 295 } catch (Exception e) { 296 log4j.debug( 297 "exceptionOccurred, releaseConnection", 298 e); 299 } 300 } 301 } 302 } 303 304 public void finalizerCalled() { 305 if (!released) { 306 released = true; 307 try { 308 meth.abort(); 309 } catch (Exception e) { 310 log4j.debug("finalizerCalled, abort", e); 311 } finally { 312 try { 313 meth.releaseConnection(); 314 } catch (Exception e) { 315 log4j.debug( 316 "finalizerCalled, releaseConnection", 317 e); 318 } 319 } 320 } 321 } 322 323 })); 324 return is; 325 } catch (IOException e) { 326 if (m != null) { 327 try { 328 m.abort(); 329 } catch (Exception ex) { 330 log4j.debug("abort", ex); 331 } finally { 332 try { 333 m.releaseConnection(); 334 } catch (Exception ex) { 335 log4j.debug("releaseConnection", ex); 336 } 337 } 338 } 339 throw e; 340 } catch (SAXException e) { 341 if (m != null) { 342 try { 343 m.abort(); 344 } catch (Exception ex) { 345 log4j.debug("abort", ex); 346 } finally { 347 try { 348 m.releaseConnection(); 349 } catch (Exception ex) { 350 log4j.debug("releaseConnection", ex); 351 } 352 } 353 } 354 throw e; 355 } catch (RuntimeException e) { 356 if (m != null) { 357 try { 358 m.abort(); 359 } catch (Exception ex) { 360 log4j.debug("abort", ex); 361 } finally { 362 try { 363 m.releaseConnection(); 364 } catch (Exception ex) { 365 log4j.debug("releaseConnection", ex); 366 } 367 } 368 } 369 throw e; 370 } 371 } 372 373 /** 374 * @return Returns the allowRnc. 375 */ 376 public boolean isAllowRnc() { 377 return allowRnc; 378 } 379 380 /** 381 * @param allowRnc 382 * The allowRnc to set. 383 */ 384 public void setAllowRnc(boolean allowRnc) { 385 this.allowRnc = allowRnc; 386 this.contentTypeParser.setAllowRnc(allowRnc); 387 } 388 389 /** 390 * @param b 391 */ 392 public void setAllowHtml(boolean allowHtml) { 393 this.allowHtml = allowHtml; 394 this.contentTypeParser.setAllowHtml(allowHtml); 395 } 396 397 /** 398 * Returns the acceptAllKnownXmlTypes. 399 * 400 * @return the acceptAllKnownXmlTypes 401 */ 402 public boolean isAcceptAllKnownXmlTypes() { 403 return acceptAllKnownXmlTypes; 404 } 405 406 /** 407 * Sets the acceptAllKnownXmlTypes. 408 * 409 * @param acceptAllKnownXmlTypes 410 * the acceptAllKnownXmlTypes to set 411 */ 412 public void setAcceptAllKnownXmlTypes(boolean acceptAllKnownXmlTypes) { 413 this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes; 414 this.contentTypeParser.setAcceptAllKnownXmlTypes(acceptAllKnownXmlTypes); 415 } 416 417 /** 418 * Returns the allowGenericXml. 419 * 420 * @return the allowGenericXml 421 */ 422 public boolean isAllowGenericXml() { 423 return allowGenericXml; 424 } 425 426 /** 427 * Sets the allowGenericXml. 428 * 429 * @param allowGenericXml 430 * the allowGenericXml to set 431 */ 432 public void setAllowGenericXml(boolean allowGenericXml) { 433 this.allowGenericXml = allowGenericXml; 434 this.contentTypeParser.setAllowGenericXml(allowGenericXml); 435 } 436 437 /** 438 * Returns the allowXhtml. 439 * 440 * @return the allowXhtml 441 */ 442 public boolean isAllowXhtml() { 443 return allowXhtml; 444 } 445 446 /** 447 * Sets the allowXhtml. 448 * 449 * @param allowXhtml 450 * the allowXhtml to set 451 */ 452 public void setAllowXhtml(boolean allowXhtml) { 453 this.allowXhtml = allowXhtml; 454 this.contentTypeParser.setAllowXhtml(allowXhtml); 455 } 456 457 private String buildAccept() { 458 Set<String> types = new TreeSet<String>(); 459 if (isAllowRnc()) { 460 types.add("application/relax-ng-compact-syntax"); 461 } 462 if (isAllowHtml()) { 463 types.add("text/html; q=0.9"); 464 } 465 if (isAllowXhtml()) { 466 types.add("application/xhtml+xml"); 467 types.add("application/xml; q=0.5"); 468 } 469 if (isAcceptAllKnownXmlTypes()) { 470 types.add("application/xhtml+xml"); 471 // types.add("application/atom+xml"); 472 types.add("image/svg+xml"); 473 types.add("application/docbook+xml"); 474 types.add("application/xml; q=0.5"); 475 types.add("text/xml; q=0.3"); 476 types.add("*/*; q=0.1"); 477 } 478 if (isAllowGenericXml()) { 479 types.add("application/xml; q=0.5"); 480 types.add("text/xml; q=0.3"); 481 types.add("*/*; q=0.1"); 482 } 483 StringBuilder buf = new StringBuilder(); 484 for (Iterator<String> iter = types.iterator(); iter.hasNext();) { 485 String str = iter.next(); 486 buf.append(str); 487 buf.append(", "); 488 } 489 for (int i = 0; i < 2; i++) { 490 int len = buf.length(); 491 if (len > 0) { 492 buf.deleteCharAt(len - 1); 493 } 494 } 495 return buf.toString(); 496 } 497 498 /** 499 * Returns the allowHtml. 500 * 501 * @return the allowHtml 502 */ 503 public boolean isAllowHtml() { 504 return allowHtml; 505 } 506 507 public boolean isOnlyHtmlAllowed() { 508 return !isAllowGenericXml() && !isAllowRnc() && !isAllowXhtml(); 509 } 510 }