001 /* 002 * Copyright (c) 2005 Henri Sivonen 003 * Copyright (c) 2007 Mozilla Foundation 004 * 005 * Permission is hereby granted, free of charge, to any person obtaining a 006 * copy of this software and associated documentation files (the "Software"), 007 * to deal in the Software without restriction, including without limitation 008 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 009 * and/or sell copies of the Software, and to permit persons to whom the 010 * Software is furnished to do so, subject to the following conditions: 011 * 012 * The above copyright notice and this permission notice shall be included in 013 * all copies or substantial portions of the Software. 014 * 015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 021 * DEALINGS IN THE SOFTWARE. 022 */ 023 024 package nu.validator.xml; 025 026 import java.io.IOException; 027 import java.util.regex.Matcher; 028 import java.util.regex.Pattern; 029 030 import org.xml.sax.ErrorHandler; 031 import org.xml.sax.InputSource; 032 import org.xml.sax.SAXException; 033 import org.xml.sax.SAXParseException; 034 035 public class ContentTypeParser { 036 037 private static final Pattern CHARSET = Pattern.compile("^\\s*charset\\s*=\\s*(\\S+)\\s*$"); 038 039 private final ErrorHandler errorHandler; 040 041 private boolean laxContentType; 042 043 private boolean allowRnc = false; 044 045 private boolean allowHtml = false; 046 047 private boolean allowXhtml = false; 048 049 private boolean acceptAllKnownXmlTypes = false; 050 051 private boolean allowGenericXml = true; 052 053 /** 054 * @param errorHandler 055 * @param laxContentType 056 * @param allowRnc 057 * @param allowHtml 058 * @param allowXhtml 059 * @param acceptAllKnownXmlTypes 060 * @param allowGenericXml 061 */ 062 public ContentTypeParser(final ErrorHandler errorHandler, boolean laxContentType, boolean allowRnc, boolean allowHtml, boolean allowXhtml, boolean acceptAllKnownXmlTypes, boolean allowGenericXml) { 063 this.errorHandler = errorHandler; 064 this.laxContentType = laxContentType; 065 this.allowRnc = allowRnc; 066 this.allowHtml = allowHtml; 067 this.allowXhtml = allowXhtml; 068 this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes; 069 this.allowGenericXml = allowGenericXml; 070 } 071 072 public ContentTypeParser(final ErrorHandler errorHandler, boolean laxContentType) { 073 this.errorHandler = errorHandler; 074 this.laxContentType = laxContentType; 075 } 076 protected boolean xmlContentType(String type, InputSource is) 077 throws SAXException { 078 if ("application/xhtml-voice+xml".equals(type)) { 079 if (errorHandler != null) { 080 errorHandler.warning(new SAXParseException( 081 "application/xhtml-voice+xml is an obsolete type.", 082 is.getPublicId(), is.getSystemId(), -1, -1)); 083 } 084 } 085 boolean typeOk = "application/xml".equals(type) 086 || "text/xml".equals(type) || type.endsWith("+xml") 087 || "application/xml-external-parsed-entity".equals(type) 088 || "text/xml-external-parsed-entity".equals(type) 089 || "application/xml-dtd".equals(type) 090 || "application/octet-stream".equals(type); 091 if (!typeOk && laxContentType) { 092 boolean laxOk = "text/plain".equals(type) 093 || "text/html".equals(type) || "text/xsl".equals(type); 094 if (laxOk && errorHandler != null) { 095 errorHandler.warning(new SAXParseException( 096 "Being lax about non-XML Content-Type: " + type, 097 is.getPublicId(), is.getSystemId(), -1, -1)); 098 } 099 return laxOk; 100 } else { 101 return typeOk; 102 } 103 } 104 105 106 protected boolean rncContentType(String type, InputSource is) 107 throws SAXException { 108 boolean typeOk = "application/relax-ng-compact-syntax".equals(type); 109 if (!typeOk) { 110 typeOk = "application/vnd.relax-ng.rnc".equals(type); 111 if (typeOk && errorHandler != null) { 112 errorHandler.warning(new SAXParseException( 113 "application/vnd.relax-ng.rnc is an unregistered type. application/relax-ng-compact-syntax is the registered type.", 114 is.getPublicId(), is.getSystemId(), -1, -1)); 115 } 116 } 117 if (!typeOk) { 118 typeOk = "application/octet-stream".equals(type) 119 && is.getSystemId().endsWith(".rnc"); 120 } 121 if (!typeOk && laxContentType) { 122 boolean laxOk = "text/plain".equals(type) 123 && is.getSystemId().endsWith(".rnc"); 124 if (laxOk && errorHandler != null) { 125 errorHandler.warning(new SAXParseException( 126 "Being lax about non-RNC Content-Type: " + type, 127 is.getPublicId(), is.getSystemId(), -1, -1)); 128 } 129 return laxOk; 130 } else { 131 return typeOk; 132 } 133 } 134 135 /** 136 * @param baseUri 137 * @param publicId 138 * @param contentType 139 * @return 140 * @throws SAXException 141 * @throws SAXParseException 142 */ 143 public TypedInputSource buildTypedInputSource(String baseUri, 144 String publicId, String contentType) 145 throws SAXException, SAXParseException { 146 TypedInputSource is; 147 is = new TypedInputSource(); 148 is.setPublicId(publicId); 149 is.setSystemId(baseUri); 150 if (contentType != null) { 151 String[] params = contentType.split(";"); 152 String type = params[0].trim(); 153 boolean wasRnc = false; 154 boolean wasHtml = false; 155 if (isAllowRnc()) { 156 if (rncContentType(type, is)) { 157 wasRnc = true; 158 is.setType("application/relax-ng-compact-syntax"); 159 } 160 } 161 if (!wasRnc) { 162 if (isAllowHtml()) { 163 if ("text/html".equals(type)) { 164 is.setType(type); 165 wasHtml = true; 166 } else if (isOnlyHtmlAllowed()) { 167 if (laxContentType && "text/plain".equals(type)) { 168 is.setType(type); 169 wasHtml = true; 170 if (errorHandler != null) { 171 errorHandler.warning(new SAXParseException( 172 "Being lax about non-HTML Content-Type: " 173 + type, is.getPublicId(), 174 is.getSystemId(), -1, -1)); 175 } 176 } else { 177 String msg = "Non-HTML Content-Type: \u201C" + type 178 + "\u201D."; 179 SAXParseException spe = new SAXParseException(msg, 180 publicId, baseUri, -1, -1, new IOException( 181 msg)); 182 if (errorHandler != null) { 183 errorHandler.fatalError(spe); 184 } 185 throw spe; 186 } 187 } 188 } 189 if (!wasHtml 190 && (isAllowGenericXml() || isAllowXhtml() || isAcceptAllKnownXmlTypes())) { 191 if (!xmlContentType(type, is)) { 192 String msg = "Non-XML Content-Type: \u201C" + type 193 + "\u201D."; 194 SAXParseException spe = new SAXParseException(msg, 195 publicId, baseUri, -1, -1, new IOException(msg)); 196 if (errorHandler != null) { 197 errorHandler.fatalError(spe); 198 } 199 throw spe; 200 } else { 201 is.setType(type); 202 } 203 } 204 } 205 String charset = null; 206 for (int i = 1; i < params.length; i++) { 207 Matcher matcher = CHARSET.matcher(params[i]); 208 if (matcher.matches()) { 209 charset = matcher.group(1); 210 break; 211 } 212 } 213 if (charset != null) { 214 is.setEncoding(charset); 215 } else if (type.startsWith("text/") && !wasHtml) { 216 if (laxContentType) { 217 if (errorHandler != null) { 218 errorHandler.warning(new SAXParseException( 219 "text/* type without a charset parameter seen. Would have defaulted to US-ASCII had the lax option not been chosen.", 220 is.getPublicId(), is.getSystemId(), -1, -1)); 221 } 222 } else { 223 is.setEncoding("US-ASCII"); 224 if (errorHandler != null) { 225 errorHandler.warning(new SAXParseException( 226 "text/* type without a charset parameter seen. Defaulting to US-ASCII per section 3.1 of RFC 3023.", 227 is.getPublicId(), is.getSystemId(), -1, -1)); 228 } 229 } 230 } 231 } 232 return is; 233 } 234 235 236 /** 237 * Returns the acceptAllKnownXmlTypes. 238 * 239 * @return the acceptAllKnownXmlTypes 240 */ 241 public boolean isAcceptAllKnownXmlTypes() { 242 return acceptAllKnownXmlTypes; 243 } 244 245 246 /** 247 * Sets the acceptAllKnownXmlTypes. 248 * 249 * @param acceptAllKnownXmlTypes the acceptAllKnownXmlTypes to set 250 */ 251 public void setAcceptAllKnownXmlTypes(boolean acceptAllKnownXmlTypes) { 252 this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes; 253 } 254 255 256 /** 257 * Returns the allowGenericXml. 258 * 259 * @return the allowGenericXml 260 */ 261 public boolean isAllowGenericXml() { 262 return allowGenericXml; 263 } 264 265 266 /** 267 * Sets the allowGenericXml. 268 * 269 * @param allowGenericXml the allowGenericXml to set 270 */ 271 public void setAllowGenericXml(boolean allowGenericXml) { 272 this.allowGenericXml = allowGenericXml; 273 } 274 275 276 /** 277 * Returns the allowHtml. 278 * 279 * @return the allowHtml 280 */ 281 public boolean isAllowHtml() { 282 return allowHtml; 283 } 284 285 286 /** 287 * Sets the allowHtml. 288 * 289 * @param allowHtml the allowHtml to set 290 */ 291 public void setAllowHtml(boolean allowHtml) { 292 this.allowHtml = allowHtml; 293 } 294 295 296 /** 297 * Returns the allowRnc. 298 * 299 * @return the allowRnc 300 */ 301 public boolean isAllowRnc() { 302 return allowRnc; 303 } 304 305 306 /** 307 * Sets the allowRnc. 308 * 309 * @param allowRnc the allowRnc to set 310 */ 311 public void setAllowRnc(boolean allowRnc) { 312 this.allowRnc = allowRnc; 313 } 314 315 316 /** 317 * Returns the allowXhtml. 318 * 319 * @return the allowXhtml 320 */ 321 public boolean isAllowXhtml() { 322 return allowXhtml; 323 } 324 325 326 /** 327 * Sets the allowXhtml. 328 * 329 * @param allowXhtml the allowXhtml to set 330 */ 331 public void setAllowXhtml(boolean allowXhtml) { 332 this.allowXhtml = allowXhtml; 333 } 334 335 336 /** 337 * Returns the laxContentType. 338 * 339 * @return the laxContentType 340 */ 341 public boolean isLaxContentType() { 342 return laxContentType; 343 } 344 345 346 /** 347 * Sets the laxContentType. 348 * 349 * @param laxContentType the laxContentType to set 350 */ 351 public void setLaxContentType(boolean laxContentType) { 352 this.laxContentType = laxContentType; 353 } 354 355 356 public boolean isOnlyHtmlAllowed() { 357 return !isAllowGenericXml() && !isAllowRnc() && !isAllowXhtml(); 358 } 359 360 }