001 /*
002 * Copyright (c) 2005 Henri Sivonen
003 * Copyright (c) 2007 Mozilla Foundation
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package nu.validator.xml;
025
026 import java.io.IOException;
027 import java.util.regex.Matcher;
028 import java.util.regex.Pattern;
029
030 import org.xml.sax.ErrorHandler;
031 import org.xml.sax.InputSource;
032 import org.xml.sax.SAXException;
033 import org.xml.sax.SAXParseException;
034
035 public class ContentTypeParser {
036
037 private static final Pattern CHARSET = Pattern.compile("^\\s*charset\\s*=\\s*(\\S+)\\s*$");
038
039 private final ErrorHandler errorHandler;
040
041 private boolean laxContentType;
042
043 private boolean allowRnc = false;
044
045 private boolean allowHtml = false;
046
047 private boolean allowXhtml = false;
048
049 private boolean acceptAllKnownXmlTypes = false;
050
051 private boolean allowGenericXml = true;
052
053 /**
054 * @param errorHandler
055 * @param laxContentType
056 * @param allowRnc
057 * @param allowHtml
058 * @param allowXhtml
059 * @param acceptAllKnownXmlTypes
060 * @param allowGenericXml
061 */
062 public ContentTypeParser(final ErrorHandler errorHandler, boolean laxContentType, boolean allowRnc, boolean allowHtml, boolean allowXhtml, boolean acceptAllKnownXmlTypes, boolean allowGenericXml) {
063 this.errorHandler = errorHandler;
064 this.laxContentType = laxContentType;
065 this.allowRnc = allowRnc;
066 this.allowHtml = allowHtml;
067 this.allowXhtml = allowXhtml;
068 this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes;
069 this.allowGenericXml = allowGenericXml;
070 }
071
072 public ContentTypeParser(final ErrorHandler errorHandler, boolean laxContentType) {
073 this.errorHandler = errorHandler;
074 this.laxContentType = laxContentType;
075 }
076 protected boolean xmlContentType(String type, InputSource is)
077 throws SAXException {
078 if ("application/xhtml-voice+xml".equals(type)) {
079 if (errorHandler != null) {
080 errorHandler.warning(new SAXParseException(
081 "application/xhtml-voice+xml is an obsolete type.",
082 is.getPublicId(), is.getSystemId(), -1, -1));
083 }
084 }
085 boolean typeOk = "application/xml".equals(type)
086 || "text/xml".equals(type) || type.endsWith("+xml")
087 || "application/xml-external-parsed-entity".equals(type)
088 || "text/xml-external-parsed-entity".equals(type)
089 || "application/xml-dtd".equals(type)
090 || "application/octet-stream".equals(type);
091 if (!typeOk && laxContentType) {
092 boolean laxOk = "text/plain".equals(type)
093 || "text/html".equals(type) || "text/xsl".equals(type);
094 if (laxOk && errorHandler != null) {
095 errorHandler.warning(new SAXParseException(
096 "Being lax about non-XML Content-Type: " + type,
097 is.getPublicId(), is.getSystemId(), -1, -1));
098 }
099 return laxOk;
100 } else {
101 return typeOk;
102 }
103 }
104
105
106 protected boolean rncContentType(String type, InputSource is)
107 throws SAXException {
108 boolean typeOk = "application/relax-ng-compact-syntax".equals(type);
109 if (!typeOk) {
110 typeOk = "application/vnd.relax-ng.rnc".equals(type);
111 if (typeOk && errorHandler != null) {
112 errorHandler.warning(new SAXParseException(
113 "application/vnd.relax-ng.rnc is an unregistered type. application/relax-ng-compact-syntax is the registered type.",
114 is.getPublicId(), is.getSystemId(), -1, -1));
115 }
116 }
117 if (!typeOk) {
118 typeOk = "application/octet-stream".equals(type)
119 && is.getSystemId().endsWith(".rnc");
120 }
121 if (!typeOk && laxContentType) {
122 boolean laxOk = "text/plain".equals(type)
123 && is.getSystemId().endsWith(".rnc");
124 if (laxOk && errorHandler != null) {
125 errorHandler.warning(new SAXParseException(
126 "Being lax about non-RNC Content-Type: " + type,
127 is.getPublicId(), is.getSystemId(), -1, -1));
128 }
129 return laxOk;
130 } else {
131 return typeOk;
132 }
133 }
134
135 /**
136 * @param baseUri
137 * @param publicId
138 * @param contentType
139 * @return
140 * @throws SAXException
141 * @throws SAXParseException
142 */
143 public TypedInputSource buildTypedInputSource(String baseUri,
144 String publicId, String contentType)
145 throws SAXException, SAXParseException {
146 TypedInputSource is;
147 is = new TypedInputSource();
148 is.setPublicId(publicId);
149 is.setSystemId(baseUri);
150 if (contentType != null) {
151 String[] params = contentType.split(";");
152 String type = params[0].trim();
153 boolean wasRnc = false;
154 boolean wasHtml = false;
155 if (isAllowRnc()) {
156 if (rncContentType(type, is)) {
157 wasRnc = true;
158 is.setType("application/relax-ng-compact-syntax");
159 }
160 }
161 if (!wasRnc) {
162 if (isAllowHtml()) {
163 if ("text/html".equals(type)) {
164 is.setType(type);
165 wasHtml = true;
166 } else if (isOnlyHtmlAllowed()) {
167 if (laxContentType && "text/plain".equals(type)) {
168 is.setType(type);
169 wasHtml = true;
170 if (errorHandler != null) {
171 errorHandler.warning(new SAXParseException(
172 "Being lax about non-HTML Content-Type: "
173 + type, is.getPublicId(),
174 is.getSystemId(), -1, -1));
175 }
176 } else {
177 String msg = "Non-HTML Content-Type: \u201C" + type
178 + "\u201D.";
179 SAXParseException spe = new SAXParseException(msg,
180 publicId, baseUri, -1, -1, new IOException(
181 msg));
182 if (errorHandler != null) {
183 errorHandler.fatalError(spe);
184 }
185 throw spe;
186 }
187 }
188 }
189 if (!wasHtml
190 && (isAllowGenericXml() || isAllowXhtml() || isAcceptAllKnownXmlTypes())) {
191 if (!xmlContentType(type, is)) {
192 String msg = "Non-XML Content-Type: \u201C" + type
193 + "\u201D.";
194 SAXParseException spe = new SAXParseException(msg,
195 publicId, baseUri, -1, -1, new IOException(msg));
196 if (errorHandler != null) {
197 errorHandler.fatalError(spe);
198 }
199 throw spe;
200 } else {
201 is.setType(type);
202 }
203 }
204 }
205 String charset = null;
206 for (int i = 1; i < params.length; i++) {
207 Matcher matcher = CHARSET.matcher(params[i]);
208 if (matcher.matches()) {
209 charset = matcher.group(1);
210 break;
211 }
212 }
213 if (charset != null) {
214 is.setEncoding(charset);
215 } else if (type.startsWith("text/") && !wasHtml) {
216 if (laxContentType) {
217 if (errorHandler != null) {
218 errorHandler.warning(new SAXParseException(
219 "text/* type without a charset parameter seen. Would have defaulted to US-ASCII had the lax option not been chosen.",
220 is.getPublicId(), is.getSystemId(), -1, -1));
221 }
222 } else {
223 is.setEncoding("US-ASCII");
224 if (errorHandler != null) {
225 errorHandler.warning(new SAXParseException(
226 "text/* type without a charset parameter seen. Defaulting to US-ASCII per section 3.1 of RFC 3023.",
227 is.getPublicId(), is.getSystemId(), -1, -1));
228 }
229 }
230 }
231 }
232 return is;
233 }
234
235
236 /**
237 * Returns the acceptAllKnownXmlTypes.
238 *
239 * @return the acceptAllKnownXmlTypes
240 */
241 public boolean isAcceptAllKnownXmlTypes() {
242 return acceptAllKnownXmlTypes;
243 }
244
245
246 /**
247 * Sets the acceptAllKnownXmlTypes.
248 *
249 * @param acceptAllKnownXmlTypes the acceptAllKnownXmlTypes to set
250 */
251 public void setAcceptAllKnownXmlTypes(boolean acceptAllKnownXmlTypes) {
252 this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes;
253 }
254
255
256 /**
257 * Returns the allowGenericXml.
258 *
259 * @return the allowGenericXml
260 */
261 public boolean isAllowGenericXml() {
262 return allowGenericXml;
263 }
264
265
266 /**
267 * Sets the allowGenericXml.
268 *
269 * @param allowGenericXml the allowGenericXml to set
270 */
271 public void setAllowGenericXml(boolean allowGenericXml) {
272 this.allowGenericXml = allowGenericXml;
273 }
274
275
276 /**
277 * Returns the allowHtml.
278 *
279 * @return the allowHtml
280 */
281 public boolean isAllowHtml() {
282 return allowHtml;
283 }
284
285
286 /**
287 * Sets the allowHtml.
288 *
289 * @param allowHtml the allowHtml to set
290 */
291 public void setAllowHtml(boolean allowHtml) {
292 this.allowHtml = allowHtml;
293 }
294
295
296 /**
297 * Returns the allowRnc.
298 *
299 * @return the allowRnc
300 */
301 public boolean isAllowRnc() {
302 return allowRnc;
303 }
304
305
306 /**
307 * Sets the allowRnc.
308 *
309 * @param allowRnc the allowRnc to set
310 */
311 public void setAllowRnc(boolean allowRnc) {
312 this.allowRnc = allowRnc;
313 }
314
315
316 /**
317 * Returns the allowXhtml.
318 *
319 * @return the allowXhtml
320 */
321 public boolean isAllowXhtml() {
322 return allowXhtml;
323 }
324
325
326 /**
327 * Sets the allowXhtml.
328 *
329 * @param allowXhtml the allowXhtml to set
330 */
331 public void setAllowXhtml(boolean allowXhtml) {
332 this.allowXhtml = allowXhtml;
333 }
334
335
336 /**
337 * Returns the laxContentType.
338 *
339 * @return the laxContentType
340 */
341 public boolean isLaxContentType() {
342 return laxContentType;
343 }
344
345
346 /**
347 * Sets the laxContentType.
348 *
349 * @param laxContentType the laxContentType to set
350 */
351 public void setLaxContentType(boolean laxContentType) {
352 this.laxContentType = laxContentType;
353 }
354
355
356 public boolean isOnlyHtmlAllowed() {
357 return !isAllowGenericXml() && !isAllowRnc() && !isAllowXhtml();
358 }
359
360 }