001    /*
002     * Copyright (c) 2005 Henri Sivonen
003     * Copyright (c) 2007 Mozilla Foundation
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package nu.validator.xml;
025    
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
034    
035    public class ContentTypeParser {
036    
037        private static final Pattern CHARSET = Pattern.compile("^\\s*charset\\s*=\\s*(\\S+)\\s*$");
038        
039        private final ErrorHandler errorHandler;
040    
041        private boolean laxContentType;
042    
043        private boolean allowRnc = false;
044    
045        private boolean allowHtml = false;
046    
047        private boolean allowXhtml = false;
048    
049        private boolean acceptAllKnownXmlTypes = false;
050    
051        private boolean allowGenericXml = true;
052    
053        /**
054         * @param errorHandler
055         * @param laxContentType
056         * @param allowRnc
057         * @param allowHtml
058         * @param allowXhtml
059         * @param acceptAllKnownXmlTypes
060         * @param allowGenericXml
061         */
062        public ContentTypeParser(final ErrorHandler errorHandler, boolean laxContentType, boolean allowRnc, boolean allowHtml, boolean allowXhtml, boolean acceptAllKnownXmlTypes, boolean allowGenericXml) {
063            this.errorHandler = errorHandler;
064            this.laxContentType = laxContentType;
065            this.allowRnc = allowRnc;
066            this.allowHtml = allowHtml;
067            this.allowXhtml = allowXhtml;
068            this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes;
069            this.allowGenericXml = allowGenericXml;
070        }
071    
072        public ContentTypeParser(final ErrorHandler errorHandler, boolean laxContentType) {
073            this.errorHandler = errorHandler;
074            this.laxContentType = laxContentType;        
075        }
076        protected boolean xmlContentType(String type, InputSource is)
077                throws SAXException {
078            if ("application/xhtml-voice+xml".equals(type)) {
079                if (errorHandler != null) {
080                    errorHandler.warning(new SAXParseException(
081                            "application/xhtml-voice+xml is an obsolete type.",
082                            is.getPublicId(), is.getSystemId(), -1, -1));
083                }
084            }
085            boolean typeOk = "application/xml".equals(type)
086                    || "text/xml".equals(type) || type.endsWith("+xml")
087                    || "application/xml-external-parsed-entity".equals(type)
088                    || "text/xml-external-parsed-entity".equals(type)
089                    || "application/xml-dtd".equals(type)
090                    || "application/octet-stream".equals(type);
091            if (!typeOk && laxContentType) {
092                boolean laxOk = "text/plain".equals(type)
093                        || "text/html".equals(type) || "text/xsl".equals(type);
094                if (laxOk && errorHandler != null) {
095                    errorHandler.warning(new SAXParseException(
096                            "Being lax about non-XML Content-Type: " + type,
097                            is.getPublicId(), is.getSystemId(), -1, -1));
098                }
099                return laxOk;
100            } else {
101                return typeOk;
102            }
103        }
104    
105        
106        protected boolean rncContentType(String type, InputSource is)
107                throws SAXException {
108            boolean typeOk = "application/relax-ng-compact-syntax".equals(type);
109            if (!typeOk) {
110                typeOk = "application/vnd.relax-ng.rnc".equals(type);
111                if (typeOk && errorHandler != null) {
112                    errorHandler.warning(new SAXParseException(
113                            "application/vnd.relax-ng.rnc is an unregistered type. application/relax-ng-compact-syntax is the registered type.",
114                            is.getPublicId(), is.getSystemId(), -1, -1));
115                }
116            }
117            if (!typeOk) {
118                typeOk = "application/octet-stream".equals(type)
119                        && is.getSystemId().endsWith(".rnc");
120            }
121            if (!typeOk && laxContentType) {
122                boolean laxOk = "text/plain".equals(type)
123                        && is.getSystemId().endsWith(".rnc");
124                if (laxOk && errorHandler != null) {
125                    errorHandler.warning(new SAXParseException(
126                            "Being lax about non-RNC Content-Type: " + type,
127                            is.getPublicId(), is.getSystemId(), -1, -1));
128                }
129                return laxOk;
130            } else {
131                return typeOk;
132            }
133        }
134    
135        /**
136         * @param baseUri
137         * @param publicId
138         * @param contentType
139         * @return
140         * @throws SAXException
141         * @throws SAXParseException
142         */
143        public TypedInputSource buildTypedInputSource(String baseUri,
144                String publicId, String contentType)
145                throws SAXException, SAXParseException {
146            TypedInputSource is;
147            is = new TypedInputSource();
148            is.setPublicId(publicId);
149            is.setSystemId(baseUri);
150            if (contentType != null) {
151                String[] params = contentType.split(";");
152                String type = params[0].trim();
153                boolean wasRnc = false;
154                boolean wasHtml = false;
155                if (isAllowRnc()) {
156                    if (rncContentType(type, is)) {
157                        wasRnc = true;
158                        is.setType("application/relax-ng-compact-syntax");
159                    }
160                }
161                if (!wasRnc) {
162                    if (isAllowHtml()) {
163                        if ("text/html".equals(type)) {
164                            is.setType(type);
165                            wasHtml = true;
166                        } else if (isOnlyHtmlAllowed()) {
167                            if (laxContentType && "text/plain".equals(type)) {
168                                is.setType(type);
169                                wasHtml = true;
170                                if (errorHandler != null) {
171                                    errorHandler.warning(new SAXParseException(
172                                            "Being lax about non-HTML Content-Type: "
173                                                    + type, is.getPublicId(),
174                                            is.getSystemId(), -1, -1));
175                                }
176                            } else {
177                                String msg = "Non-HTML Content-Type: \u201C" + type
178                                        + "\u201D.";
179                                SAXParseException spe = new SAXParseException(msg,
180                                        publicId, baseUri, -1, -1, new IOException(
181                                                msg));
182                                if (errorHandler != null) {
183                                    errorHandler.fatalError(spe);
184                                }
185                                throw spe;
186                            }
187                        }
188                    }
189                    if (!wasHtml
190                            && (isAllowGenericXml() || isAllowXhtml() || isAcceptAllKnownXmlTypes())) {
191                        if (!xmlContentType(type, is)) {
192                            String msg = "Non-XML Content-Type: \u201C" + type
193                                    + "\u201D.";
194                            SAXParseException spe = new SAXParseException(msg,
195                                    publicId, baseUri, -1, -1, new IOException(msg));
196                            if (errorHandler != null) {
197                                errorHandler.fatalError(spe);
198                            }
199                            throw spe;
200                        } else {
201                            is.setType(type);
202                        }
203                    }
204                }
205                String charset = null;
206                for (int i = 1; i < params.length; i++) {
207                    Matcher matcher = CHARSET.matcher(params[i]);
208                    if (matcher.matches()) {
209                        charset = matcher.group(1);
210                        break;
211                    }
212                }
213                if (charset != null) {
214                    is.setEncoding(charset);
215                } else if (type.startsWith("text/") && !wasHtml) {
216                    if (laxContentType) {
217                        if (errorHandler != null) {
218                            errorHandler.warning(new SAXParseException(
219                                    "text/* type without a charset parameter seen. Would have defaulted to US-ASCII had the lax option not been chosen.",
220                                    is.getPublicId(), is.getSystemId(), -1, -1));
221                        }
222                    } else {
223                        is.setEncoding("US-ASCII");
224                        if (errorHandler != null) {
225                            errorHandler.warning(new SAXParseException(
226                                    "text/* type without a charset parameter seen. Defaulting to US-ASCII per section 3.1 of RFC 3023.",
227                                    is.getPublicId(), is.getSystemId(), -1, -1));
228                        }
229                    }
230                }
231            }
232            return is;
233        }
234    
235    
236        /**
237         * Returns the acceptAllKnownXmlTypes.
238         * 
239         * @return the acceptAllKnownXmlTypes
240         */
241        public boolean isAcceptAllKnownXmlTypes() {
242            return acceptAllKnownXmlTypes;
243        }
244    
245    
246        /**
247         * Sets the acceptAllKnownXmlTypes.
248         * 
249         * @param acceptAllKnownXmlTypes the acceptAllKnownXmlTypes to set
250         */
251        public void setAcceptAllKnownXmlTypes(boolean acceptAllKnownXmlTypes) {
252            this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes;
253        }
254    
255    
256        /**
257         * Returns the allowGenericXml.
258         * 
259         * @return the allowGenericXml
260         */
261        public boolean isAllowGenericXml() {
262            return allowGenericXml;
263        }
264    
265    
266        /**
267         * Sets the allowGenericXml.
268         * 
269         * @param allowGenericXml the allowGenericXml to set
270         */
271        public void setAllowGenericXml(boolean allowGenericXml) {
272            this.allowGenericXml = allowGenericXml;
273        }
274    
275    
276        /**
277         * Returns the allowHtml.
278         * 
279         * @return the allowHtml
280         */
281        public boolean isAllowHtml() {
282            return allowHtml;
283        }
284    
285    
286        /**
287         * Sets the allowHtml.
288         * 
289         * @param allowHtml the allowHtml to set
290         */
291        public void setAllowHtml(boolean allowHtml) {
292            this.allowHtml = allowHtml;
293        }
294    
295    
296        /**
297         * Returns the allowRnc.
298         * 
299         * @return the allowRnc
300         */
301        public boolean isAllowRnc() {
302            return allowRnc;
303        }
304    
305    
306        /**
307         * Sets the allowRnc.
308         * 
309         * @param allowRnc the allowRnc to set
310         */
311        public void setAllowRnc(boolean allowRnc) {
312            this.allowRnc = allowRnc;
313        }
314    
315    
316        /**
317         * Returns the allowXhtml.
318         * 
319         * @return the allowXhtml
320         */
321        public boolean isAllowXhtml() {
322            return allowXhtml;
323        }
324    
325    
326        /**
327         * Sets the allowXhtml.
328         * 
329         * @param allowXhtml the allowXhtml to set
330         */
331        public void setAllowXhtml(boolean allowXhtml) {
332            this.allowXhtml = allowXhtml;
333        }
334    
335    
336        /**
337         * Returns the laxContentType.
338         * 
339         * @return the laxContentType
340         */
341        public boolean isLaxContentType() {
342            return laxContentType;
343        }
344    
345    
346        /**
347         * Sets the laxContentType.
348         * 
349         * @param laxContentType the laxContentType to set
350         */
351        public void setLaxContentType(boolean laxContentType) {
352            this.laxContentType = laxContentType;
353        }
354    
355    
356        public boolean isOnlyHtmlAllowed() {
357            return !isAllowGenericXml() && !isAllowRnc() && !isAllowXhtml();
358        }
359    
360    }