001    /*
002     * Copyright (c) 2007 Mozilla Foundation
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package nu.validator.spec.html5;
024    
025    import java.io.IOException;
026    import java.util.HashMap;
027    import java.util.Map;
028    import java.util.regex.Matcher;
029    import java.util.regex.Pattern;
030    
031    import nu.validator.htmlparser.common.XmlViolationPolicy;
032    import nu.validator.htmlparser.sax.HtmlParser;
033    import nu.validator.saxtree.DocumentFragment;
034    import nu.validator.saxtree.TreeBuilder;
035    import nu.validator.spec.Spec;
036    import nu.validator.xml.AttributesImpl;
037    import nu.validator.xml.EmptyAttributes;
038    
039    import org.xml.sax.Attributes;
040    import org.xml.sax.ContentHandler;
041    import org.xml.sax.InputSource;
042    import org.xml.sax.Locator;
043    import org.xml.sax.SAXException;
044    
045    import com.thaiopensource.xml.util.Name;
046    
047    public class Html5SpecBuilder implements ContentHandler {
048    
049        private static final String NS = "http://www.w3.org/1999/xhtml";
050        
051        private static final String SPEC_URI = "http://www.whatwg.org/specs/web-apps/current-work/";
052        
053        private static final Pattern ELEMENT = Pattern.compile("^.*element\\s*$");
054        
055        private static final Pattern CONTEXT = Pattern.compile("^\\s*Contexts\\s+in\\s+which\\s+this\\s+element\\s+may\\s+be\\s+used:\\s*");
056    
057        private static final Pattern CONTENT_MODEL = Pattern.compile("^\\s*Content\\s+model:\\s*$");
058    
059        private static final Pattern ATTRIBUTES = Pattern.compile("^\\s*Element-specific\\s+attributes:\\s*$");
060        
061        private enum State {
062            AWAITING_HEADING,
063            IN_H4,
064            IN_CODE_IN_H4,
065            AWAITING_ELEMENT_DL,
066            IN_ELEMENT_DL_START,
067            IN_CONTEXT_DT,
068            CAPTURING_CONTEXT_DDS,
069            IN_CONTENT_MODEL_DT,
070            CAPTURING_CONTENT_MODEL_DDS,
071            IN_ATTRIBUTES_DT,
072            CAPTURING_ATTRIBUTES_DDS
073        }
074        
075        private State state = State.AWAITING_HEADING;
076        
077        private int captureDepth = 0;
078        
079        private String currentId;
080        
081        private StringBuilder nameText = new StringBuilder();
082        
083        private StringBuilder referenceText = new StringBuilder();
084        
085        private TreeBuilder fragmentBuilder;
086        
087        private Name currentName;
088    
089        private Map<Name, String> urisByElement = new HashMap<Name, String>();
090    
091        private Map<Name, DocumentFragment> contextsByElement = new HashMap<Name, DocumentFragment>();
092    
093        private Map<Name, DocumentFragment> contentModelsByElement = new HashMap<Name, DocumentFragment>();
094    
095        private Map<Name, DocumentFragment> attributesByElement = new HashMap<Name, DocumentFragment>();
096        
097        public static Spec parseSpec(InputSource in) throws IOException, SAXException {
098            HtmlParser parser = new HtmlParser(XmlViolationPolicy.ALTER_INFOSET);
099            Html5SpecBuilder handler = new Html5SpecBuilder();
100            parser.setContentHandler(handler);
101            parser.parse(in);
102            return handler.buildSpec();
103        }
104        
105        public static void main(String[] args) throws IOException, SAXException {
106            parseSpec(new InputSource("http://www.whatwg.org/specs/web-apps/current-work/"));
107        }
108        
109        private Spec buildSpec() {
110            return new Spec(urisByElement, contextsByElement, contentModelsByElement, attributesByElement);
111        }
112    
113        /**
114         * 
115         */
116        private Html5SpecBuilder() {
117            super();
118        }
119    
120        public void characters(char[] ch, int start, int length) throws SAXException {
121            switch(state) {
122                case AWAITING_HEADING:
123                    break;
124                case IN_H4:
125                    referenceText.append(ch, start, length);
126                    break;
127                case IN_CODE_IN_H4:
128                    nameText.append(ch, start, length);
129                    break;
130                case AWAITING_ELEMENT_DL:
131                    break;
132                case IN_ELEMENT_DL_START:
133                    break;
134                case IN_CONTEXT_DT:
135                case IN_CONTENT_MODEL_DT:
136                case IN_ATTRIBUTES_DT:
137                    referenceText.append(ch, start, length);
138                    break;
139                case CAPTURING_CONTEXT_DDS:
140                case CAPTURING_CONTENT_MODEL_DDS:
141                case CAPTURING_ATTRIBUTES_DDS:
142                    fragmentBuilder.characters(ch, start, length);
143                    break;
144            }
145        }
146    
147        public void endDocument() throws SAXException {
148            switch(state) {
149                case AWAITING_HEADING:
150                    // XXX finish
151                    break;
152                case IN_H4:
153                case IN_CODE_IN_H4:
154                case AWAITING_ELEMENT_DL:
155                case IN_ELEMENT_DL_START:
156                case IN_CONTEXT_DT:
157                case IN_CONTENT_MODEL_DT:
158                case IN_ATTRIBUTES_DT:
159                case CAPTURING_CONTEXT_DDS:
160                case CAPTURING_CONTENT_MODEL_DDS:
161                case CAPTURING_ATTRIBUTES_DDS:
162                    throw new SAXException(
163                    "Malformed spec: Wrong state for document end.");
164            }
165        }
166    
167        public void endElement(String uri, String localName, String qName) throws SAXException {
168            switch(state) {
169                case AWAITING_HEADING:
170                    break;
171                case IN_H4:
172                    if ("h4" == localName && NS == uri) {
173                        Matcher m = ELEMENT.matcher(referenceText);
174                        if (m.matches()) {
175                            String ln = nameText.toString().intern();
176                            if ("" == ln) {
177                                throw new SAXException(
178                                        "Malformed spec: no element currentName.");
179                            }
180                            if (currentId == null) {
181                                throw new SAXException(
182                                        "Malformed spec: no element id.");
183                            }
184                            currentName = new Name(NS, ln);
185                            urisByElement.put(currentName, SPEC_URI + "#" + currentId);
186                            state = State.AWAITING_ELEMENT_DL;
187                        } else {
188                            currentId = null;
189                            nameText.setLength(0);
190                            state = State.AWAITING_HEADING;
191                        }
192                    }
193                    break;
194                case IN_CODE_IN_H4:
195                    if ("code" == localName && NS == uri) {
196                        state = State.IN_H4;
197                    }
198                    break;
199                case AWAITING_ELEMENT_DL:
200                    break;
201                case IN_ELEMENT_DL_START:
202                    throw new SAXException(
203                            "Malformed spec: no children in element dl.");
204                case IN_CONTEXT_DT:
205                    if ("dt" == localName && NS == uri) {
206                        Matcher m = CONTEXT.matcher(referenceText);
207                        if (m.matches()) {
208                            state = State.CAPTURING_CONTEXT_DDS;
209                            captureDepth = 0;
210                            fragmentBuilder = new TreeBuilder(true, true);
211                        } else {
212                            throw new SAXException(
213                            "Malformed spec: Expected dt to be context dt but it was not.");                        
214                        }
215                    }
216                    break;
217                case IN_CONTENT_MODEL_DT:
218                    if ("dt" == localName && NS == uri) {
219                        Matcher m = CONTENT_MODEL.matcher(referenceText);
220                        if (m.matches()) {
221                            state = State.CAPTURING_CONTENT_MODEL_DDS;
222                            captureDepth = 0;
223                            fragmentBuilder = new TreeBuilder(true, true);
224                        } else {
225                            throw new SAXException(
226                            "Malformed spec: Expected dt to be context dt but it was not.");                        
227                        }
228                    }
229                    break;
230                case IN_ATTRIBUTES_DT:
231                    if ("dt" == localName && NS == uri) {
232                        Matcher m = ATTRIBUTES.matcher(referenceText);
233                        if (m.matches()) {
234                            state = State.CAPTURING_ATTRIBUTES_DDS;
235                            captureDepth = 0;
236                            fragmentBuilder = new TreeBuilder(true, true);
237                        } else {
238                            throw new SAXException(
239                            "Malformed spec: Expected dt to be context dt but it was not.");                        
240                        }
241                    }
242                    break;
243                case CAPTURING_CONTEXT_DDS:
244                case CAPTURING_CONTENT_MODEL_DDS:
245                case CAPTURING_ATTRIBUTES_DDS:
246                    if (captureDepth == 0) {
247                        throw new SAXException(
248                                "Malformed spec: Did not see following dt when capturing dds.");                    
249                    }
250                    captureDepth--;
251                    fragmentBuilder.endElement(uri, localName, qName);
252                    break;
253            }
254        }
255    
256        public void endPrefixMapping(String prefix) throws SAXException {
257        }
258    
259        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
260        }
261    
262        public void processingInstruction(String target, String data) throws SAXException {
263        }
264    
265        public void setDocumentLocator(Locator locator) {
266        }
267    
268        public void skippedEntity(String name) throws SAXException {
269        }
270    
271        public void startDocument() throws SAXException {
272            // TODO Auto-generated method stub
273            
274        }
275    
276        public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
277            switch(state) {
278                case AWAITING_HEADING:
279                    if ("h4" == localName && NS == uri) {
280                        referenceText.setLength(0);
281                        currentId = null;
282                        state = State.IN_H4;
283                    }
284                    break;
285                case IN_H4:
286                    if ("code" == localName && NS == uri) {
287                        nameText.setLength(0);
288                        state = State.IN_CODE_IN_H4;
289                    } else if ("dfn" == localName && NS == uri) {
290                        currentId = atts.getValue("", "id");
291                    } 
292                    break;
293                case IN_CODE_IN_H4:
294                    break;
295                case AWAITING_ELEMENT_DL:
296                    if ("dl" == localName && NS == uri && "element".equals(atts.getValue("", "class"))) {
297                        state = State.IN_ELEMENT_DL_START;
298                    }
299                    break;
300                case IN_ELEMENT_DL_START:
301                    if ("dt" == localName && NS == uri) {
302                        referenceText.setLength(0);
303                        state = State.IN_CONTEXT_DT;
304                    } else {
305                        throw new SAXException(
306                        "Malformed spec: Expected dt in dl.");                    
307                    } 
308                    break;
309                case IN_CONTEXT_DT:
310                case IN_CONTENT_MODEL_DT:
311                case IN_ATTRIBUTES_DT:
312                    throw new SAXException(
313                            "Malformed spec: Not expecting children in dts.");                    
314                case CAPTURING_CONTEXT_DDS:
315                case CAPTURING_CONTENT_MODEL_DDS:
316                case CAPTURING_ATTRIBUTES_DDS:
317                    if ("dt" == localName && NS == uri && captureDepth == 0) {
318                        DocumentFragment fragment = (DocumentFragment) fragmentBuilder.getRoot();
319                        fragmentBuilder = null;
320                        referenceText.setLength(0);
321                        if (state == State.CAPTURING_CONTEXT_DDS) {
322                            contextsByElement.put(currentName, fragment);
323                            state = State.IN_CONTENT_MODEL_DT;
324                        } else if (state == State.CAPTURING_CONTENT_MODEL_DDS) {
325                            contentModelsByElement.put(currentName, fragment);                        
326                            state = State.IN_ATTRIBUTES_DT;
327                        } else {
328                            attributesByElement.put(currentName, fragment);
329                            state = State.AWAITING_HEADING;
330                        }
331                    } else {
332                        captureDepth++;
333                        String href = null;
334                        if ("a" == localName && NS == uri && (href = atts.getValue("", "href")) != null) {
335                            if (href.startsWith("#")) {
336                                href = SPEC_URI + href;
337                            }
338                            AttributesImpl attributesImpl = new AttributesImpl();
339                            attributesImpl.addAttribute("href", href);
340                            fragmentBuilder.startElement(uri, localName, qName, attributesImpl);
341                        } else {
342                            fragmentBuilder.startElement(uri, localName, qName, EmptyAttributes.EMPTY_ATTRIBUTES);
343                        }
344                    }
345                    break;
346            }
347        }
348    
349        public void startPrefixMapping(String prefix, String uri) throws SAXException {
350        }
351        
352    }