001 /* 002 * Copyright (c) 2007 Mozilla Foundation 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package nu.validator.spec.html5; 024 025 import java.io.IOException; 026 import java.util.HashMap; 027 import java.util.Map; 028 import java.util.regex.Matcher; 029 import java.util.regex.Pattern; 030 031 import nu.validator.htmlparser.common.XmlViolationPolicy; 032 import nu.validator.htmlparser.sax.HtmlParser; 033 import nu.validator.saxtree.DocumentFragment; 034 import nu.validator.saxtree.TreeBuilder; 035 import nu.validator.spec.Spec; 036 import nu.validator.xml.AttributesImpl; 037 import nu.validator.xml.EmptyAttributes; 038 039 import org.xml.sax.Attributes; 040 import org.xml.sax.ContentHandler; 041 import org.xml.sax.InputSource; 042 import org.xml.sax.Locator; 043 import org.xml.sax.SAXException; 044 045 import com.thaiopensource.xml.util.Name; 046 047 public class Html5SpecBuilder implements ContentHandler { 048 049 private static final String NS = "http://www.w3.org/1999/xhtml"; 050 051 private static final String SPEC_URI = "http://www.whatwg.org/specs/web-apps/current-work/"; 052 053 private static final Pattern ELEMENT = Pattern.compile("^.*element\\s*$"); 054 055 private static final Pattern CONTEXT = Pattern.compile("^\\s*Contexts\\s+in\\s+which\\s+this\\s+element\\s+may\\s+be\\s+used:\\s*"); 056 057 private static final Pattern CONTENT_MODEL = Pattern.compile("^\\s*Content\\s+model:\\s*$"); 058 059 private static final Pattern ATTRIBUTES = Pattern.compile("^\\s*Element-specific\\s+attributes:\\s*$"); 060 061 private enum State { 062 AWAITING_HEADING, 063 IN_H4, 064 IN_CODE_IN_H4, 065 AWAITING_ELEMENT_DL, 066 IN_ELEMENT_DL_START, 067 IN_CONTEXT_DT, 068 CAPTURING_CONTEXT_DDS, 069 IN_CONTENT_MODEL_DT, 070 CAPTURING_CONTENT_MODEL_DDS, 071 IN_ATTRIBUTES_DT, 072 CAPTURING_ATTRIBUTES_DDS 073 } 074 075 private State state = State.AWAITING_HEADING; 076 077 private int captureDepth = 0; 078 079 private String currentId; 080 081 private StringBuilder nameText = new StringBuilder(); 082 083 private StringBuilder referenceText = new StringBuilder(); 084 085 private TreeBuilder fragmentBuilder; 086 087 private Name currentName; 088 089 private Map<Name, String> urisByElement = new HashMap<Name, String>(); 090 091 private Map<Name, DocumentFragment> contextsByElement = new HashMap<Name, DocumentFragment>(); 092 093 private Map<Name, DocumentFragment> contentModelsByElement = new HashMap<Name, DocumentFragment>(); 094 095 private Map<Name, DocumentFragment> attributesByElement = new HashMap<Name, DocumentFragment>(); 096 097 public static Spec parseSpec(InputSource in) throws IOException, SAXException { 098 HtmlParser parser = new HtmlParser(XmlViolationPolicy.ALTER_INFOSET); 099 Html5SpecBuilder handler = new Html5SpecBuilder(); 100 parser.setContentHandler(handler); 101 parser.parse(in); 102 return handler.buildSpec(); 103 } 104 105 public static void main(String[] args) throws IOException, SAXException { 106 parseSpec(new InputSource("http://www.whatwg.org/specs/web-apps/current-work/")); 107 } 108 109 private Spec buildSpec() { 110 return new Spec(urisByElement, contextsByElement, contentModelsByElement, attributesByElement); 111 } 112 113 /** 114 * 115 */ 116 private Html5SpecBuilder() { 117 super(); 118 } 119 120 public void characters(char[] ch, int start, int length) throws SAXException { 121 switch(state) { 122 case AWAITING_HEADING: 123 break; 124 case IN_H4: 125 referenceText.append(ch, start, length); 126 break; 127 case IN_CODE_IN_H4: 128 nameText.append(ch, start, length); 129 break; 130 case AWAITING_ELEMENT_DL: 131 break; 132 case IN_ELEMENT_DL_START: 133 break; 134 case IN_CONTEXT_DT: 135 case IN_CONTENT_MODEL_DT: 136 case IN_ATTRIBUTES_DT: 137 referenceText.append(ch, start, length); 138 break; 139 case CAPTURING_CONTEXT_DDS: 140 case CAPTURING_CONTENT_MODEL_DDS: 141 case CAPTURING_ATTRIBUTES_DDS: 142 fragmentBuilder.characters(ch, start, length); 143 break; 144 } 145 } 146 147 public void endDocument() throws SAXException { 148 switch(state) { 149 case AWAITING_HEADING: 150 // XXX finish 151 break; 152 case IN_H4: 153 case IN_CODE_IN_H4: 154 case AWAITING_ELEMENT_DL: 155 case IN_ELEMENT_DL_START: 156 case IN_CONTEXT_DT: 157 case IN_CONTENT_MODEL_DT: 158 case IN_ATTRIBUTES_DT: 159 case CAPTURING_CONTEXT_DDS: 160 case CAPTURING_CONTENT_MODEL_DDS: 161 case CAPTURING_ATTRIBUTES_DDS: 162 throw new SAXException( 163 "Malformed spec: Wrong state for document end."); 164 } 165 } 166 167 public void endElement(String uri, String localName, String qName) throws SAXException { 168 switch(state) { 169 case AWAITING_HEADING: 170 break; 171 case IN_H4: 172 if ("h4" == localName && NS == uri) { 173 Matcher m = ELEMENT.matcher(referenceText); 174 if (m.matches()) { 175 String ln = nameText.toString().intern(); 176 if ("" == ln) { 177 throw new SAXException( 178 "Malformed spec: no element currentName."); 179 } 180 if (currentId == null) { 181 throw new SAXException( 182 "Malformed spec: no element id."); 183 } 184 currentName = new Name(NS, ln); 185 urisByElement.put(currentName, SPEC_URI + "#" + currentId); 186 state = State.AWAITING_ELEMENT_DL; 187 } else { 188 currentId = null; 189 nameText.setLength(0); 190 state = State.AWAITING_HEADING; 191 } 192 } 193 break; 194 case IN_CODE_IN_H4: 195 if ("code" == localName && NS == uri) { 196 state = State.IN_H4; 197 } 198 break; 199 case AWAITING_ELEMENT_DL: 200 break; 201 case IN_ELEMENT_DL_START: 202 throw new SAXException( 203 "Malformed spec: no children in element dl."); 204 case IN_CONTEXT_DT: 205 if ("dt" == localName && NS == uri) { 206 Matcher m = CONTEXT.matcher(referenceText); 207 if (m.matches()) { 208 state = State.CAPTURING_CONTEXT_DDS; 209 captureDepth = 0; 210 fragmentBuilder = new TreeBuilder(true, true); 211 } else { 212 throw new SAXException( 213 "Malformed spec: Expected dt to be context dt but it was not."); 214 } 215 } 216 break; 217 case IN_CONTENT_MODEL_DT: 218 if ("dt" == localName && NS == uri) { 219 Matcher m = CONTENT_MODEL.matcher(referenceText); 220 if (m.matches()) { 221 state = State.CAPTURING_CONTENT_MODEL_DDS; 222 captureDepth = 0; 223 fragmentBuilder = new TreeBuilder(true, true); 224 } else { 225 throw new SAXException( 226 "Malformed spec: Expected dt to be context dt but it was not."); 227 } 228 } 229 break; 230 case IN_ATTRIBUTES_DT: 231 if ("dt" == localName && NS == uri) { 232 Matcher m = ATTRIBUTES.matcher(referenceText); 233 if (m.matches()) { 234 state = State.CAPTURING_ATTRIBUTES_DDS; 235 captureDepth = 0; 236 fragmentBuilder = new TreeBuilder(true, true); 237 } else { 238 throw new SAXException( 239 "Malformed spec: Expected dt to be context dt but it was not."); 240 } 241 } 242 break; 243 case CAPTURING_CONTEXT_DDS: 244 case CAPTURING_CONTENT_MODEL_DDS: 245 case CAPTURING_ATTRIBUTES_DDS: 246 if (captureDepth == 0) { 247 throw new SAXException( 248 "Malformed spec: Did not see following dt when capturing dds."); 249 } 250 captureDepth--; 251 fragmentBuilder.endElement(uri, localName, qName); 252 break; 253 } 254 } 255 256 public void endPrefixMapping(String prefix) throws SAXException { 257 } 258 259 public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { 260 } 261 262 public void processingInstruction(String target, String data) throws SAXException { 263 } 264 265 public void setDocumentLocator(Locator locator) { 266 } 267 268 public void skippedEntity(String name) throws SAXException { 269 } 270 271 public void startDocument() throws SAXException { 272 // TODO Auto-generated method stub 273 274 } 275 276 public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { 277 switch(state) { 278 case AWAITING_HEADING: 279 if ("h4" == localName && NS == uri) { 280 referenceText.setLength(0); 281 currentId = null; 282 state = State.IN_H4; 283 } 284 break; 285 case IN_H4: 286 if ("code" == localName && NS == uri) { 287 nameText.setLength(0); 288 state = State.IN_CODE_IN_H4; 289 } else if ("dfn" == localName && NS == uri) { 290 currentId = atts.getValue("", "id"); 291 } 292 break; 293 case IN_CODE_IN_H4: 294 break; 295 case AWAITING_ELEMENT_DL: 296 if ("dl" == localName && NS == uri && "element".equals(atts.getValue("", "class"))) { 297 state = State.IN_ELEMENT_DL_START; 298 } 299 break; 300 case IN_ELEMENT_DL_START: 301 if ("dt" == localName && NS == uri) { 302 referenceText.setLength(0); 303 state = State.IN_CONTEXT_DT; 304 } else { 305 throw new SAXException( 306 "Malformed spec: Expected dt in dl."); 307 } 308 break; 309 case IN_CONTEXT_DT: 310 case IN_CONTENT_MODEL_DT: 311 case IN_ATTRIBUTES_DT: 312 throw new SAXException( 313 "Malformed spec: Not expecting children in dts."); 314 case CAPTURING_CONTEXT_DDS: 315 case CAPTURING_CONTENT_MODEL_DDS: 316 case CAPTURING_ATTRIBUTES_DDS: 317 if ("dt" == localName && NS == uri && captureDepth == 0) { 318 DocumentFragment fragment = (DocumentFragment) fragmentBuilder.getRoot(); 319 fragmentBuilder = null; 320 referenceText.setLength(0); 321 if (state == State.CAPTURING_CONTEXT_DDS) { 322 contextsByElement.put(currentName, fragment); 323 state = State.IN_CONTENT_MODEL_DT; 324 } else if (state == State.CAPTURING_CONTENT_MODEL_DDS) { 325 contentModelsByElement.put(currentName, fragment); 326 state = State.IN_ATTRIBUTES_DT; 327 } else { 328 attributesByElement.put(currentName, fragment); 329 state = State.AWAITING_HEADING; 330 } 331 } else { 332 captureDepth++; 333 String href = null; 334 if ("a" == localName && NS == uri && (href = atts.getValue("", "href")) != null) { 335 if (href.startsWith("#")) { 336 href = SPEC_URI + href; 337 } 338 AttributesImpl attributesImpl = new AttributesImpl(); 339 attributesImpl.addAttribute("href", href); 340 fragmentBuilder.startElement(uri, localName, qName, attributesImpl); 341 } else { 342 fragmentBuilder.startElement(uri, localName, qName, EmptyAttributes.EMPTY_ATTRIBUTES); 343 } 344 } 345 break; 346 } 347 } 348 349 public void startPrefixMapping(String prefix, String uri) throws SAXException { 350 } 351 352 }