001 /* XmlParser.java -- 002 Copyright (C) 1999,2000,2001 Free Software Foundation, Inc. 003 Portions Copyright 2006 Henri Sivonen. 004 005 This file is part of GNU JAXP. 006 007 GNU JAXP is free software; you can redistribute it and/or modify 008 it under the terms of the GNU General Public License as published by 009 the Free Software Foundation; either version 2, or (at your option) 010 any later version. 011 012 GNU JAXP is distributed in the hope that it will be useful, but 013 WITHOUT ANY WARRANTY; without even the implied warranty of 014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 General Public License for more details. 016 017 You should have received a copy of the GNU General Public License 018 along with GNU JAXP; see the file COPYING. If not, write to the 019 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 020 02111-1307 USA. 021 022 Linking this library statically or dynamically with other modules is 023 making a combined work based on this library. Thus, the terms and 024 conditions of the GNU General Public License cover the whole 025 combination. 026 027 As a special exception, the copyright holders of this library give you 028 permission to link this library with independent modules to produce an 029 executable, regardless of the license terms of these independent 030 modules, and to copy and distribute the resulting executable under 031 terms of your choice, provided that you also meet, for each linked 032 independent module, the terms and conditions of the license of that 033 module. An independent module is a module which is not derived from 034 or based on this library. If you modify this library, you may extend 035 this exception to your version of the library, but you are not 036 obligated to do so. If you do not wish to do so, delete this 037 exception statement from your version. 
038 039 Partly derived from code which carried the following notice: 040 041 Copyright (c) 1997, 1998 by Microstar Software Ltd. 042 043 AElfred is free for both commercial and non-commercial use and 044 redistribution, provided that Microstar's copyright and disclaimer are 045 retained intact. You are free to modify AElfred for your own use and 046 to redistribute AElfred with your modifications, provided that the 047 modifications are clearly documented. 048 049 This program is distributed in the hope that it will be useful, but 050 WITHOUT ANY WARRANTY; without even the implied warranty of 051 merchantability or fitness for a particular purpose. Please use it AT 052 YOUR OWN RISK. 053 */ 054 055 package nu.validator.gnu.xml.aelfred2; 056 057 import java.io.BufferedInputStream; 058 import java.io.EOFException; 059 import java.io.IOException; 060 import java.io.InputStream; 061 import java.io.InputStreamReader; 062 import java.io.Reader; 063 import java.nio.charset.CharacterCodingException; 064 import java.nio.charset.Charset; 065 import java.nio.charset.CharsetDecoder; 066 import java.nio.charset.CodingErrorAction; 067 import java.nio.charset.IllegalCharsetNameException; 068 import java.nio.charset.UnsupportedCharsetException; 069 import java.util.HashMap; 070 import java.util.Iterator; 071 import java.util.LinkedList; 072 073 import nu.validator.htmlparser.impl.CharacterHandler; 074 import nu.validator.htmlparser.impl.NormalizationChecker; 075 import nu.validator.io.EncodingInfo; 076 077 import org.xml.sax.InputSource; 078 import org.xml.sax.SAXException; 079 080 // Organized imports -- 2005-08-20 hsivonen 081 082 /** 083 * Parse XML documents and return parse events through call-backs. Use the 084 * <code>SAXDriver</code> class as your entry point, as all internal parser 085 * interfaces are subject to change. 
086 * 087 * @author Written by David Megginson <dmeggins@microstar.com> (version 088 * 1.2a with bugfixes) 089 * @author Updated by David Brownell <dbrownell@users.sourceforge.net> 090 * @author Modified by Henri Sivonen <hsivonen@iki.fi> 091 * @see SAXDriver 092 */ 093 final class XmlParser { 094 095 // avoid slow per-character readCh() 096 private final static boolean USE_CHEATS = false; 097 098 // ////////////////////////////////////////////////////////////////////// 099 // Constants. 100 // ////////////////////////////////////////////////////////////////////// 101 102 private static final int SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00; 103 104 // 105 // Constants for element content type. 106 // 107 108 /** 109 * Constant: an element has not been declared. 110 * 111 * @see #getElementContentType 112 */ 113 public final static int CONTENT_UNDECLARED = 0; 114 115 /** 116 * Constant: the element has a content model of ANY. 117 * 118 * @see #getElementContentType 119 */ 120 public final static int CONTENT_ANY = 1; 121 122 /** 123 * Constant: the element has declared content of EMPTY. 124 * 125 * @see #getElementContentType 126 */ 127 public final static int CONTENT_EMPTY = 2; 128 129 /** 130 * Constant: the element has mixed content. 131 * 132 * @see #getElementContentType 133 */ 134 public final static int CONTENT_MIXED = 3; 135 136 /** 137 * Constant: the element has element content. 138 * 139 * @see #getElementContentType 140 */ 141 public final static int CONTENT_ELEMENTS = 4; 142 143 // 144 // Constants for the entity type. 145 // 146 147 /** 148 * Constant: the entity has not been declared. 149 * 150 * @see #getEntityType 151 */ 152 public final static int ENTITY_UNDECLARED = 0; 153 154 /** 155 * Constant: the entity is internal. 156 * 157 * @see #getEntityType 158 */ 159 public final static int ENTITY_INTERNAL = 1; 160 161 /** 162 * Constant: the entity is external, non-parsable data. 
163 * 164 * @see #getEntityType 165 */ 166 public final static int ENTITY_NDATA = 2; 167 168 /** 169 * Constant: the entity is external XML data. 170 * 171 * @see #getEntityType 172 */ 173 public final static int ENTITY_TEXT = 3; 174 175 // 176 // Attribute type constants are interned literal strings. 177 // 178 179 // 180 // Constants for attribute default value. 181 // 182 183 /** 184 * Constant: the attribute is not declared. 185 * 186 * @see #getAttributeDefaultValueType 187 */ 188 public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30; 189 190 /** 191 * Constant: the attribute has a literal default value specified. 192 * 193 * @see #getAttributeDefaultValueType 194 * @see #getAttributeDefaultValue 195 */ 196 public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31; 197 198 /** 199 * Constant: the attribute was declared #IMPLIED. 200 * 201 * @see #getAttributeDefaultValueType 202 */ 203 public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32; 204 205 /** 206 * Constant: the attribute was declared #REQUIRED. 207 * 208 * @see #getAttributeDefaultValueType 209 */ 210 public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33; 211 212 /** 213 * Constant: the attribute was declared #FIXED. 214 * 215 * @see #getAttributeDefaultValueType 216 * @see #getAttributeDefaultValue 217 */ 218 public final static int ATTRIBUTE_DEFAULT_FIXED = 34; 219 220 // 221 // Constants for input. 222 // 223 private final static int INPUT_NONE = 0; 224 225 private final static int INPUT_INTERNAL = 1; 226 227 private final static int INPUT_READER = 5; 228 229 // 230 // Flags for reading literals. 
231 // 232 // expand general entity refs (attribute values in dtd and content) 233 private final static int LIT_ENTITY_REF = 2; 234 235 // normalize this value (space chars) (attributes, public ids) 236 private final static int LIT_NORMALIZE = 4; 237 238 // literal is an attribute value 239 private final static int LIT_ATTRIBUTE = 8; 240 241 // don't expand parameter entities 242 private final static int LIT_DISABLE_PE = 16; 243 244 // don't expand [or parse] character refs 245 private final static int LIT_DISABLE_CREF = 32; 246 247 // don't parse general entity refs 248 private final static int LIT_DISABLE_EREF = 64; 249 250 // literal is a public ID value 251 private final static int LIT_PUBID = 256; 252 253 // 254 // Flags affecting PE handling in DTDs (if expandPE is true). 255 // PEs expand with space padding, except inside literals. 256 // 257 private final static int CONTEXT_NORMAL = 0; 258 259 private final static int CONTEXT_LITERAL = 1; 260 261 // Emit warnings for relative URIs with no base URI. 262 static boolean uriWarnings; 263 static { 264 String key = "gnu.xml.aelfred2.XmlParser.uriWarnings"; 265 try { 266 uriWarnings = "true".equals(System.getProperty(key)); 267 } catch (SecurityException e) { 268 uriWarnings = false; 269 } 270 } 271 272 // 273 // The current XML handler interface. 274 // 275 private SAXDriver handler; 276 277 // 278 // I/O information. 
279 // 280 private Reader reader; // current reader 281 282 private InputStream is; // current input stream 283 284 private int line; // current line number 285 286 private int linePrev; // the line of the previous character -- hsivonen 287 // 2007-09-28 288 289 private int column; // current column number 290 291 private int columnPrev; // the column of the previous character -- hsivonen 292 // 2007-09-28 293 294 private boolean nextCharOnNewLine; // indicates whether the next character 295 // is on the next line -- hsivonen 296 // 2007-09-28 297 298 private int sourceType; // type of input source 299 300 private LinkedList<Input> inputStack; // stack of input soruces 301 302 private String characterEncoding; // current character encoding 303 304 private int currentByteCount; // bytes read from current source 305 306 private InputSource scratch; // temporary 307 308 // 309 // Buffers for decoded but unparsed character input. 310 // 311 private char[] readBuffer; 312 313 private int readBufferPos; 314 315 private int readBufferLength; 316 317 private int readBufferOverflow; // overflow from last data chunk. 318 319 // 320 // Buffer for undecoded raw byte input. 321 // 322 private final static int READ_BUFFER_MAX = 16384; 323 324 private byte[] rawReadBuffer; 325 326 // 327 // Buffer for attribute values, char refs, DTD stuff. 328 // 329 private static int DATA_BUFFER_INITIAL = 4096; 330 331 private char[] dataBuffer; 332 333 private int dataBufferPos; 334 335 // 336 // Buffer for parsed names. 337 // 338 private static int NAME_BUFFER_INITIAL = 1024; 339 340 private char[] nameBuffer; 341 342 private int nameBufferPos; 343 344 // 345 // Save any standalone flag 346 // 347 private boolean docIsStandalone; 348 349 // 350 // Hashtables for DTD information on elements, entities, and notations. 
351 // Populated until we start ignoring decls (because of skipping a PE) 352 // 353 private HashMap<String, ElementDecl> elementInfo; 354 355 private HashMap<String, EntityInfo> entityInfo; 356 357 private HashMap<String, String> notationInfo; 358 359 private boolean skippedPE; 360 361 // 362 // Element type currently in force. 363 // 364 private String currentElement; 365 366 private int currentElementContent; 367 368 // 369 // Stack of entity names, to detect recursion. 370 // 371 private LinkedList<String> entityStack; 372 373 // 374 // PE expansion is enabled in most chunks of the DTD, not all. 375 // When it's enabled, literals are treated differently. 376 // 377 private boolean inLiteral; 378 379 private boolean expandPE; 380 381 private boolean peIsError; 382 383 // 384 // can't report entity expansion inside two constructs: 385 // - attribute expansions (internal entities only) 386 // - markup declarations (parameter entities only) 387 // 388 private boolean doReport; 389 390 // 391 // Symbol table, for caching interned names. 392 // 393 // These show up wherever XML names or nmtokens are used: naming elements, 394 // attributes, PIs, notations, entities, and enumerated attribute values. 395 // 396 // NOTE: This hashtable doesn't grow. The default size is intended to be 397 // rather large for most documents. Example: one snapshot of the DocBook 398 // XML 4.1 DTD used only about 350 such names. As a rule, only pathological 399 // documents (ones that don't reuse names) should ever see much collision. 400 // 401 // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing. 402 // "2039" keeps the hash table size at about two memory pages on typical 403 // 32 bit hardware. 404 // 405 private final static int SYMBOL_TABLE_LENGTH = 2039; 406 407 private Object[][] symbolTable; 408 409 // 410 // Hash table of attributes found in current start tag. 
411 // 412 private String[] tagAttributes; 413 414 private int tagAttributePos; 415 416 // 417 // Utility flag: have we noticed a CR while reading the last 418 // data chunk? If so, we will have to go back and normalise 419 // CR or CR/LF line ends. 420 // 421 private boolean sawCR; 422 423 // 424 // Utility flag: are we in CDATA? If so, whitespace isn't ignorable. 425 // 426 private boolean inCDATA; 427 428 // 429 // Xml version. 430 // 431 private static final int XML_10 = 0; 432 433 private static final int XML_11 = 1; 434 435 private int xmlVersion = XML_10; 436 437 // 438 // Normalization checking 439 // 440 441 private NormalizationChecker normalizationChecker; 442 443 private CharacterHandler characterHandler; 444 445 // //////////////////////////////////////////////////////////////////// 446 // Constructors. 447 // ////////////////////////////////////////////////////////////////////// 448 449 /** 450 * Construct a new parser with no associated handler. 451 * 452 * @see #setHandler 453 * @see #parse 454 */ 455 // package private 456 XmlParser() { 457 } 458 459 /** 460 * Set the handler that will receive parsing events. 461 * 462 * @param handler 463 * The handler to receive callback events. 464 * @see #parse 465 */ 466 // package private 467 void setHandler(SAXDriver handler) { 468 this.handler = handler; 469 } 470 471 /** 472 * Parse an XML document from the character stream, byte stream, or URI that 473 * you provide (in that order of preference). Any URI that you supply will 474 * become the base URI for resolving relative URI, and may be used to 475 * acquire a reader or byte stream. 476 * 477 * <p> 478 * Only one thread at a time may use this parser; since it is private to 479 * this package, post-parse cleanup is done by the caller, which MUST NOT 480 * REUSE the parser (just null it). 481 * 482 * @param systemId 483 * Absolute URI of the document; should never be null, but may be 484 * so iff a reader <em>or</em> a stream is provided. 
485 * @param publicId 486 * The public identifier of the document, or null. 487 * @param reader 488 * A character stream; must be null if stream isn't. 489 * @param stream 490 * A byte input stream; must be null if reader isn't. 491 * @param characterEncoding 492 * The suggested encoding, or null if unknown. 493 * @exception java.lang.Exception 494 * Basically SAXException or IOException 495 */ 496 // package private 497 void doParse(String systemId, String publicId, Reader reader, 498 InputStream stream, String encoding) throws Exception { 499 if (handler == null) { 500 throw new IllegalStateException("no callback handler"); 501 } 502 503 alreadyWarnedAboutPrivateUseCharacters = false; 504 initializeVariables(); 505 506 // predeclare the built-in entities here (replacement texts) 507 // we don't need to intern(), since we're guaranteed literals 508 // are always (globally) interned. 509 setInternalEntity("amp", "&"); 510 setInternalEntity("lt", "<"); 511 setInternalEntity("gt", ">"); 512 setInternalEntity("apos", "'"); 513 setInternalEntity("quot", """); 514 515 try { 516 // pushURL first to ensure locator is correct in startDocument 517 // ... it might report an IO or encoding exception. 518 handler.startDocument(); 519 pushURL(false, "[document]", 520 // default baseURI: null 521 new ExternalIdentifiers(publicId, systemId, null), reader, 522 stream, encoding, false); 523 524 parseDocument(); 525 } catch (EOFException e) { 526 // empty input 527 fatal("empty document, with no root element."); 528 } finally { 529 if (reader != null) { 530 try { 531 reader.close(); 532 } catch (IOException e) { 533 /* ignore */ 534 } 535 } 536 if (stream != null) { 537 try { 538 stream.close(); 539 } catch (IOException e) { 540 /* ignore */ 541 } 542 } 543 if (is != null) { 544 try { 545 is.close(); 546 } catch (IOException e) { 547 /* ignore */ 548 } 549 } 550 } 551 } 552 553 // //////////////////////////////////////////////////////////////////// 554 // Error reporting. 
555 // //////////////////////////////////////////////////////////////////// 556 557 /** 558 * Report an error. 559 * 560 * @param message 561 * The error message. 562 * @param textFound 563 * The text that caused the error (or null). 564 * @see SAXDriver#error 565 * @see #line 566 */ 567 private void fatal(String message, String textFound, String textExpected) 568 throws SAXException { 569 // smart quotes -- 2005-08-20 hsivonen 570 if (textFound != null) { 571 message = message + " (found \u201C" + textFound + "\u201D)"; 572 } 573 if (textExpected != null) { 574 message = message + " (expected \u201C" + textExpected + "\u201D)"; 575 } 576 handler.fatal(message); 577 578 // "can't happen" 579 throw new SAXException(message); 580 } 581 582 /** 583 * Report a serious error. 584 * 585 * @param message 586 * The error message. 587 * @param textFound 588 * The text that caused the error (or null). 589 */ 590 private void fatal(String message, char textFound, String textExpected) 591 throws SAXException { 592 fatal(message, new Character(textFound).toString(), textExpected); 593 } 594 595 /** 596 * Report typical case fatal errors. 597 */ 598 private void fatal(String message) throws SAXException { 599 handler.fatal(message); 600 } 601 602 /** 603 * Report non-fatal errors. 604 */ 605 private void err(String message) throws SAXException { 606 handler.verror(message); 607 } 608 609 // //////////////////////////////////////////////////////////////////// 610 // Major syntactic productions. 611 // //////////////////////////////////////////////////////////////////// 612 613 /** 614 * Parse an XML document. 615 * 616 * <pre> 617 * [1] document ::= prolog element Misc* 618 * </pre> 619 * 620 * <p> 621 * This is the top-level parsing function for a single XML document. As a 622 * minimum, a well-formed document must have a document element, and a valid 623 * document must have a prolog (one with doctype) as well. 
624 */ 625 private void parseDocument() throws Exception { 626 try { // added by MHK 627 boolean sawDTD = parseProlog(); 628 require('<'); 629 parseElement(!sawDTD); 630 } catch (EOFException ee) { // added by MHK 631 fatal("premature end of file", "[EOF]", null); 632 } 633 634 try { 635 parseMisc(); // skip all white, PIs, and comments 636 char c = readCh(); // if this doesn't throw an exception... 637 fatal("unexpected characters after document end", c, null); 638 } catch (EOFException e) { 639 if (characterHandler != null) { 640 characterHandler.end(); 641 } 642 if (normalizationChecker != null) { 643 normalizationChecker.end(); 644 } 645 return; 646 } 647 } 648 649 static final char[] startDelimComment = { '<', '!', '-', '-' }; 650 651 static final char[] endDelimComment = { '-', '-' }; 652 653 /** 654 * Skip a comment. 655 * 656 * <pre> 657 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->" 658 * </pre> 659 * 660 * <p> 661 * (The <code><!--</code> has already been read.) 662 */ 663 private void parseComment() throws Exception { 664 boolean saved = expandPE; 665 666 expandPE = false; 667 parseUntil(endDelimComment); 668 require('>'); 669 expandPE = saved; 670 handler.comment(dataBuffer, 0, dataBufferPos); 671 dataBufferPos = 0; 672 } 673 674 static final char[] startDelimPI = { '<', '?' }; 675 676 static final char[] endDelimPI = { '?', '>' }; 677 678 /** 679 * Parse a processing instruction and do a call-back. 680 * 681 * <pre> 682 * [16] PI ::= '<?' PITarget 683 * (S (Char* - (Char* '?>' Char*)))? 684 * '?>' 685 * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') ) 686 * </pre> 687 * 688 * <p> 689 * (The <code><?</code> has already been read.) 
690 */ 691 private void parsePI() throws SAXException, IOException { 692 String name; 693 boolean saved = expandPE; 694 695 expandPE = false; 696 name = readNmtoken(true); 697 // NE08 698 if (name.indexOf(':') >= 0) { 699 fatal("Illegal character(':') in processing instruction name ", 700 name, null); 701 } 702 if ("xml".equalsIgnoreCase(name)) { 703 fatal("Illegal processing instruction target", name, null); 704 } 705 if (!tryRead(endDelimPI)) { 706 requireWhitespace(); 707 parseUntil(endDelimPI); 708 } 709 expandPE = saved; 710 handler.processingInstruction(name, dataBufferToString()); 711 } 712 713 static final char[] endDelimCDATA = { ']', ']', '>' }; 714 715 private boolean isDirtyCurrentElement; 716 717 private boolean alreadyWarnedAboutPrivateUseCharacters; 718 719 private char prev; 720 721 /** 722 * Parse a CDATA section. 723 * 724 * <pre> 725 * [18] CDSect ::= CDStart CData CDEnd 726 * [19] CDStart ::= '<![CDATA[' 727 * [20] CData ::= (Char* - (Char* ']]>' Char*)) 728 * [21] CDEnd ::= ']]>' 729 * </pre> 730 * 731 * <p> 732 * (The '<![CDATA[' has already been read.) 733 */ 734 private void parseCDSect() throws Exception { 735 parseUntil(endDelimCDATA); 736 dataBufferFlush(); 737 } 738 739 /** 740 * Parse the prolog of an XML document. 741 * 742 * <pre> 743 * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)? 744 * </pre> 745 * 746 * <p> 747 * We do not look for the XML declaration here, because it was handled by 748 * pushURL (). 749 * 750 * @see pushURL 751 * @return true if a DTD was read. 752 */ 753 private boolean parseProlog() throws Exception { 754 parseMisc(); 755 756 if (tryRead("<!DOCTYPE")) { 757 parseDoctypedecl(); 758 parseMisc(); 759 return true; 760 } 761 return false; 762 } 763 764 private void checkLegalVersion(String version) throws SAXException { 765 int len = version.length(); 766 for (int i = 0; i < len; i++) { 767 char c = version.charAt(i); 768 if ('0' <= c && c <= '9') { 769 continue; 770 } 771 if (c == '_' || c == '.' 
|| c == ':' || c == '-') { 772 continue; 773 } 774 if ('a' <= c && c <= 'z') { 775 continue; 776 } 777 if ('A' <= c && c <= 'Z') { 778 continue; 779 } 780 fatal("illegal character in version", version, "1.0"); 781 } 782 } 783 784 /** 785 * Parse the XML declaration. 786 * 787 * <pre> 788 * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' 789 * [24] VersionInfo ::= S 'version' Eq 790 * ("'" VersionNum "'" | '"' VersionNum '"' ) 791 * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')* 792 * [32] SDDecl ::= S 'standalone' Eq 793 * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' ) 794 * [80] EncodingDecl ::= S 'encoding' Eq 795 * ( "'" EncName "'" | "'" EncName "'" ) 796 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* 797 * </pre> 798 * 799 * <p> 800 * (The <code><?xml</code> and whitespace have already been read.) 801 * 802 * @return the encoding in the declaration, uppercased; or null 803 * @see #parseTextDecl 804 * @see #setupDecoding 805 */ 806 private String parseXMLDecl(String encoding) throws SAXException, 807 IOException { 808 String version; 809 String encodingName = null; 810 String standalone = null; 811 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 812 813 // Read the version. 814 require("version"); 815 parseEq(); 816 checkLegalVersion(version = readLiteral(flags)); 817 if (!version.equals("1.0")) { 818 if (version.equals("1.1")) { 819 fatal("XML 1.1 not supported."); // 2006-04-24 hsivonen 820 } else { 821 fatal("illegal XML version", version, "1.0"); // removed 1.1 822 // -- 2006-04-24 823 // hsivonen 824 } 825 } else { 826 xmlVersion = XML_10; 827 } 828 // Try reading an encoding declaration. 
829 boolean white = tryWhitespace(); 830 831 if (tryRead("encoding")) { 832 if (!white) { 833 fatal("whitespace required before 'encoding='"); 834 } 835 parseEq(); 836 encodingName = readLiteral(flags); 837 checkEncodingLiteral(encodingName); // 2006-04-28 hsivonen 838 if (reader == null) { 839 draconianInputStreamReader(encodingName, is, true); 840 } else { 841 checkEncodingMatch(encoding, encodingName); 842 } 843 } 844 845 // Try reading a standalone declaration 846 if (encodingName != null) { 847 white = tryWhitespace(); 848 } else { 849 if (encoding == null) { 850 draconianInputStreamReader("UTF-8", is, false); // 2006-04-24 851 // hsivonen 852 } 853 warnAboutLackOfEncodingDecl(encoding); 854 } 855 if (tryRead("standalone")) { 856 if (!white) { 857 fatal("whitespace required before 'standalone='"); 858 } 859 parseEq(); 860 standalone = readLiteral(flags); 861 if ("yes".equals(standalone)) { 862 docIsStandalone = true; 863 } else if (!"no".equals(standalone)) { 864 fatal("standalone flag must be 'yes' or 'no'"); 865 } 866 } 867 868 skipWhitespace(); 869 require("?>"); 870 871 return encodingName; 872 } 873 874 // hsivonen 2006-04-28 875 private void checkEncodingLiteral(String encodingName) throws SAXException { 876 if (encodingName == null) { 877 return; 878 } 879 if (encodingName.length() == 0) { 880 fatal("The empty string does not a legal encoding name."); 881 } 882 char c = encodingName.charAt(0); 883 if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) { 884 fatal("The encoding name must start with an ASCII letter."); 885 } 886 for (int i = 1; i < encodingName.length(); i++) { 887 c = encodingName.charAt(i); 888 if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') 889 || (c >= '0' && c <= '9') || (c == '.') || (c == '_') || (c == '-'))) { 890 fatal("Illegal character in encoding name: U+" 891 + Integer.toHexString(c) + "."); 892 } 893 } 894 } 895 896 /** 897 * Parse a text declaration. 898 * 899 * <pre> 900 * [79] TextDecl ::= '<?xml' VersionInfo? 
EncodingDecl S? '?>' 901 * [80] EncodingDecl ::= S 'encoding' Eq 902 * ( '"' EncName '"' | "'" EncName "'" ) 903 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* 904 * </pre> 905 * 906 * <p> 907 * (The <code><?xml</code>' and whitespace have already been read.) 908 * 909 * @return the encoding in the declaration, uppercased; or null 910 * @see #parseXMLDecl 911 * @see #setupDecoding 912 */ 913 private String parseTextDecl(String encoding) throws SAXException, 914 IOException { 915 String encodingName = null; 916 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 917 918 // Read an optional version. 919 if (tryRead("version")) { 920 String version; 921 parseEq(); 922 checkLegalVersion(version = readLiteral(flags)); 923 if (!version.equals("1.0")) { 924 if (version.equals("1.1")) { 925 fatal("XML 1.1 not supported."); // 2006-04-24 hsivonen 926 } else { 927 fatal("illegal XML version", version, "1.0"); // removed 928 // 1.1 -- 929 // 2006-04-24 930 // hsivonen 931 } 932 } 933 requireWhitespace(); 934 } 935 936 // Read the encoding. 937 require("encoding"); 938 parseEq(); 939 encodingName = readLiteral(flags); 940 checkEncodingLiteral(encodingName); // 2006-04-28 hsivonen 941 if (reader == null) { 942 draconianInputStreamReader(encodingName, is, true); 943 } else { 944 checkEncodingMatch(encoding, encodingName); 945 } 946 skipWhitespace(); 947 require("?>"); 948 949 return encodingName; 950 } 951 952 private void checkEncodingMatch(String used, String detected) 953 throws SAXException { 954 // method added -- 2006-02-03 hsivonen 955 if (used == null) { 956 if (!characterEncoding.equalsIgnoreCase(detected)) { 957 fatal( 958 "Declared character encoding was not the one sniffed from the BOM.", 959 detected, characterEncoding); 960 } 961 } else { 962 if (!"".equals(used) && !used.equalsIgnoreCase(detected)) { 963 handler.warn("External encoding information specified " 964 + used 965 + ", but XML declaration specified " 966 + detected 967 + ". 
Allowing external to override per RFC 3023. The well-formedness status of this document may change when decoupled from the external character encoding information."); 968 } 969 } 970 } 971 972 private void draconianInputStreamReader(String encoding, 973 InputStream stream, boolean requireAsciiSuperset) 974 throws SAXException, IOException { 975 draconianInputStreamReader(encoding, stream, requireAsciiSuperset, 976 encoding); 977 } 978 979 private void draconianInputStreamReader(String encoding, 980 InputStream stream, boolean requireAsciiSuperset, String actualName) 981 throws SAXException, IOException { 982 // method added -- 2005-08-21 hsivonen 983 sourceType = INPUT_READER; 984 characterEncoding = actualName.toUpperCase(); 985 encoding = encoding.toUpperCase(); 986 try { 987 Charset cs = Charset.forName(encoding); 988 String canonName = cs.name(); 989 if (requireAsciiSuperset) { 990 if (!EncodingInfo.isAsciiSuperset(canonName)) { 991 fatal("The encoding \u201C" 992 + encoding 993 + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration."); 994 } 995 } 996 if (canonName.startsWith("X-") || canonName.startsWith("x-") 997 || canonName.startsWith("Mac")) { 998 if (encoding.startsWith("X-")) { 999 err(encoding 1000 + " is not an IANA-registered encoding. (Charmod C022)"); 1001 } else { 1002 err(encoding 1003 + "is not an IANA-registered encoding and did\u2019t start with \u201CX-\u201D. (Charmod C023)"); 1004 } 1005 } else if (!canonName.equalsIgnoreCase(encoding)) { 1006 err(encoding 1007 + " is not the preferred name of the character encoding in use. The preferred name is " 1008 + canonName + ". 
(Charmod C024)"); 1009 } 1010 if (!("UTF-8".equals(encoding) || "UTF-16".equals(encoding) 1011 || "UTF-16BE".equals(encoding) 1012 || "UTF-16LE".equals(encoding) 1013 || "ISO-8859-1".equals(encoding) || "US-ASCII".equals(encoding))) { 1014 handler.warn("XML processors are required to support the UTF-8 and UTF-16 character encodings. The encoding was " 1015 + actualName 1016 + " instead, which is an incompatibility risk."); 1017 } 1018 CharsetDecoder decoder = cs.newDecoder(); 1019 decoder.onMalformedInput(CodingErrorAction.REPORT); 1020 decoder.onUnmappableCharacter(CodingErrorAction.REPORT); 1021 this.reader = new InputStreamReader(stream, decoder); 1022 } catch (IllegalCharsetNameException e) { 1023 fatal("Illegal character encoding name: " + encoding); 1024 } catch (UnsupportedCharsetException e) { 1025 handler.fatal("Unsupported character encoding: " + encoding); 1026 } 1027 } 1028 1029 /** 1030 * Parse miscellaneous markup outside the document element and DOCTYPE 1031 * declaration. 1032 * 1033 * <pre> 1034 * [27] Misc ::= Comment | PI | S 1035 * </pre> 1036 */ 1037 private void parseMisc() throws Exception { 1038 while (true) { 1039 skipWhitespace(); 1040 if (tryRead(startDelimPI)) { 1041 parsePI(); 1042 } else if (tryRead(startDelimComment)) { 1043 parseComment(); 1044 } else { 1045 return; 1046 } 1047 } 1048 } 1049 1050 /** 1051 * Parse a document type declaration. 1052 * 1053 * <pre> 1054 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 1055 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' 1056 * </pre> 1057 * 1058 * <p> 1059 * (The <code><!DOCTYPE</code> has already been read.) 1060 */ 1061 private void parseDoctypedecl() throws Exception { 1062 String rootName; 1063 ExternalIdentifiers ids; 1064 1065 // Read the document type name. 
1066 requireWhitespace(); 1067 rootName = readNmtoken(true); 1068 1069 // Read the External subset's IDs 1070 skipWhitespace(); 1071 ids = readExternalIds(false, true); 1072 1073 // report (a) declaration of name, (b) lexical info (ids) 1074 handler.doctypeDecl(rootName, ids.publicId, ids.systemId); 1075 1076 // Internal subset is parsed first, if present 1077 skipWhitespace(); 1078 if (tryRead('[')) { 1079 1080 // loop until the subset ends 1081 while (true) { 1082 doReport = expandPE = true; 1083 skipWhitespace(); 1084 doReport = expandPE = false; 1085 if (tryRead(']')) { 1086 break; // end of subset 1087 } else { 1088 // WFC, PEs in internal subset (only between decls) 1089 peIsError = expandPE = true; 1090 parseMarkupdecl(); 1091 peIsError = expandPE = false; 1092 } 1093 } 1094 } 1095 skipWhitespace(); 1096 require('>'); 1097 1098 // Read the external subset, if any 1099 InputSource subset; 1100 1101 if (ids.systemId == null) { 1102 subset = handler.getExternalSubset(rootName, handler.getSystemId()); 1103 } else { 1104 subset = null; 1105 } 1106 if (ids.systemId != null || subset != null) { 1107 pushString(null, ">"); 1108 1109 // NOTE: [dtd] is so we say what SAX2 expects, 1110 // though it's misleading (subset, not entire dtd) 1111 if (ids.systemId != null) { 1112 pushURL(true, "[dtd]", ids, null, null, null, true); 1113 } else { 1114 handler.warn("modifying document by adding external subset"); 1115 pushURL(true, "[dtd]", new ExternalIdentifiers( 1116 subset.getPublicId(), subset.getSystemId(), null), 1117 subset.getCharacterStream(), subset.getByteStream(), 1118 subset.getEncoding(), false); 1119 } 1120 1121 // Loop until we end up back at '>' 1122 while (true) { 1123 doReport = expandPE = true; 1124 skipWhitespace(); 1125 doReport = expandPE = false; 1126 if (tryRead('>')) { 1127 break; 1128 } else { 1129 expandPE = true; 1130 parseMarkupdecl(); 1131 expandPE = false; 1132 } 1133 } 1134 1135 // the ">" string isn't popped yet 1136 if (inputStack.size() != 
1) { 1137 fatal("external subset has unmatched '>'"); 1138 } 1139 } 1140 1141 // done dtd 1142 handler.endDoctype(); 1143 expandPE = false; 1144 doReport = true; 1145 } 1146 1147 /** 1148 * Parse a markup declaration in the internal or external DTD subset. 1149 * 1150 * <pre> 1151 * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl 1152 * | NotationDecl | PI | Comment 1153 * [30] extSubsetDecl ::= (markupdecl | conditionalSect 1154 * | PEReference | S) * 1155 * </pre> 1156 * 1157 * <p> 1158 * Reading toplevel PE references is handled as a lexical issue by the 1159 * caller, as is whitespace. 1160 */ 1161 private void parseMarkupdecl() throws Exception { 1162 char[] saved = null; 1163 boolean savedPE = expandPE; 1164 1165 // prevent "<%foo;" and ensures saved entity is right 1166 require('<'); 1167 unread('<'); 1168 expandPE = false; 1169 1170 if (tryRead("<!ELEMENT")) { 1171 saved = readBuffer; 1172 expandPE = savedPE; 1173 parseElementDecl(); 1174 } else if (tryRead("<!ATTLIST")) { 1175 saved = readBuffer; 1176 expandPE = savedPE; 1177 parseAttlistDecl(); 1178 } else if (tryRead("<!ENTITY")) { 1179 saved = readBuffer; 1180 expandPE = savedPE; 1181 parseEntityDecl(); 1182 } else if (tryRead("<!NOTATION")) { 1183 saved = readBuffer; 1184 expandPE = savedPE; 1185 parseNotationDecl(); 1186 } else if (tryRead(startDelimPI)) { 1187 saved = readBuffer; 1188 expandPE = savedPE; 1189 parsePI(); 1190 } else if (tryRead(startDelimComment)) { 1191 saved = readBuffer; 1192 expandPE = savedPE; 1193 parseComment(); 1194 } else if (tryRead("<![")) { 1195 saved = readBuffer; 1196 expandPE = savedPE; 1197 if (inputStack.size() > 0) { 1198 parseConditionalSect(saved); 1199 } else { 1200 fatal("conditional sections illegal in internal subset"); 1201 } 1202 } else { 1203 fatal("expected markup declaration"); 1204 } 1205 1206 // VC: Proper Decl/PE Nesting 1207 if (readBuffer != saved) { 1208 handler.verror("Illegal Declaration/PE nesting"); 1209 } 1210 } 1211 1212 /** 1213 * 
Parse an element, with its tags. 1214 * 1215 * <pre> 1216 * [39] element ::= EmptyElementTag | STag content ETag 1217 * [40] STag ::= '<' Name (S Attribute)* S? '>' 1218 * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>' 1219 * </pre> 1220 * 1221 * <p> 1222 * (The '<' has already been read.) 1223 * <p> 1224 * NOTE: this method actually chains onto parseContent (), if necessary, and 1225 * parseContent () will take care of calling parseETag (). 1226 */ 1227 private void parseElement(boolean maybeGetSubset) throws Exception { 1228 String gi; 1229 char c; 1230 int oldElementContent = currentElementContent; 1231 String oldElement = currentElement; 1232 ElementDecl element; 1233 1234 // This is the (global) counter for the 1235 // array of specified attributes. 1236 tagAttributePos = 0; 1237 1238 // Read the element type name. 1239 gi = readNmtoken(true); 1240 1241 // If we saw no DTD, and this is the document root element, 1242 // let the application modify the input stream by providing one. 
1243 if (maybeGetSubset) { 1244 InputSource subset = handler.getExternalSubset(gi, 1245 handler.getSystemId()); 1246 if (subset != null) { 1247 String publicId = subset.getPublicId(); 1248 String systemId = subset.getSystemId(); 1249 1250 handler.warn("modifying document by adding DTD"); 1251 handler.doctypeDecl(gi, publicId, systemId); 1252 pushString(null, ">"); 1253 1254 // NOTE: [dtd] is so we say what SAX2 expects, 1255 // though it's misleading (subset, not entire dtd) 1256 pushURL(true, "[dtd]", new ExternalIdentifiers(publicId, 1257 systemId, null), subset.getCharacterStream(), 1258 subset.getByteStream(), subset.getEncoding(), false); 1259 1260 // Loop until we end up back at '>' 1261 while (true) { 1262 doReport = expandPE = true; 1263 skipWhitespace(); 1264 doReport = expandPE = false; 1265 if (tryRead('>')) { 1266 break; 1267 } else { 1268 expandPE = true; 1269 parseMarkupdecl(); 1270 expandPE = false; 1271 } 1272 } 1273 1274 // the ">" string isn't popped yet 1275 if (inputStack.size() != 1) { 1276 fatal("external subset has unmatched '>'"); 1277 } 1278 1279 handler.endDoctype(); 1280 } 1281 } 1282 1283 // Determine the current content type. 1284 currentElement = gi; 1285 element = elementInfo.get(gi); 1286 currentElementContent = getContentType(element, CONTENT_ANY); 1287 1288 // Read the attributes, if any. 1289 // After this loop, "c" is the closing delimiter. 1290 boolean white = tryWhitespace(); 1291 c = readCh(); 1292 while (c != '/' && c != '>') { 1293 unread(c); 1294 if (!white) { 1295 fatal("need whitespace between attributes"); 1296 } 1297 parseAttribute(gi); 1298 white = tryWhitespace(); 1299 c = readCh(); 1300 } 1301 1302 // Supply any defaulted attributes. 1303 Iterator<String> atts = declaredAttributes(element); 1304 if (atts != null) { 1305 String aname; 1306 loop: while (atts.hasNext()) { 1307 aname = atts.next(); 1308 // See if it was specified. 
1309 for (int i = 0; i < tagAttributePos; i++) { 1310 if (tagAttributes[i] == aname) { 1311 continue loop; 1312 } 1313 } 1314 // ... or has a default 1315 String value = getAttributeDefaultValue(gi, aname); 1316 1317 if (value == null) { 1318 continue; 1319 } 1320 handler.attribute(aname, value, false); 1321 } 1322 } 1323 1324 // Figure out if this is a start tag 1325 // or an empty element, and dispatch an 1326 // event accordingly. 1327 switch (c) { 1328 case '>': 1329 handler.startElement(gi); 1330 parseContent(); 1331 break; 1332 case '/': 1333 require('>'); 1334 handler.startElement(gi); 1335 handler.endElement(gi); 1336 break; 1337 } 1338 1339 // Restore the previous state. 1340 currentElement = oldElement; 1341 currentElementContent = oldElementContent; 1342 } 1343 1344 /** 1345 * Parse an attribute assignment. 1346 * 1347 * <pre> 1348 * [41] Attribute ::= Name Eq AttValue 1349 * </pre> 1350 * 1351 * @param name 1352 * The name of the attribute's element. 1353 * @see SAXDriver#attribute 1354 */ 1355 private void parseAttribute(String name) throws Exception { 1356 String aname; 1357 String type; 1358 String value; 1359 int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF; 1360 1361 // Read the attribute name. 1362 aname = readNmtoken(true); 1363 type = getAttributeType(name, aname); 1364 1365 // Parse '=' 1366 parseEq(); 1367 1368 // Read the value, normalizing whitespace 1369 // unless it is CDATA. 
1370 if (handler.stringInterning) { 1371 if (type == "CDATA" || type == null) { 1372 value = readLiteral(flags); 1373 } else { 1374 value = readLiteral(flags | LIT_NORMALIZE); 1375 } 1376 } else { 1377 if (type.equals("CDATA") || type == null) { 1378 value = readLiteral(flags); 1379 } else { 1380 value = readLiteral(flags | LIT_NORMALIZE); 1381 } 1382 } 1383 1384 // WFC: no duplicate attributes 1385 for (int i = 0; i < tagAttributePos; i++) { 1386 if (aname.equals(tagAttributes[i])) { 1387 fatal("duplicate attribute", aname, null); 1388 } 1389 } 1390 1391 // Inform the handler about the 1392 // attribute. 1393 handler.attribute(aname, value, true); 1394 dataBufferPos = 0; 1395 1396 // Note that the attribute has been 1397 // specified. 1398 if (tagAttributePos == tagAttributes.length) { 1399 String newAttrib[] = new String[tagAttributes.length * 2]; 1400 System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos); 1401 tagAttributes = newAttrib; 1402 } 1403 tagAttributes[tagAttributePos++] = aname; 1404 } 1405 1406 /** 1407 * Parse an equals sign surrounded by optional whitespace. 1408 * 1409 * <pre> 1410 * [25] Eq ::= S? '=' S? 1411 * </pre> 1412 */ 1413 private void parseEq() throws SAXException, IOException { 1414 skipWhitespace(); 1415 require('='); 1416 skipWhitespace(); 1417 } 1418 1419 /** 1420 * Parse an end tag. 1421 * 1422 * <pre> 1423 * [42] ETag ::= '</' Name S? '>' 1424 * </pre> 1425 * 1426 * <p> 1427 * NOTE: parseContent () chains to here, we already read the "</". 1428 */ 1429 private void parseETag() throws Exception { 1430 require(currentElement); 1431 skipWhitespace(); 1432 require('>'); 1433 handler.endElement(currentElement); 1434 // not re-reporting any SAXException re bogus end tags, 1435 // even though that diagnostic might be clearer ... 1436 } 1437 1438 /** 1439 * Parse the content of an element. 
1440 * 1441 * <pre> 1442 * [43] content ::= (element | CharData | Reference 1443 * | CDSect | PI | Comment)* 1444 * [67] Reference ::= EntityRef | CharRef 1445 * </pre> 1446 * 1447 * <p> 1448 * NOTE: consumes ETtag. 1449 */ 1450 private void parseContent() throws Exception { 1451 char c; 1452 1453 while (true) { 1454 // consume characters (or ignorable whitspace) until delimiter 1455 parseCharData(); 1456 1457 // Handle delimiters 1458 c = readCh(); 1459 switch (c) { 1460 case '&': // Found "&" 1461 c = readCh(); 1462 if (c == '#') { 1463 parseCharRef(); 1464 } else { 1465 unread(c); 1466 parseEntityRef(true); 1467 } 1468 isDirtyCurrentElement = true; 1469 break; 1470 1471 case '<': // Found "<" 1472 dataBufferFlush(); 1473 c = readCh(); 1474 switch (c) { 1475 case '!': // Found "<!" 1476 c = readCh(); 1477 switch (c) { 1478 case '-': // Found "<!-" 1479 require('-'); 1480 isDirtyCurrentElement = false; 1481 parseComment(); 1482 break; 1483 case '[': // Found "<![" 1484 isDirtyCurrentElement = false; 1485 require("CDATA["); 1486 handler.startCDATA(); 1487 inCDATA = true; 1488 parseCDSect(); 1489 inCDATA = false; 1490 handler.endCDATA(); 1491 break; 1492 default: 1493 fatal("expected comment or CDATA section", 1494 c, null); 1495 break; 1496 } 1497 break; 1498 1499 case '?': // Found "<?" 1500 isDirtyCurrentElement = false; 1501 parsePI(); 1502 break; 1503 1504 case '/': // Found "</" 1505 isDirtyCurrentElement = false; 1506 parseETag(); 1507 return; 1508 1509 default: // Found "<" followed by something else 1510 isDirtyCurrentElement = false; 1511 unread(c); 1512 parseElement(false); 1513 break; 1514 } 1515 } 1516 } 1517 } 1518 1519 /** 1520 * Parse an element type declaration. 1521 * 1522 * <pre> 1523 * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' 1524 * </pre> 1525 * 1526 * <p> 1527 * NOTE: the '<!ELEMENT' has already been read. 
1528 */ 1529 private void parseElementDecl() throws Exception { 1530 String name; 1531 1532 requireWhitespace(); 1533 // Read the element type name. 1534 name = readNmtoken(true); 1535 1536 requireWhitespace(); 1537 // Read the content model. 1538 parseContentspec(name); 1539 1540 skipWhitespace(); 1541 require('>'); 1542 } 1543 1544 /** 1545 * Content specification. 1546 * 1547 * <pre> 1548 * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements 1549 * </pre> 1550 */ 1551 private void parseContentspec(String name) throws Exception { 1552 // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ... 1553 if (tryRead("EMPTY")) { 1554 setElement(name, CONTENT_EMPTY, null, null); 1555 if (!skippedPE) { 1556 handler.getDeclHandler().elementDecl(name, "EMPTY"); 1557 } 1558 return; 1559 } else if (tryRead("ANY")) { 1560 setElement(name, CONTENT_ANY, null, null); 1561 if (!skippedPE) { 1562 handler.getDeclHandler().elementDecl(name, "ANY"); 1563 } 1564 return; 1565 } else { 1566 String model; 1567 char[] saved; 1568 1569 require('('); 1570 saved = readBuffer; 1571 dataBufferAppend('('); 1572 skipWhitespace(); 1573 if (tryRead("#PCDATA")) { 1574 dataBufferAppend("#PCDATA"); 1575 parseMixed(saved); 1576 model = dataBufferToString(); 1577 setElement(name, CONTENT_MIXED, model, null); 1578 } else { 1579 parseElements(saved); 1580 model = dataBufferToString(); 1581 setElement(name, CONTENT_ELEMENTS, model, null); 1582 } 1583 if (!skippedPE) { 1584 handler.getDeclHandler().elementDecl(name, model); 1585 } 1586 } 1587 } 1588 1589 /** 1590 * Parse an element-content model. 1591 * 1592 * <pre> 1593 * [47] elements ::= (choice | seq) ('?' | '*' | '+')? 1594 * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')' 1595 * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')' 1596 * </pre> 1597 * 1598 * <p> 1599 * NOTE: the opening '(' and S have already been read. 
1600 * 1601 * @param saved 1602 * Buffer for entity that should have the terminal ')' 1603 */ 1604 private void parseElements(char[] saved) throws Exception { 1605 char c; 1606 char sep; 1607 1608 // Parse the first content particle 1609 skipWhitespace(); 1610 parseCp(); 1611 1612 // Check for end or for a separator. 1613 skipWhitespace(); 1614 c = readCh(); 1615 switch (c) { 1616 case ')': 1617 // VC: Proper Group/PE Nesting 1618 if (readBuffer != saved) { 1619 handler.verror("Illegal Group/PE nesting"); 1620 } 1621 1622 dataBufferAppend(')'); 1623 c = readCh(); 1624 switch (c) { 1625 case '*': 1626 case '+': 1627 case '?': 1628 dataBufferAppend(c); 1629 break; 1630 default: 1631 unread(c); 1632 } 1633 return; 1634 case ',': // Register the separator. 1635 case '|': 1636 sep = c; 1637 dataBufferAppend(c); 1638 break; 1639 default: 1640 fatal("bad separator in content model", c, null); 1641 return; 1642 } 1643 1644 // Parse the rest of the content model. 1645 while (true) { 1646 skipWhitespace(); 1647 parseCp(); 1648 skipWhitespace(); 1649 c = readCh(); 1650 if (c == ')') { 1651 // VC: Proper Group/PE Nesting 1652 if (readBuffer != saved) { 1653 handler.verror("Illegal Group/PE nesting"); 1654 } 1655 1656 dataBufferAppend(')'); 1657 break; 1658 } else if (c != sep) { 1659 fatal("bad separator in content model", c, null); 1660 return; 1661 } else { 1662 dataBufferAppend(c); 1663 } 1664 } 1665 1666 // Check for the occurrence indicator. 1667 c = readCh(); 1668 switch (c) { 1669 case '?': 1670 case '*': 1671 case '+': 1672 dataBufferAppend(c); 1673 return; 1674 default: 1675 unread(c); 1676 return; 1677 } 1678 } 1679 1680 /** 1681 * Parse a content particle. 1682 * 1683 * <pre> 1684 * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')? 
1685 * </pre> 1686 */ 1687 private void parseCp() throws Exception { 1688 if (tryRead('(')) { 1689 dataBufferAppend('('); 1690 parseElements(readBuffer); 1691 } else { 1692 dataBufferAppend(readNmtoken(true)); 1693 char c = readCh(); 1694 switch (c) { 1695 case '?': 1696 case '*': 1697 case '+': 1698 dataBufferAppend(c); 1699 break; 1700 default: 1701 unread(c); 1702 break; 1703 } 1704 } 1705 } 1706 1707 /** 1708 * Parse mixed content. 1709 * 1710 * <pre> 1711 * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*' 1712 * | '(' S? ('#PCDATA') S? ')' 1713 * </pre> 1714 * 1715 * @param saved 1716 * Buffer for entity that should have the terminal ')' 1717 */ 1718 private void parseMixed(char[] saved) throws Exception { 1719 // Check for PCDATA alone. 1720 skipWhitespace(); 1721 if (tryRead(')')) { 1722 // VC: Proper Group/PE Nesting 1723 if (readBuffer != saved) { 1724 handler.verror("Illegal Group/PE nesting"); 1725 } 1726 1727 dataBufferAppend(")*"); 1728 tryRead('*'); 1729 return; 1730 } 1731 1732 // Parse mixed content. 1733 skipWhitespace(); 1734 while (!tryRead(")")) { 1735 require('|'); 1736 dataBufferAppend('|'); 1737 skipWhitespace(); 1738 dataBufferAppend(readNmtoken(true)); 1739 skipWhitespace(); 1740 } 1741 1742 // VC: Proper Group/PE Nesting 1743 if (readBuffer != saved) { 1744 handler.verror("Illegal Group/PE nesting"); 1745 } 1746 1747 require('*'); 1748 dataBufferAppend(")*"); 1749 } 1750 1751 /** 1752 * Parse an attribute list declaration. 1753 * 1754 * <pre> 1755 * [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' 1756 * </pre> 1757 * 1758 * <p> 1759 * NOTE: the '<!ATTLIST' has already been read. 
1760 */ 1761 private void parseAttlistDecl() throws Exception { 1762 String elementName; 1763 1764 requireWhitespace(); 1765 elementName = readNmtoken(true); 1766 boolean white = tryWhitespace(); 1767 while (!tryRead('>')) { 1768 if (!white) { 1769 fatal("whitespace required before attribute definition"); 1770 } 1771 parseAttDef(elementName); 1772 white = tryWhitespace(); 1773 } 1774 } 1775 1776 /** 1777 * Parse a single attribute definition. 1778 * 1779 * <pre> 1780 * [53] AttDef ::= S Name S AttType S DefaultDecl 1781 * </pre> 1782 */ 1783 private void parseAttDef(String elementName) throws Exception { 1784 String name; 1785 String type; 1786 String enumer = null; 1787 1788 // Read the attribute name. 1789 name = readNmtoken(true); 1790 1791 // Read the attribute type. 1792 requireWhitespace(); 1793 type = readAttType(); 1794 1795 // Get the string of enumerated values if necessary. 1796 if (handler.stringInterning) { 1797 if ("ENUMERATION" == type || "NOTATION" == type) { 1798 enumer = dataBufferToString(); 1799 } 1800 } else { 1801 if ("ENUMERATION".equals(type) || "NOTATION".equals(type)) { 1802 enumer = dataBufferToString(); 1803 } 1804 } 1805 1806 // Read the default value. 1807 requireWhitespace(); 1808 parseDefault(elementName, name, type, enumer); 1809 } 1810 1811 /** 1812 * Parse the attribute type. 
1813 * 1814 * <pre> 1815 * [54] AttType ::= StringType | TokenizedType | EnumeratedType 1816 * [55] StringType ::= 'CDATA' 1817 * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' 1818 * | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' 1819 * [57] EnumeratedType ::= NotationType | Enumeration 1820 * </pre> 1821 */ 1822 private String readAttType() throws Exception { 1823 if (tryRead('(')) { 1824 parseEnumeration(false); 1825 return "ENUMERATION"; 1826 } else { 1827 String typeString = readNmtoken(true); 1828 if (handler.stringInterning) { 1829 if ("NOTATION" == typeString) { 1830 parseNotationType(); 1831 return typeString; 1832 } else if ("CDATA" == typeString || "ID" == typeString 1833 || "IDREF" == typeString || "IDREFS" == typeString 1834 || "ENTITY" == typeString || "ENTITIES" == typeString 1835 || "NMTOKEN" == typeString || "NMTOKENS" == typeString) { 1836 return typeString; 1837 } 1838 } else { 1839 if ("NOTATION".equals(typeString)) { 1840 parseNotationType(); 1841 return typeString; 1842 } else if ("CDATA".equals(typeString) 1843 || "ID".equals(typeString) 1844 || "IDREF".equals(typeString) 1845 || "IDREFS".equals(typeString) 1846 || "ENTITY".equals(typeString) 1847 || "ENTITIES".equals(typeString) 1848 || "NMTOKEN".equals(typeString) 1849 || "NMTOKENS".equals(typeString)) { 1850 return typeString; 1851 } 1852 } 1853 fatal("illegal attribute type", typeString, null); 1854 return null; 1855 } 1856 } 1857 1858 /** 1859 * Parse an enumeration. 1860 * 1861 * <pre> 1862 * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' 1863 * </pre> 1864 * 1865 * <p> 1866 * NOTE: the '(' has already been read. 1867 */ 1868 private void parseEnumeration(boolean isNames) throws Exception { 1869 dataBufferAppend('('); 1870 1871 // Read the first token. 1872 skipWhitespace(); 1873 dataBufferAppend(readNmtoken(isNames)); 1874 // Read the remaining tokens. 
1875 skipWhitespace(); 1876 while (!tryRead(')')) { 1877 require('|'); 1878 dataBufferAppend('|'); 1879 skipWhitespace(); 1880 dataBufferAppend(readNmtoken(isNames)); 1881 skipWhitespace(); 1882 } 1883 dataBufferAppend(')'); 1884 } 1885 1886 /** 1887 * Parse a notation type for an attribute. 1888 * 1889 * <pre> 1890 * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks 1891 * (S? '|' S? name)* S? ')' 1892 * </pre> 1893 * 1894 * <p> 1895 * NOTE: the 'NOTATION' has already been read 1896 */ 1897 private void parseNotationType() throws Exception { 1898 requireWhitespace(); 1899 require('('); 1900 1901 parseEnumeration(true); 1902 } 1903 1904 /** 1905 * Parse the default value for an attribute. 1906 * 1907 * <pre> 1908 * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' 1909 * | (('#FIXED' S)? AttValue) 1910 * </pre> 1911 */ 1912 private void parseDefault(String elementName, String name, String type, 1913 String enumer) throws Exception { 1914 int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; 1915 String value = null; 1916 int flags = LIT_ATTRIBUTE; 1917 boolean saved = expandPE; 1918 String defaultType = null; 1919 1920 // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace 1921 // chars to spaces (doesn't matter when that's done if it doesn't 1922 // interfere with char refs expanding to whitespace). 
1923 1924 if (!skippedPE) { 1925 flags |= LIT_ENTITY_REF; 1926 if (handler.stringInterning) { 1927 if ("CDATA" != type) { 1928 flags |= LIT_NORMALIZE; 1929 } 1930 } else { 1931 if (!"CDATA".equals(type)) { 1932 flags |= LIT_NORMALIZE; 1933 } 1934 } 1935 } 1936 1937 expandPE = false; 1938 if (tryRead('#')) { 1939 if (tryRead("FIXED")) { 1940 defaultType = "#FIXED"; 1941 valueType = ATTRIBUTE_DEFAULT_FIXED; 1942 requireWhitespace(); 1943 value = readLiteral(flags); 1944 } else if (tryRead("REQUIRED")) { 1945 defaultType = "#REQUIRED"; 1946 valueType = ATTRIBUTE_DEFAULT_REQUIRED; 1947 } else if (tryRead("IMPLIED")) { 1948 defaultType = "#IMPLIED"; 1949 valueType = ATTRIBUTE_DEFAULT_IMPLIED; 1950 } else { 1951 fatal("illegal keyword for attribute default value"); 1952 } 1953 } else { 1954 value = readLiteral(flags); 1955 } 1956 expandPE = saved; 1957 setAttribute(elementName, name, type, enumer, value, valueType); 1958 if (handler.stringInterning) { 1959 if ("ENUMERATION" == type) { 1960 type = enumer; 1961 } else if ("NOTATION" == type) { 1962 type = "NOTATION " + enumer; 1963 } 1964 } else { 1965 if ("ENUMERATION".equals(type)) { 1966 type = enumer; 1967 } else if ("NOTATION".equals(type)) { 1968 type = "NOTATION " + enumer; 1969 } 1970 } 1971 if (!skippedPE) { 1972 handler.getDeclHandler().attributeDecl(elementName, name, type, 1973 defaultType, value); 1974 } 1975 } 1976 1977 /** 1978 * Parse a conditional section. 1979 * 1980 * <pre> 1981 * [61] conditionalSect ::= includeSect || ignoreSect 1982 * [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' 1983 * extSubsetDecl ']]>' 1984 * [63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' 1985 * ignoreSectContents* ']]>' 1986 * [64] ignoreSectContents ::= Ignore 1987 * ('<![' ignoreSectContents* ']]>' Ignore )* 1988 * [65] Ignore ::= Char* - (Char* ( '<![' | ']]>') Char* ) 1989 * </pre> 1990 * 1991 * <p> 1992 * NOTE: the '>![' has already been read. 
1993 */ 1994 private void parseConditionalSect(char[] saved) throws Exception { 1995 skipWhitespace(); 1996 if (tryRead("INCLUDE")) { 1997 skipWhitespace(); 1998 require('['); 1999 // VC: Proper Conditional Section/PE Nesting 2000 if (readBuffer != saved) { 2001 handler.verror("Illegal Conditional Section/PE nesting"); 2002 } 2003 skipWhitespace(); 2004 while (!tryRead("]]>")) { 2005 parseMarkupdecl(); 2006 skipWhitespace(); 2007 } 2008 } else if (tryRead("IGNORE")) { 2009 skipWhitespace(); 2010 require('['); 2011 // VC: Proper Conditional Section/PE Nesting 2012 if (readBuffer != saved) { 2013 handler.verror("Illegal Conditional Section/PE nesting"); 2014 } 2015 char c; 2016 expandPE = false; 2017 for (int nest = 1; nest > 0;) { 2018 c = readCh(); 2019 switch (c) { 2020 case '<': 2021 if (tryRead("![")) { 2022 nest++; 2023 } 2024 case ']': 2025 if (tryRead("]>")) { 2026 nest--; 2027 } 2028 } 2029 } 2030 expandPE = true; 2031 } else { 2032 fatal("conditional section must begin with INCLUDE or IGNORE"); 2033 } 2034 } 2035 2036 private void parseCharRef() throws SAXException, IOException { 2037 parseCharRef(true /* do flushDataBuffer by default */); 2038 } 2039 2040 /** 2041 * Try to read a character reference without consuming data from buffer. 2042 * 2043 * <pre> 2044 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' 2045 * </pre> 2046 * 2047 * <p> 2048 * NOTE: the '&#' has already been read. 
2049 */ 2050 private void tryReadCharRef() throws SAXException, IOException { 2051 int value = 0; 2052 char c; 2053 2054 if (tryRead('x')) { 2055 loop1: while (true) { 2056 c = readCh(); 2057 if (c == ';') { 2058 break loop1; 2059 } else { 2060 int n = Character.digit(c, 16); 2061 if (n == -1) { 2062 fatal("illegal character in character reference", c, 2063 null); 2064 break loop1; 2065 } 2066 value *= 16; 2067 value += n; 2068 } 2069 } 2070 } else { 2071 loop2: while (true) { 2072 c = readCh(); 2073 if (c == ';') { 2074 break loop2; 2075 } else { 2076 int n = Character.digit(c, 10); 2077 if (n == -1) { 2078 fatal("illegal character in character reference", c, 2079 null); 2080 break loop2; 2081 } 2082 value *= 10; 2083 value += n; 2084 } 2085 } 2086 } 2087 2088 // check for character refs being legal XML 2089 if ((value < 0x0020 && !(value == '\n' || value == '\t' || value == '\r')) 2090 || (value >= 0xD800 && value <= 0xDFFF) 2091 || value == 0xFFFE 2092 || value == 0xFFFF || value > 0x0010ffff) { 2093 fatal("illegal XML character reference U+" 2094 + Integer.toHexString(value)); 2095 } else if (value >= 0x007F && value <= 0x009F) // 2006-11-13 hsivonen 2096 { 2097 handler.warn("Character reference expands to a control character: U+00" 2098 + Integer.toHexString(c) + "."); 2099 } 2100 if (isPrivateUse(value)) { 2101 warnAboutPrivateUseChar(); 2102 } 2103 // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz 2104 // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: 2105 if (value > 0x0010ffff) { 2106 // too big for surrogate 2107 fatal("character reference " + value + " is too large for UTF-16", 2108 new Integer(value).toString(), null); 2109 } 2110 2111 } 2112 2113 /** 2114 * Read and interpret a character reference. 2115 * 2116 * <pre> 2117 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' 2118 * </pre> 2119 * 2120 * <p> 2121 * NOTE: the '&#' has already been read. 
2122 */ 2123 private void parseCharRef(boolean doFlush) throws SAXException, IOException { 2124 int value = 0; 2125 char c; 2126 2127 if (tryRead('x')) { 2128 loop1: while (true) { 2129 c = readCh(); 2130 if (c == ';') { 2131 break loop1; 2132 } else { 2133 int n = Character.digit(c, 16); 2134 if (n == -1) { 2135 fatal("illegal character in character reference", c, 2136 null); 2137 break loop1; 2138 } 2139 value *= 16; 2140 value += n; 2141 } 2142 } 2143 } else { 2144 loop2: while (true) { 2145 c = readCh(); 2146 if (c == ';') { 2147 break loop2; 2148 } else { 2149 int n = Character.digit(c, 10); 2150 if (n == -1) { 2151 fatal("illegal character in character reference", c, 2152 null); 2153 break loop2; 2154 } 2155 value *= 10; 2156 value += c - '0'; 2157 } 2158 } 2159 } 2160 2161 // check for character refs being legal XML 2162 if ((value < 0x0020 && !(value == '\n' || value == '\t' || value == '\r')) 2163 || (value >= 0xD800 && value <= 0xDFFF) 2164 || value == 0xFFFE 2165 || value == 0xFFFF || value > 0x0010ffff) { 2166 fatal("illegal XML character reference U+" 2167 + Integer.toHexString(value)); 2168 } else if (value >= 0x007F && value <= 0x009F) // 2006-11-13 hsivonen 2169 { 2170 handler.warn("Character reference expands to a control character: U+00" 2171 + Integer.toHexString(c) + "."); 2172 } 2173 if (isPrivateUse(value)) { 2174 warnAboutPrivateUseChar(); 2175 } 2176 2177 // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz 2178 // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: 2179 if (value <= 0x0000ffff) { 2180 // no surrogates needed 2181 dataBufferAppend((char) value); 2182 } else if (value <= 0x0010ffff) { 2183 value -= 0x10000; 2184 // > 16 bits, surrogate needed 2185 dataBufferAppend((char) (0xd800 | (value >> 10))); 2186 dataBufferAppend((char) (0xdc00 | (value & 0x0003ff))); 2187 } else { 2188 // too big for surrogate 2189 fatal("character reference " + value + " is too large for UTF-16", 2190 new Integer(value).toString(), null); 2191 } 2192 
if (doFlush) { 2193 dataBufferFlush(); 2194 } 2195 } 2196 2197 /** 2198 * Parse and expand an entity reference. 2199 * 2200 * <pre> 2201 * [68] EntityRef ::= '&' Name ';' 2202 * </pre> 2203 * 2204 * <p> 2205 * NOTE: the '&' has already been read. 2206 * 2207 * @param externalAllowed 2208 * External entities are allowed here. 2209 */ 2210 private void parseEntityRef(boolean externalAllowed) throws SAXException, 2211 IOException { 2212 String name; 2213 2214 name = readNmtoken(true); 2215 require(';'); 2216 switch (getEntityType(name)) { 2217 case ENTITY_UNDECLARED: 2218 // NOTE: XML REC describes amazingly convoluted handling for 2219 // this case. Nothing as meaningful as being a WFness error 2220 // unless the processor might _legitimately_ not have seen a 2221 // declaration ... which is what this implements. 2222 String message; 2223 2224 message = "reference to undeclared general entity " + name; 2225 if (skippedPE && !docIsStandalone) { 2226 handler.verror(message); 2227 // we don't know this entity, and it might be external... 
2228 if (externalAllowed) { 2229 handler.skippedEntity(name); 2230 } 2231 } else { 2232 fatal(message); 2233 } 2234 break; 2235 case ENTITY_INTERNAL: 2236 pushString(name, getEntityValue(name)); 2237 2238 // workaround for possible input pop before marking 2239 // the buffer reading position 2240 char t = readCh(); 2241 unread(t); 2242 int bufferPosMark = readBufferPos; 2243 2244 int end = readBufferPos + getEntityValue(name).length(); 2245 for (int k = readBufferPos; k < end; k++) { 2246 t = readCh(); 2247 if (t == '&') { 2248 t = readCh(); 2249 if (t == '#') { 2250 // try to match a character ref 2251 tryReadCharRef(); 2252 2253 // everything has been read 2254 if (readBufferPos >= end) { 2255 break; 2256 } 2257 k = readBufferPos; 2258 continue; 2259 } else if (Character.isLetter(t)) { 2260 // looks like an entity ref 2261 unread(t); 2262 readNmtoken(true); 2263 require(';'); 2264 2265 // everything has been read 2266 if (readBufferPos >= end) { 2267 break; 2268 } 2269 k = readBufferPos; 2270 continue; 2271 } 2272 fatal(" malformed entity reference"); 2273 } 2274 2275 } 2276 readBufferPos = bufferPosMark; 2277 break; 2278 case ENTITY_TEXT: 2279 if (externalAllowed) { 2280 pushURL(false, name, getEntityIds(name), null, null, null, 2281 true); 2282 } else { 2283 fatal("reference to external entity in attribute value.", 2284 name, null); 2285 } 2286 break; 2287 case ENTITY_NDATA: 2288 if (externalAllowed) { 2289 fatal("unparsed entity reference in content", name, null); 2290 } else { 2291 fatal("reference to external entity in attribute value.", 2292 name, null); 2293 } 2294 break; 2295 default: 2296 throw new RuntimeException(); 2297 } 2298 } 2299 2300 /** 2301 * Parse and expand a parameter entity reference. 2302 * 2303 * <pre> 2304 * [69] PEReference ::= '%' Name ';' 2305 * </pre> 2306 * 2307 * <p> 2308 * NOTE: the '%' has already been read. 
2309 */ 2310 private void parsePEReference() throws SAXException, IOException { 2311 String name; 2312 2313 name = "%" + readNmtoken(true); 2314 require(';'); 2315 switch (getEntityType(name)) { 2316 case ENTITY_UNDECLARED: 2317 // VC: Entity Declared 2318 handler.verror("reference to undeclared parameter entity " 2319 + name); 2320 2321 // we should disable handling of all subsequent declarations 2322 // unless this is a standalone document (info discarded) 2323 break; 2324 case ENTITY_INTERNAL: 2325 if (inLiteral) { 2326 pushString(name, getEntityValue(name)); 2327 } else { 2328 pushString(name, ' ' + getEntityValue(name) + ' '); 2329 } 2330 break; 2331 case ENTITY_TEXT: 2332 if (!inLiteral) { 2333 pushString(null, " "); 2334 } 2335 pushURL(true, name, getEntityIds(name), null, null, null, true); 2336 if (!inLiteral) { 2337 pushString(null, " "); 2338 } 2339 break; 2340 } 2341 } 2342 2343 /** 2344 * Parse an entity declaration. 2345 * 2346 * <pre> 2347 * [70] EntityDecl ::= GEDecl | PEDecl 2348 * [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' 2349 * [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' 2350 * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) 2351 * [74] PEDef ::= EntityValue | ExternalID 2352 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 2353 * | 'PUBLIC' S PubidLiteral S SystemLiteral 2354 * [76] NDataDecl ::= S 'NDATA' S Name 2355 * </pre> 2356 * 2357 * <p> 2358 * NOTE: the '<!ENTITY' has already been read. 2359 */ 2360 private void parseEntityDecl() throws Exception { 2361 boolean peFlag = false; 2362 int flags = 0; 2363 2364 // Check for a parameter entity. 2365 expandPE = false; 2366 requireWhitespace(); 2367 if (tryRead('%')) { 2368 peFlag = true; 2369 requireWhitespace(); 2370 } 2371 expandPE = true; 2372 2373 // Read the entity name, and prepend 2374 // '%' if necessary. 
2375 String name = readNmtoken(true); 2376 // NE08 2377 if (name.indexOf(':') >= 0) { 2378 fatal("Illegal character(':') in entity name ", name, null); 2379 } 2380 if (peFlag) { 2381 name = "%" + name; 2382 } 2383 2384 // Read the entity value. 2385 requireWhitespace(); 2386 char c = readCh(); 2387 unread(c); 2388 if (c == '"' || c == '\'') { 2389 // Internal entity ... replacement text has expanded refs 2390 // to characters and PEs, but not to general entities 2391 String value = readLiteral(flags); 2392 setInternalEntity(name, value); 2393 } else { 2394 // Read the external IDs 2395 ExternalIdentifiers ids = readExternalIds(false, false); 2396 2397 // Check for NDATA declaration. 2398 boolean white = tryWhitespace(); 2399 if (!peFlag && tryRead("NDATA")) { 2400 if (!white) { 2401 fatal("whitespace required before NDATA"); 2402 } 2403 requireWhitespace(); 2404 String notationName = readNmtoken(true); 2405 if (!skippedPE) { 2406 setExternalEntity(name, ENTITY_NDATA, ids, notationName); 2407 handler.unparsedEntityDecl(name, ids.publicId, 2408 ids.systemId, ids.baseUri, notationName); 2409 } 2410 } else if (!skippedPE) { 2411 setExternalEntity(name, ENTITY_TEXT, ids, null); 2412 handler.getDeclHandler().externalEntityDecl(name, ids.publicId, 2413 handler.resolveURIs() 2414 // FIXME: ASSUMES not skipped 2415 // "false" forces error on bad URI 2416 ? handler.absolutize(ids.baseUri, ids.systemId, false) 2417 : ids.systemId); 2418 } 2419 } 2420 2421 // Finish the declaration. 2422 skipWhitespace(); 2423 require('>'); 2424 } 2425 2426 /** 2427 * Parse a notation declaration. 2428 * 2429 * <pre> 2430 * [82] NotationDecl ::= '<!NOTATION' S Name S 2431 * (ExternalID | PublicID) S? '>' 2432 * [83] PublicID ::= 'PUBLIC' S PubidLiteral 2433 * </pre> 2434 * 2435 * <P> 2436 * NOTE: the '<!NOTATION' has already been read. 
     */
    private void parseNotationDecl() throws Exception {
        String nname;
        ExternalIdentifiers ids;

        requireWhitespace();
        nname = readNmtoken(true);
        // NE08: notation names may not contain a colon
        if (nname.indexOf(':') >= 0) {
            fatal("Illegal character(':') in notation name ", nname, null);
        }
        requireWhitespace();

        // Read the external identifiers (first argument: we are in a
        // notation declaration, so the system id may be omitted).
        ids = readExternalIds(true, false);

        // Register the notation.
        setNotation(nname, ids);

        skipWhitespace();
        require('>');
    }

    /**
     * Parse character data.
     *
     * <pre>
     * [14] CharData ::= [^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)
     * </pre>
     *
     * <p>
     * Characters are reported to the handler straight out of
     * <code>readBuffer</code>, one buffer-sized run at a time. A run is
     * reported as ignorable whitespace only if the current element has
     * element-only content, has not been marked dirty, and the run contained
     * nothing but whitespace. Scanning stops at '&lt;' or '&amp;' (normal
     * end) or at a ']]&gt;' sequence (fatal error).
     */
    private void parseCharData() throws Exception {
        char c;
        // 0 = still scanning, 1 = stopped at '<' or '&', 2 = stopped at "]]>"
        int state = 0;
        boolean pureWhite = false;

        // assert (dataBufferPos == 0);

        // are we expecting pure whitespace? it might be dirty...
        if ((currentElementContent == CONTENT_ELEMENTS)
                && !isDirtyCurrentElement) {
            pureWhite = true;
        }

        // always report right out of readBuffer
        // to minimize (pointless) buffer copies
        while (true) {
            int i;

            loop: for (i = readBufferPos; i < readBufferLength; i++) {
                // location is advanced before the character is classified
                advanceLocation();
                switch (c = readBuffer[i]) {
                case '\n':
                    nextCharOnNewLine = true;
                    // pureWhite unmodified
                    break;
                case '\r': // should not happen!!
                case '\t':
                case ' ':
                    // pureWhite unmodified
                    break;
                case '&':
                case '<':
                    // pureWhite unmodified
                    // CLEAN end of text sequence
                    state = 1;
                    break loop;
                case ']':
                    // that's not a whitespace char, and
                    // can not terminate pure whitespace either
                    pureWhite = false;
                    // "]]>" is only detected when all three characters are
                    // in the current buffer
                    if ((i + 2) < readBufferLength) {
                        if (readBuffer[i + 1] == ']'
                                && readBuffer[i + 2] == '>') {
                            // ERROR end of text sequence
                            state = 2;
                            break loop;
                        }
                    } else {
                        // FIXME missing two end-of-buffer cases
                    }
                    break;
                default:
                    // Fatal: below U+0020 or above U+FFFD in any version;
                    // in XML 1.1 also U+007F..U+009F except U+0085.
                    // Otherwise U+007F..U+009F only draws a warning.
                    if ((c < 0x0020 || c > 0xFFFD)
                            || ((c >= 0x007f) && (c <= 0x009f)
                                    && (c != 0x0085) && xmlVersion == XML_11)) {
                        fatal("illegal XML character U+"
                                + Integer.toHexString(c));
                    } else if (c >= '\u007F' && c <= '\u009F') // 2006-04-25
                                                               // hsivonen
                    {
                        handler.warn("Saw a control character: U+00"
                                + Integer.toHexString(c) + ".");
                    }
                    // that's not a whitespace char
                    pureWhite = false;
                }
            }
            // NOTE(review): called on every exit from the scan loop, whether
            // it stopped at a delimiter or ran out of buffer -- presumably it
            // undoes the last advanceLocation(); confirm against its
            // definition, which is outside this view.
            rollbackLocation();
            // report characters/whitespace
            int length = i - readBufferPos;

            if (length != 0) {
                // report with the previous location, then restore ours
                int saveLine = line;
                int saveColumn = column;
                line = linePrev;
                column = columnPrev;
                if (pureWhite) {
                    handler.ignorableWhitespace(readBuffer, readBufferPos,
                            length);
                } else {
                    handler.charData(readBuffer, readBufferPos, length);
                }
                line = saveLine;
                column = saveColumn;
                readBufferPos = i;
            }

            if (state != 0) {
                break;
            }

            // fill next buffer from this entity, or
            // pop stack and continue with previous entity
            unread(readCh());
        }
        if (!pureWhite) {
            isDirtyCurrentElement = true;
        }
        // The loop only exits with state 1 or 2; anything other than 1
        // means the scan stopped at "]]>", which is an error here.
        if (state != 1)
        {
            fatal("character data may not contain ']]>'");
        }
    }

    /**
     * Advance the tracked location by one character. The current line and
     * column are first saved in <code>linePrev</code> and
     * <code>columnPrev</code>. If the previous character was flagged as a
     * newline (<code>nextCharOnNewLine</code>), the line is incremented and
     * the column reset to 1; otherwise the column is incremented. The flag
     * is cleared afterwards.
     */
    private void advanceLocation()
    {
        linePrev = line;
        columnPrev = column;
        if (nextCharOnNewLine) {
            line++;
            column = 1;
        } else {
            column++;
        }
        nextCharOnNewLine = false;
    }

    // ////////////////////////////////////////////////////////////////////
    // High-level reading and scanning methods.
    // ////////////////////////////////////////////////////////////////////

    /**
     * Require whitespace characters: read one character, report a fatal
     * error if it is not whitespace, otherwise skip any further whitespace.
     */
    private void requireWhitespace() throws SAXException, IOException {
        char c = readCh();
        if (isWhitespace(c)) {
            skipWhitespace();
        } else {
            fatal("whitespace required", c, null);
        }
    }

    /**
     * Skip whitespace characters.
     *
     * <pre>
     * [3] S ::= (#x20 | #x9 | #xd | #xa)+
     * </pre>
     *
     * <p>
     * With <code>USE_CHEATS</code> the current read buffer is scanned
     * directly; the slow path through <code>readCh()</code> is taken when
     * the buffer is exhausted, or when a '%' is met while
     * <code>expandPE</code> is set.
     */
    private void skipWhitespace() throws SAXException, IOException {
        // Start with a little cheat. Most of
        // the time, the white space will fall
        // within the current read buffer; if
        // not, then fall through.
        if (USE_CHEATS) {

            loop: for (int i = readBufferPos; i < readBufferLength; i++) {
                // NOTE(review): the location is advanced before the
                // character is examined, including for the non-whitespace
                // character that ends the scan -- confirm this is intended.
                advanceLocation();
                switch (readBuffer[i]) {
                case ' ':
                case '\t':
                case '\r':
                    break;
                case '\n':
                    nextCharOnNewLine = true;
                    break;
                case '%':
                    // leave the '%' to the slow path when PEs are expanded
                    if (expandPE) {
                        break loop;
                    }
                    // else fall through...
                default:
                    // first non-whitespace character: stop right here
                    readBufferPos = i;
                    return;
                }
            }
        }

        // OK, do it the slow way.
        char c = readCh();
        while (isWhitespace(c)) {
            c = readCh();
        }
        unread(c);
    }

    /**
     * Read a name or (when parsing an enumeration) name token.
     *
     * <pre>
     * [5] Name ::= (Letter | '_' | ':') (NameChar)*
     * [7] Nmtoken ::= (NameChar)+
     * </pre>
     *
     * <p>
     * With <code>USE_CHEATS</code> the token is first looked for entirely
     * inside the current read buffer. If no terminating character is found
     * there (or a '%' is met while <code>expandPE</code> is set), the token
     * is re-read character by character through <code>readCh()</code> into
     * <code>nameBuffer</code>.
     *
     * @param isName
     *            true to apply the name-start checks to the first character
     *            (a Name); false for a plain name token.
     * @return the interned name or name token.
     */
    private String readNmtoken(boolean isName) throws SAXException, IOException {
        char c;

        if (USE_CHEATS) {
            // Fast path: scan the read buffer without consuming it;
            // readBufferPos only moves once a terminator has been found.
            loop: for (int i = readBufferPos; i < readBufferLength; i++) {
                c = readBuffer[i];
                switch (c) {
                case '%':
                    if (expandPE) {
                        break loop;
                    }
                    // else fall through...

                    // What may legitimately come AFTER a name/nmtoken?
                case '<':
                case '>':
                case '&':
                case ',':
                case '|':
                case '*':
                case '+':
                case '?':
                case ')':
                case '=':
                case '\'':
                case '"':
                case '[':
                case ' ':
                case '\t':
                case '\r':
                case '\n':
                case ';':
                case '/':
                    int start = readBufferPos;
                    // a terminator at the very first position: no name at all
                    if (i == start) {
                        fatal("name expected", readBuffer[i], null);
                    }
                    readBufferPos = i;
                    return intern(readBuffer, start, i - start);

                default:
                    // FIXME ... per IBM's OASIS test submission, these:
                    // ? U+06dd
                    // Combining U+309B
                    // these switches are kind of ugly but at least we won't
                    // have to go over the whole list for each char

                    // Explicit rejections for the first character of a Name,
                    // dispatched on the high byte and then on bits 4-7.
                    if (isName && i == readBufferPos) {
                        char c2 = (char) (c & 0x00f0);
                        switch (c & 0xff00) {
                        // starting with 01
                        case 0x0100:
                            switch (c2) {
                            case 0x0030:
                                if (c == 0x0132 || c == 0x0133
                                        || c == 0x013f) {
                                    fatal("Not a name start character, U+"
                                            + Integer.toHexString(c));
                                }
                                break;
                            case 0x0040:
                                if (c == 0x0140 || c == 0x0149) {
                                    fatal("Not a name start character, U+"
                                            + Integer.toHexString(c));
                                }
                                break;
                            case 0x00c0:
                                if (c == 0x01c4 || c == 0x01cc) {
                                    fatal("Not a name start character, U+"
                                            + Integer.toHexString(c));
                                }
                                break;
                            case 0x00f0:
                                if (c == 0x01f1 || c == 0x01f3) {
                                    fatal("Not a name start character, U+"
                                            + Integer.toHexString(c));
                                }
                                break;
                            case 0x00b0:
                                // NOTE(review): this test can never be true:
                                // both values have 0xf0 in bits 4-7, not
                                // 0xb0. It duplicates the case above; the
                                // intended code points are unknown.
                                if (c == 0x01f1 || c == 0x01f3) {
                                    fatal("Not a name start character, U+"
                                            + Integer.toHexString(c));
                                }
                                break;
                            default:
                                if (c == 0x017f) {
                                    fatal("Not a name start character, U+"
                                            + Integer.toHexString(c));
                                }
                            }

                            break;
                        // starting with 11
                        case 0x1100:
                            switch (c2) {
                            case 0x0000:
                                if (c == 0x1104 || c == 0x1108
                                        || c == 0x110a
                                        || c == 0x110d) {
                                    fatal("Not a name start character, U+"
                                            + Integer.toHexString(c));
                                }
                                break;
                            case 0x0030:
                                if (c == 0x113b || c == 0x113f) {
                                    fatal("Not a name start character, U+"
                                            + Integer.toHexString(c));
                                }
                                break;
                            case 0x0040:
                                if (c == 0x1141 || c == 0x114d
                                        || c == 0x114f) {
                                    fatal("Not a name start character, U+"
                                            + Integer.toHexString(c));
                                }
                                break;
                            case 0x0050:
                                if (c == 0x1151 || c == 0x1156) {
                                    fatal("Not a name start character, U+"
                                            + Integer.toHexString(c));
                                }
                                break;
                            case 0x0060:
                                if (c == 0x1162 || c == 0x1164
                                        || c == 0x1166
                                        || c == 0x116b
                                        || c == 0x116f) {
                                    fatal("Not a name start character, U+"
                                            + Integer.toHexString(c));
                                }
                                break;
                            case 0x00b0:
                                // NOTE(review): the 0x116f comparison can
                                // never be true in this case (its bits 4-7
                                // are 0x60); it is already covered above.
                                if (c == 0x11b6 || c == 0x11b9
                                        || c == 0x11bb
                                        || c == 0x116f) {
                                    fatal("Not a name start character, U+"
                                            + Integer.toHexString(c));
                                }
                                break;
                            default:
                                if (c == 0x1174 || c == 0x119f
                                        || c == 0x11ac
                                        || c == 0x11c3
                                        || c == 0x11f1) {
                                    fatal("Not a name start character, U+"
                                            + Integer.toHexString(c));
                                }
                            }
                            break;
                        default:
                            if (c == 0x0e46 || c == 0x1011
                                    || c == 0x212f || c == 0x0587
                                    || c == 0x0230) {
                                fatal("Not a name start character, U+"
                                        + Integer.toHexString(c));
                            }
                        }
                    }
                    // punt on exact tests from Appendix A; approximate
                    // them using the Unicode ID start/part rules
                    if (i == readBufferPos && isName) {
                        if (!Character.isUnicodeIdentifierStart(c)
                                && c != ':' && c != '_') {
                            fatal("Not a name start character, U+"
                                    + Integer.toHexString(c));
                        }
                    } else if (!Character.isUnicodeIdentifierPart(c)
                            && c != '-' && c != ':' && c != '_' && c != '.'
                            && !isExtender(c)) {
                        fatal("Not a name character, U+"
                                + Integer.toHexString(c));
                    }
                }
            }
        }

        // Slow path: accumulate the token in nameBuffer. Nothing was
        // consumed above, so reading starts again at the first character.
        nameBufferPos = 0;

        // Read the first character.
        loop: while (true) {
            c = readCh();
            switch (c) {
            case '%':
            case '<':
            case '>':
            case '&':
            case ',':
            case '|':
            case '*':
            case '+':
            case '?':
            case ')':
            case '=':
            case '\'':
            case '"':
            case '[':
            case ' ':
            case '\t':
            case '\n':
            case '\r':
            case ';':
            case '/':
                // terminator: push it back and finish the token
                unread(c);
                if (nameBufferPos == 0) {
                    fatal("name expected");
                }
                // punt on exact tests from Appendix A, but approximate them
                // (the first character is only validated here, at the end)
                if (isName
                        && !Character.isUnicodeIdentifierStart(nameBuffer[0])
                        && ":_".indexOf(nameBuffer[0]) == -1) {
                    fatal("Not a name start character, U+"
                            + Integer.toHexString(nameBuffer[0]));
                }
                String s = intern(nameBuffer, 0, nameBufferPos);
                nameBufferPos = 0;
                return s;
            default:
                // punt on exact tests from Appendix A, but approximate them

                // the first character of a Name is exempt from this check
                if ((nameBufferPos != 0 || !isName)
                        && !Character.isUnicodeIdentifierPart(c)
                        && ":-_.".indexOf(c) == -1 && !isExtender(c)) {
                    fatal("Not a name character, U+"
                            + Integer.toHexString(c));
                }
                if (nameBufferPos >= nameBuffer.length) {
                    nameBuffer = (char[]) extendArray(nameBuffer,
                            nameBuffer.length, nameBufferPos);
                }
                nameBuffer[nameBufferPos++] = c;
            }
        }
    }

    /**
     * Test whether a character is one of the code points this parser
     * accepts as an XML "Extender" name character.
     *
     * @param c
     *            the character to test.
     * @return true if the character is in the hard-coded extender list.
     */
    private static boolean isExtender(char c) {
        // [88] Extender ::= ...
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
                || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
                || (c >= 0x3031 && c <= 0x3035) || (c >= 0x309d && c <= 0x309e)
                || (c >= 0x30fc && c <= 0x30fe);
    }

    /**
     * Read a literal. With matching single or double quotes as delimiters (and
     * not embedded!) this is used to parse:
     *
     * <pre>
     * [9] EntityValue ::= ... ([^%&amp;] | PEReference | Reference)* ...
     * [10] AttValue ::= ... ([^&lt;&amp;] | Reference)* ...
     * [11] SystemLiteral ::= ... (URLchar - "'")* ...
2904 * [12] PubidLiteral ::= ... (PubidChar - "'")* ... 2905 * </pre> 2906 * 2907 * as well as the quoted strings in XML and text declarations (for version, 2908 * encoding, and standalone) which have their own constraints. 2909 */ 2910 private String readLiteral(int flags) throws SAXException, IOException { 2911 char delim, c; 2912 int startLine = line; 2913 boolean saved = expandPE; 2914 boolean savedReport = doReport; 2915 2916 // Find the first delimiter. 2917 delim = readCh(); 2918 if (delim != '"' && delim != '\'') { 2919 fatal("expected '\"' or \"'\"", delim, null); 2920 return null; 2921 } 2922 inLiteral = true; 2923 if ((flags & LIT_DISABLE_PE) != 0) { 2924 expandPE = false; 2925 } 2926 doReport = false; 2927 2928 // Each level of input source has its own buffer; remember 2929 // ours, so we won't read the ending delimiter from any 2930 // other input source, regardless of entity processing. 2931 char[] ourBuf = readBuffer; 2932 2933 // Read the literal. 2934 try { 2935 c = readCh(); 2936 loop: while (!(c == delim && readBuffer == ourBuf)) { 2937 switch (c) { 2938 // attributes and public ids are normalized 2939 // in almost the same ways 2940 case '\n': 2941 case '\r': 2942 if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0) { 2943 c = ' '; 2944 } 2945 break; 2946 case '\t': 2947 if ((flags & LIT_ATTRIBUTE) != 0) { 2948 c = ' '; 2949 } 2950 break; 2951 case '&': 2952 c = readCh(); 2953 // Char refs are expanded immediately, except for 2954 // all the cases where it's deferred. 2955 if (c == '#') { 2956 if ((flags & LIT_DISABLE_CREF) != 0) { 2957 dataBufferAppend('&'); 2958 break; 2959 } 2960 parseCharRef(false /* Do not do flushDataBuffer */); 2961 2962 // exotic WFness risk: this is an entity literal, 2963 // dataBuffer [dataBufferPos - 1] == '&', and 2964 // following chars are a _partial_ entity/char ref 2965 2966 // It looks like an entity ref ... 2967 } else { 2968 unread(c); 2969 // Expand it? 
2970 if ((flags & LIT_ENTITY_REF) > 0) { 2971 parseEntityRef(false); 2972 // Is it just data? 2973 } else if ((flags & LIT_DISABLE_EREF) != 0) { 2974 dataBufferAppend('&'); 2975 2976 // OK, it will be an entity ref -- expanded 2977 // later. 2978 } else { 2979 String name = readNmtoken(true); 2980 require(';'); 2981 dataBufferAppend('&'); 2982 dataBufferAppend(name); 2983 dataBufferAppend(';'); 2984 } 2985 } 2986 c = readCh(); 2987 continue loop; 2988 2989 case '<': 2990 // and why? Perhaps so "&foo;" expands the same 2991 // inside and outside an attribute? 2992 if ((flags & LIT_ATTRIBUTE) != 0) { 2993 fatal("attribute values may not contain '<'"); 2994 } 2995 break; 2996 2997 // We don't worry about case '%' and PE refs, readCh does. 2998 2999 default: 3000 break; 3001 } 3002 dataBufferAppend(c); 3003 c = readCh(); 3004 } 3005 } catch (EOFException e) { 3006 fatal("end of input while looking for delimiter (started on line " 3007 + startLine + ')', null, new Character(delim).toString()); 3008 } 3009 inLiteral = false; 3010 expandPE = saved; 3011 doReport = savedReport; 3012 3013 // Normalise whitespace if necessary. 3014 if ((flags & LIT_NORMALIZE) > 0) { 3015 dataBufferNormalize(); 3016 } 3017 3018 // Return the value. 3019 return dataBufferToString(); 3020 } 3021 3022 /** 3023 * Try reading external identifiers. A system identifier is not required for 3024 * notations. 3025 * 3026 * @param inNotation 3027 * Are we parsing a notation decl? 3028 * @param isSubset 3029 * Parsing external subset decl (may be omitted)? 3030 * @return A three-member String array containing the identifiers, or nulls. 3031 * Order: public, system, baseURI. 
3032 */ 3033 private ExternalIdentifiers readExternalIds(boolean inNotation, 3034 boolean isSubset) throws Exception { 3035 char c; 3036 ExternalIdentifiers ids = new ExternalIdentifiers(); 3037 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 3038 3039 if (tryRead("PUBLIC")) { 3040 requireWhitespace(); 3041 ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags); 3042 if (inNotation) { 3043 skipWhitespace(); 3044 c = readCh(); 3045 unread(c); 3046 if (c == '"' || c == '\'') { 3047 ids.systemId = readLiteral(flags); 3048 } 3049 } else { 3050 requireWhitespace(); 3051 ids.systemId = readLiteral(flags); 3052 } 3053 3054 for (int i = 0; i < ids.publicId.length(); i++) { 3055 c = ids.publicId.charAt(i); 3056 if (c >= 'a' && c <= 'z') { 3057 continue; 3058 } 3059 if (c >= 'A' && c <= 'Z') { 3060 continue; 3061 } 3062 if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1) { 3063 continue; 3064 } 3065 fatal("illegal PUBLIC id character U+" + Integer.toHexString(c)); 3066 } 3067 } else if (tryRead("SYSTEM")) { 3068 requireWhitespace(); 3069 ids.systemId = readLiteral(flags); 3070 } else if (!isSubset) { 3071 fatal("missing SYSTEM or PUBLIC keyword"); 3072 } 3073 3074 if (ids.systemId != null) { 3075 if (ids.systemId.indexOf('#') != -1) { 3076 handler.verror("SYSTEM id has a URI fragment: " + ids.systemId); 3077 } 3078 ids.baseUri = handler.getSystemId(); 3079 if (ids.baseUri == null && uriWarnings) { 3080 handler.warn("No base URI; hope URI is absolute: " 3081 + ids.systemId); 3082 } 3083 } 3084 3085 return ids; 3086 } 3087 3088 /** 3089 * Test if a character is whitespace. 3090 * 3091 * <pre> 3092 * [3] S ::= (#x20 | #x9 | #xd | #xa)+ 3093 * </pre> 3094 * 3095 * @param c 3096 * The character to test. 3097 * @return true if the character is whitespace. 
3098 */ 3099 private final boolean isWhitespace(char c) { 3100 if (c > 0x20) { 3101 return false; 3102 } 3103 if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d) { 3104 return true; 3105 } 3106 return false; // illegal ... 3107 } 3108 3109 // //////////////////////////////////////////////////////////////////// 3110 // Utility routines. 3111 // //////////////////////////////////////////////////////////////////// 3112 3113 /** 3114 * Add a character to the data buffer. 3115 */ 3116 private void dataBufferAppend(char c) { 3117 // Expand buffer if necessary. 3118 if (dataBufferPos >= dataBuffer.length) { 3119 dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length, 3120 dataBufferPos); 3121 } 3122 dataBuffer[dataBufferPos++] = c; 3123 } 3124 3125 /** 3126 * Add a string to the data buffer. 3127 */ 3128 private void dataBufferAppend(String s) { 3129 dataBufferAppend(s.toCharArray(), 0, s.length()); 3130 } 3131 3132 /** 3133 * Append (part of) a character array to the data buffer. 3134 */ 3135 private void dataBufferAppend(char[] ch, int start, int length) { 3136 dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length, 3137 dataBufferPos + length); 3138 3139 System.arraycopy(ch, start, dataBuffer, dataBufferPos, length); 3140 dataBufferPos += length; 3141 } 3142 3143 /** 3144 * Normalise space characters in the data buffer. 3145 */ 3146 private void dataBufferNormalize() { 3147 int i = 0; 3148 int j = 0; 3149 int end = dataBufferPos; 3150 3151 // Skip spaces at the start. 3152 while (j < end && dataBuffer[j] == ' ') { 3153 j++; 3154 } 3155 3156 // Skip whitespace at the end. 3157 while (end > j && dataBuffer[end - 1] == ' ') { 3158 end--; 3159 } 3160 3161 // Start copying to the left. 3162 while (j < end) { 3163 3164 char c = dataBuffer[j++]; 3165 3166 // Normalise all other spaces to 3167 // a single space. 
3168 if (c == ' ') { 3169 while (j < end && dataBuffer[j++] == ' ') { 3170 continue; 3171 } 3172 dataBuffer[i++] = ' '; 3173 dataBuffer[i++] = dataBuffer[j - 1]; 3174 } else { 3175 dataBuffer[i++] = c; 3176 } 3177 } 3178 3179 // The new length is <= the old one. 3180 dataBufferPos = i; 3181 } 3182 3183 /** 3184 * Convert the data buffer to a string. 3185 */ 3186 private String dataBufferToString() { 3187 String s = new String(dataBuffer, 0, dataBufferPos); 3188 dataBufferPos = 0; 3189 return s; 3190 } 3191 3192 /** 3193 * Flush the contents of the data buffer to the handler, as appropriate, and 3194 * reset the buffer for new input. 3195 */ 3196 private void dataBufferFlush() throws SAXException { 3197 int saveLine = line; 3198 int saveColumn = column; 3199 line = linePrev; 3200 column = columnPrev; 3201 if (currentElementContent == CONTENT_ELEMENTS && dataBufferPos > 0 3202 && !inCDATA) { 3203 // We can't just trust the buffer to be whitespace, there 3204 // are (error) cases when it isn't 3205 for (int i = 0; i < dataBufferPos; i++) { 3206 if (!isWhitespace(dataBuffer[i])) { 3207 handler.charData(dataBuffer, 0, dataBufferPos); 3208 dataBufferPos = 0; 3209 } 3210 } 3211 if (dataBufferPos > 0) { 3212 handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos); 3213 dataBufferPos = 0; 3214 } 3215 } else if (dataBufferPos > 0) { 3216 handler.charData(dataBuffer, 0, dataBufferPos); 3217 dataBufferPos = 0; 3218 } 3219 line = saveLine; 3220 column = saveColumn; 3221 } 3222 3223 /** 3224 * Require a string to appear, or throw an exception. 3225 * <p> 3226 * <em>Precondition:</em> Entity expansion is not required. 3227 * <p> 3228 * <em>Precondition:</em> data buffer has no characters that will get sent 3229 * to the application. 
3230 */ 3231 private void require(String delim) throws SAXException, IOException { 3232 int length = delim.length(); 3233 char[] ch; 3234 3235 if (length < dataBuffer.length) { 3236 ch = dataBuffer; 3237 delim.getChars(0, length, ch, 0); 3238 } else { 3239 ch = delim.toCharArray(); 3240 } 3241 3242 if (USE_CHEATS && length <= (readBufferLength - readBufferPos)) { 3243 int offset = readBufferPos; 3244 3245 for (int i = 0; i < length; i++, offset++) { 3246 if (ch[i] != readBuffer[offset]) { 3247 fatal("required string", null, delim); 3248 } 3249 } 3250 readBufferPos = offset; 3251 3252 } else { 3253 for (int i = 0; i < length; i++) { 3254 require(ch[i]); 3255 } 3256 } 3257 } 3258 3259 /** 3260 * Require a character to appear, or throw an exception. 3261 */ 3262 private void require(char delim) throws SAXException, IOException { 3263 char c = readCh(); 3264 3265 if (c != delim) { 3266 fatal("required character", c, new Character(delim).toString()); 3267 } 3268 } 3269 3270 /** 3271 * Create an interned string from a character array. Ælfred uses this 3272 * method to create an interned version of all names and name tokens, so 3273 * that it can test equality with <code>==</code> instead of 3274 * <code>String.equals ()</code>. 3275 * 3276 * <p> 3277 * This is much more efficient than constructing a non-interned string 3278 * first, and then interning it. 3279 * 3280 * @param ch 3281 * an array of characters for building the string. 3282 * @param start 3283 * the starting position in the array. 3284 * @param length 3285 * the number of characters to place in the string. 3286 * @return an interned string. 3287 * @see #intern (String) 3288 * @see java.lang.String#intern 3289 */ 3290 public String intern(char[] ch, int start, int length) { 3291 int index = 0; 3292 int hash = 0; 3293 Object[] bucket; 3294 3295 // Generate a hash code. This is a widely used string hash, 3296 // often attributed to Brian Kernighan. 
3297 for (int i = start; i < start + length; i++) { 3298 hash = 31 * hash + ch[i]; 3299 } 3300 hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH; 3301 3302 // Get the bucket -- consists of {array,String} pairs 3303 if ((bucket = symbolTable[hash]) == null) { 3304 // first string in this bucket 3305 bucket = new Object[8]; 3306 3307 // Search for a matching tuple, and 3308 // return the string if we find one. 3309 } else { 3310 while (index < bucket.length) { 3311 char[] chFound = (char[]) bucket[index]; 3312 3313 // Stop when we hit an empty entry. 3314 if (chFound == null) { 3315 break; 3316 } 3317 3318 // If they're the same length, check for a match. 3319 if (chFound.length == length) { 3320 for (int i = 0; i < chFound.length; i++) { 3321 // continue search on failure 3322 if (ch[start + i] != chFound[i]) { 3323 break; 3324 } else if (i == length - 1) { 3325 // That's it, we have a match! 3326 return (String) bucket[index + 1]; 3327 } 3328 } 3329 } 3330 index += 2; 3331 } 3332 // Not found -- we'll have to add it. 3333 3334 // Do we have to grow the bucket? 3335 bucket = (Object[]) extendArray(bucket, bucket.length, index); 3336 } 3337 symbolTable[hash] = bucket; 3338 3339 // OK, add it to the end of the bucket -- "local" interning. 3340 // Intern "globally" to let applications share interning benefits. 3341 // That is, "!=" and "==" work on our strings, not just equals(). 3342 String s = new String(ch, start, length).intern(); 3343 bucket[index] = s.toCharArray(); 3344 bucket[index + 1] = s; 3345 return s; 3346 } 3347 3348 /** 3349 * Ensure the capacity of an array, allocating a new one if necessary. 3350 * Usually extends only for name hash collisions. 
3351 */ 3352 private Object extendArray(Object array, int currentSize, int requiredSize) { 3353 if (requiredSize < currentSize) { 3354 return array; 3355 } else { 3356 System.err.println(requiredSize); 3357 System.err.flush(); 3358 Object newArray = null; 3359 int newSize = currentSize * 2; 3360 3361 if (newSize <= requiredSize) { 3362 newSize = requiredSize + 1; 3363 } 3364 3365 if (array instanceof char[]) { 3366 newArray = new char[newSize]; 3367 } else if (array instanceof Object[]) { 3368 newArray = new Object[newSize]; 3369 } else { 3370 throw new RuntimeException(); 3371 } 3372 3373 System.arraycopy(array, 0, newArray, 0, currentSize); 3374 return newArray; 3375 } 3376 } 3377 3378 // //////////////////////////////////////////////////////////////////// 3379 // XML query routines. 3380 // //////////////////////////////////////////////////////////////////// 3381 3382 boolean isStandalone() { 3383 return docIsStandalone; 3384 } 3385 3386 // 3387 // Elements 3388 // 3389 3390 private int getContentType(ElementDecl element, int defaultType) { 3391 int retval; 3392 3393 if (element == null) { 3394 return defaultType; 3395 } 3396 retval = element.contentType; 3397 if (retval == CONTENT_UNDECLARED) { 3398 retval = defaultType; 3399 } 3400 return retval; 3401 } 3402 3403 /** 3404 * Look up the content type of an element. 3405 * 3406 * @param name 3407 * The element type name. 3408 * @return An integer constant representing the content type. 3409 * @see #CONTENT_UNDECLARED 3410 * @see #CONTENT_ANY 3411 * @see #CONTENT_EMPTY 3412 * @see #CONTENT_MIXED 3413 * @see #CONTENT_ELEMENTS 3414 */ 3415 public int getElementContentType(String name) { 3416 ElementDecl element = elementInfo.get(name); 3417 return getContentType(element, CONTENT_UNDECLARED); 3418 } 3419 3420 /** 3421 * Register an element. 
Array format: [0] element type name [1] content 3422 * model (mixed, elements only) [2] attribute hash table 3423 */ 3424 private void setElement(String name, int contentType, String contentModel, 3425 HashMap<String, AttributeDecl> attributes) throws SAXException { 3426 if (skippedPE) { 3427 return; 3428 } 3429 3430 ElementDecl element = elementInfo.get(name); 3431 3432 // first <!ELEMENT ...> or <!ATTLIST ...> for this type? 3433 if (element == null) { 3434 element = new ElementDecl(); 3435 element.contentType = contentType; 3436 element.contentModel = contentModel; 3437 element.attributes = attributes; 3438 elementInfo.put(name, element); 3439 return; 3440 } 3441 3442 // <!ELEMENT ...> declaration? 3443 if (contentType != CONTENT_UNDECLARED) { 3444 // ... following an associated <!ATTLIST ...> 3445 if (element.contentType == CONTENT_UNDECLARED) { 3446 element.contentType = contentType; 3447 element.contentModel = contentModel; 3448 } else { 3449 // VC: Unique Element Type Declaration 3450 handler.verror("multiple declarations for element type: " 3451 + name); 3452 } 3453 } 3454 3455 // first <!ATTLIST ...>, before <!ELEMENT ...> ? 3456 else if (attributes != null) { 3457 element.attributes = attributes; 3458 } 3459 } 3460 3461 /** 3462 * Look up the attribute hash table for an element. The hash table is the 3463 * second item in the element array. 3464 */ 3465 private HashMap<String, AttributeDecl> getElementAttributes(String name) { 3466 ElementDecl element = elementInfo.get(name); 3467 return (element == null) ? null : element.attributes; 3468 } 3469 3470 // 3471 // Attributes 3472 // 3473 3474 /** 3475 * Get the declared attributes for an element type. 3476 * 3477 * @param elname 3478 * The name of the element type. 3479 * @return An iterator over all the attributes declared for a specific 3480 * element type. The results will be valid only after the DTD (if 3481 * any) has been parsed. 
3482 * @see #getAttributeType 3483 * @see #getAttributeEnumeration 3484 * @see #getAttributeDefaultValueType 3485 * @see #getAttributeDefaultValue 3486 * @see #getAttributeExpandedValue 3487 */ 3488 private Iterator<String> declaredAttributes(ElementDecl element) { 3489 HashMap<String, AttributeDecl> attlist; 3490 3491 if (element == null) { 3492 return null; 3493 } 3494 if ((attlist = element.attributes) == null) { 3495 return null; 3496 } 3497 return attlist.keySet().iterator(); 3498 } 3499 3500 /** 3501 * Get the declared attributes for an element type. 3502 * 3503 * @param elname 3504 * The name of the element type. 3505 * @return An iterator over all the attributes declared for a specific 3506 * element type. The results will be valid only after the DTD (if 3507 * any) has been parsed. 3508 * @see #getAttributeType 3509 * @see #getAttributeEnumeration 3510 * @see #getAttributeDefaultValueType 3511 * @see #getAttributeDefaultValue 3512 * @see #getAttributeExpandedValue 3513 */ 3514 public Iterator<String> declaredAttributes(String elname) { 3515 return declaredAttributes(elementInfo.get(elname)); 3516 } 3517 3518 /** 3519 * Retrieve the declared type of an attribute. 3520 * 3521 * @param name 3522 * The name of the associated element. 3523 * @param aname 3524 * The name of the attribute. 3525 * @return An interend string denoting the type, or null indicating an 3526 * undeclared attribute. 3527 */ 3528 public String getAttributeType(String name, String aname) { 3529 AttributeDecl attribute = getAttribute(name, aname); 3530 return (attribute == null) ? null : attribute.type; 3531 } 3532 3533 /** 3534 * Retrieve the allowed values for an enumerated attribute type. 3535 * 3536 * @param name 3537 * The name of the associated element. 3538 * @param aname 3539 * The name of the attribute. 3540 * @return A string containing the token list. 
3541 */ 3542 public String getAttributeEnumeration(String name, String aname) { 3543 AttributeDecl attribute = getAttribute(name, aname); 3544 // assert: attribute.enumeration is "ENUMERATION" or "NOTATION" 3545 return (attribute == null) ? null : attribute.enumeration; 3546 } 3547 3548 /** 3549 * Retrieve the default value of a declared attribute. 3550 * 3551 * @param name 3552 * The name of the associated element. 3553 * @param aname 3554 * The name of the attribute. 3555 * @return The default value, or null if the attribute was #IMPLIED or 3556 * simply undeclared and unspecified. 3557 * @see #getAttributeExpandedValue 3558 */ 3559 public String getAttributeDefaultValue(String name, String aname) { 3560 AttributeDecl attribute = getAttribute(name, aname); 3561 return (attribute == null) ? null : attribute.value; 3562 } 3563 3564 /* 3565 * // FIXME: Leaving this in, until W3C finally resolves the confusion // 3566 * between parts of the XML 2nd REC about when entity declararations // are 3567 * guaranteed to be known. Current code matches what section 5.1 // 3568 * (conformance) describes, but some readings of the self-contradicting // 3569 * text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that // 3570 * attribute expansion/normalization must be deferred in some cases // (just 3571 * TRY to identify them!). 3572 * 3573 * Retrieve the expanded value of a declared attribute. <p>General entities 3574 * (and char refs) will be expanded (once). @param name The name of the 3575 * associated element. @param aname The name of the attribute. 
@return The 3576 * expanded default value, or null if the attribute was #IMPLIED or simply 3577 * undeclared 3578 * 3579 * @see #getAttributeDefaultValue public String getAttributeExpandedValue 3580 * (String name, String aname) throws Exception { AttributeDecl 3581 * attribute = getAttribute (name, aname); 3582 * 3583 * if (attribute == null) { return null; } else if (attribute.defaultValue == 3584 * null && attribute.value != null) { // we MUST use the same buf for both 3585 * quotes else the literal // can't be properly terminated char buf [] = new 3586 * char [1]; int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE; String type = 3587 * getAttributeType (name, aname); 3588 * 3589 * if (type != "CDATA" && type != null) flags |= LIT_NORMALIZE; buf [0] = 3590 * '"'; pushCharArray (null, buf, 0, 1); pushString (null, attribute.value); 3591 * pushCharArray (null, buf, 0, 1); attribute.defaultValue = readLiteral 3592 * (flags); } return attribute.defaultValue; } 3593 */ 3594 3595 /** 3596 * Retrieve the default value mode of a declared attribute. 3597 * 3598 * @see #ATTRIBUTE_DEFAULT_SPECIFIED 3599 * @see #ATTRIBUTE_DEFAULT_IMPLIED 3600 * @see #ATTRIBUTE_DEFAULT_REQUIRED 3601 * @see #ATTRIBUTE_DEFAULT_FIXED 3602 */ 3603 public int getAttributeDefaultValueType(String name, String aname) { 3604 AttributeDecl attribute = getAttribute(name, aname); 3605 return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED 3606 : attribute.valueType; 3607 } 3608 3609 /** 3610 * Register an attribute declaration for later retrieval. Format: - String 3611 * type - String default value - int value type - enumeration - processed 3612 * default value 3613 */ 3614 private void setAttribute(String elName, String name, String type, 3615 String enumeration, String value, int valueType) throws Exception { 3616 HashMap<String, AttributeDecl> attlist; 3617 3618 if (skippedPE) { 3619 return; 3620 } 3621 3622 // Create a new hashtable if necessary. 
3623 attlist = getElementAttributes(elName); 3624 if (attlist == null) { 3625 attlist = new HashMap<String, AttributeDecl>(); 3626 } 3627 3628 // ignore multiple attribute declarations! 3629 if (attlist.get(name) != null) { 3630 // warn ... 3631 return; 3632 } else { 3633 AttributeDecl attribute = new AttributeDecl(); 3634 attribute.type = type; 3635 attribute.value = value; 3636 attribute.valueType = valueType; 3637 attribute.enumeration = enumeration; 3638 attlist.put(name, attribute); 3639 3640 // save; but don't overwrite any existing <!ELEMENT ...> 3641 setElement(elName, CONTENT_UNDECLARED, null, attlist); 3642 } 3643 } 3644 3645 /** 3646 * Retrieve the attribute declaration for the given element name and name. 3647 */ 3648 private AttributeDecl getAttribute(String elName, String name) { 3649 HashMap<String, AttributeDecl> attlist = getElementAttributes(elName); 3650 return (attlist == null) ? null : attlist.get(name); 3651 } 3652 3653 // 3654 // Entities 3655 // 3656 3657 /** 3658 * Find the type of an entity. 3659 * 3660 * @returns An integer constant representing the entity type. 3661 * @see #ENTITY_UNDECLARED 3662 * @see #ENTITY_INTERNAL 3663 * @see #ENTITY_NDATA 3664 * @see #ENTITY_TEXT 3665 */ 3666 public int getEntityType(String ename) { 3667 EntityInfo entity = entityInfo.get(ename); 3668 return (entity == null) ? ENTITY_UNDECLARED : entity.type; 3669 } 3670 3671 /** 3672 * Return an external entity's identifiers. 3673 * 3674 * @param ename 3675 * The name of the external entity. 3676 * @return The entity's public identifier, system identifier, and base URI. 3677 * Null if the entity was not declared as an external entity. 3678 * @see #getEntityType 3679 */ 3680 public ExternalIdentifiers getEntityIds(String ename) { 3681 EntityInfo entity = entityInfo.get(ename); 3682 return (entity == null) ? null : entity.ids; 3683 } 3684 3685 /** 3686 * Return an internal entity's replacement text. 3687 * 3688 * @param ename 3689 * The name of the internal entity. 
3690 * @return The entity's replacement text, or null if the entity was not 3691 * declared as an internal entity. 3692 * @see #getEntityType 3693 */ 3694 public String getEntityValue(String ename) { 3695 EntityInfo entity = entityInfo.get(ename); 3696 return (entity == null) ? null : entity.value; 3697 } 3698 3699 /** 3700 * Register an entity declaration for later retrieval. 3701 */ 3702 private void setInternalEntity(String eName, String value) 3703 throws SAXException { 3704 if (skippedPE) { 3705 return; 3706 } 3707 3708 if (entityInfo.get(eName) == null) { 3709 EntityInfo entity = new EntityInfo(); 3710 entity.type = ENTITY_INTERNAL; 3711 entity.value = value; 3712 entityInfo.put(eName, entity); 3713 } 3714 if (handler.stringInterning) { 3715 if ("lt" == eName || "gt" == eName || "quot" == eName 3716 || "apos" == eName || "amp" == eName) { 3717 return; 3718 } 3719 } else { 3720 if ("lt".equals(eName) || "gt".equals(eName) 3721 || "quot".equals(eName) || "apos".equals(eName) 3722 || "amp".equals(eName)) { 3723 return; 3724 } 3725 } 3726 handler.getDeclHandler().internalEntityDecl(eName, value); 3727 } 3728 3729 /** 3730 * Register an external entity declaration for later retrieval. 3731 */ 3732 private void setExternalEntity(String eName, int eClass, 3733 ExternalIdentifiers ids, String nName) { 3734 if (entityInfo.get(eName) == null) { 3735 EntityInfo entity = new EntityInfo(); 3736 entity.type = eClass; 3737 entity.ids = ids; 3738 entity.notationName = nName; 3739 entityInfo.put(eName, entity); 3740 } 3741 } 3742 3743 // 3744 // Notations. 3745 // 3746 3747 /** 3748 * Report a notation declaration, checking for duplicates. 
3749 */ 3750 private void setNotation(String nname, ExternalIdentifiers ids) 3751 throws SAXException { 3752 if (skippedPE) { 3753 return; 3754 } 3755 3756 handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri); 3757 if (notationInfo.get(nname) == null) { 3758 notationInfo.put(nname, nname); 3759 } else { 3760 // VC: Unique Notation Name 3761 handler.verror("Duplicate notation name decl: " + nname); 3762 } 3763 } 3764 3765 // 3766 // Location. 3767 // 3768 3769 /** 3770 * Return the current line number. 3771 */ 3772 public int getLineNumber() { 3773 if (line > 0) { 3774 return line; 3775 } else { 3776 return -1; 3777 } 3778 } 3779 3780 /** 3781 * Return the current column number. 3782 */ 3783 public int getColumnNumber() { 3784 if (column > 0) { 3785 return column; 3786 } else { 3787 return -1; 3788 } 3789 } 3790 3791 // //////////////////////////////////////////////////////////////////// 3792 // High-level I/O. 3793 // //////////////////////////////////////////////////////////////////// 3794 3795 /** 3796 * Read a single character from the readBuffer. 3797 * <p> 3798 * The readDataChunk () method maintains the buffer. 3799 * <p> 3800 * If we hit the end of an entity, try to pop the stack and keep going. 3801 * <p> 3802 * (This approach doesn't really enforce XML's rules about entity 3803 * boundaries, but this is not currently a validating parser). 3804 * <p> 3805 * This routine also attempts to keep track of the current position in 3806 * external entities, but it's not entirely accurate. 3807 * 3808 * @return The next available input character. 3809 * @see #unread (char) 3810 * @see #readDataChunk 3811 * @see #readBuffer 3812 * @see #line 3813 * @return The next character from the current input source. 3814 */ 3815 private char readCh() throws SAXException, IOException { 3816 // As long as there's nothing in the 3817 // read buffer, try reading more data 3818 // (for an external entity) or popping 3819 // the entity stack (for either). 
3820 while (readBufferPos >= readBufferLength) { 3821 switch (sourceType) { 3822 case INPUT_READER: 3823 readDataChunk(); 3824 while (readBufferLength < 1) { 3825 popInput(); 3826 if (readBufferLength < 1) { 3827 readDataChunk(); 3828 } 3829 } 3830 break; 3831 3832 default: 3833 3834 popInput(); 3835 break; 3836 } 3837 } 3838 3839 char c = readBuffer[readBufferPos++]; 3840 advanceLocation(); 3841 // copied from fi.iki.hsivonen.htmlparser 3842 if ((c & 0xFC00) == 0xDC00) { 3843 // Got a low surrogate. See if prev was high surrogate 3844 if ((prev & 0xFC00) == 0xD800) { 3845 int intVal = (prev << 10) + c + SURROGATE_OFFSET; 3846 if (isNonCharacter(intVal)) { 3847 handler.warn("Astral non-character."); 3848 } 3849 if (isAstralPrivateUse(intVal)) { 3850 warnAboutPrivateUseChar(); 3851 } 3852 } else { 3853 fatal("Unmatched low surrogate."); 3854 } 3855 prev = c; 3856 } else { 3857 // see if there was a lone high surrogate 3858 if ((prev & 0xFC00) == 0xD800) { 3859 fatal("Unmatched high surrogate."); 3860 } 3861 } 3862 3863 if (c == '\n') { 3864 nextCharOnNewLine = true; 3865 } else { 3866 if (c == '<') { 3867 /* the most common return to parseContent () ... NOP */ 3868 } else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD) 3869 || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) && xmlVersion == XML_11)) { 3870 fatal("illegal XML character U+" + Integer.toHexString(c)); 3871 } else if (c >= '\u007F' && c <= '\u009F') // 2006-04-25 hsivonen 3872 { 3873 handler.warn("Saw a control character: U+00" 3874 + Integer.toHexString(c) + "."); 3875 } 3876 3877 if (isPrivateUse(c)) { 3878 warnAboutPrivateUseChar(); 3879 } 3880 // If we're in the DTD and in a context where PEs get expanded, 3881 // do so ... 1/14/2000 errata identify those contexts. There 3882 // are also spots in the internal subset where PE refs are fatal 3883 // errors, hence yet another flag. 
3884 else if (c == '%' && expandPE) { 3885 if (peIsError) { 3886 fatal("PE reference within decl in internal subset."); 3887 } 3888 parsePEReference(); 3889 return readCh(); 3890 } 3891 } 3892 3893 return c; 3894 } 3895 3896 /** 3897 * Push a single character back onto the current input stream. 3898 * <p> 3899 * This method usually pushes the character back onto the readBuffer. 3900 * <p> 3901 * I don't think that this would ever be called with readBufferPos = 0, 3902 * because the methods always reads a character before unreading it, but 3903 * just in case, I've added a boundary condition. 3904 * 3905 * @param c 3906 * The character to push back. 3907 * @see #readCh 3908 * @see #unread (char[]) 3909 * @see #readBuffer 3910 */ 3911 private void unread(char c) throws SAXException { 3912 rollbackLocation(); 3913 if (readBufferPos > 0) { 3914 readBuffer[--readBufferPos] = c; 3915 } else { 3916 pushString(null, new Character(c).toString()); 3917 } 3918 } 3919 3920 /** 3921 * 3922 */ 3923 private void rollbackLocation() { 3924 assert (column != columnPrev) || (line != linePrev); 3925 if (column == 1) { 3926 nextCharOnNewLine = true; 3927 } 3928 line = linePrev; 3929 column = columnPrev; 3930 } 3931 3932 /** 3933 * Push a char array back onto the current input stream. 3934 * <p> 3935 * NOTE: you must <em>never</em> push back characters that you haven't 3936 * actually read: use pushString () instead. 3937 * 3938 * @see #readCh 3939 * @see #unread (char) 3940 * @see #readBuffer 3941 * @see #pushString 3942 */ 3943 private void unread(char[] ch, int length) throws SAXException { 3944 if (length < readBufferPos) { 3945 readBufferPos -= length; 3946 } else { 3947 pushCharArray(null, ch, 0, length); 3948 } 3949 } 3950 3951 /** 3952 * Push, or skip, a new external input source. The source will be some kind 3953 * of parsed entity, such as a PE (including the external DTD subset) or 3954 * content for the body. 
3955 * 3956 * @param url 3957 * The java.net.URL object for the entity. 3958 * @see SAXDriver#resolveEntity 3959 * @see #pushString 3960 * @see #sourceType 3961 * @see #pushInput 3962 * @see #detectEncoding 3963 * @see #sourceType 3964 * @see #readBuffer 3965 */ 3966 private void pushURL(boolean isPE, String ename, ExternalIdentifiers ids, 3967 Reader aReader, InputStream aStream, String aEncoding, 3968 boolean doResolve) throws SAXException, IOException { 3969 // removed boolean ignoreEncoding -- 2006-02-03 hsivonen 3970 String systemId; 3971 InputSource source; 3972 InputSource scratch = new InputSource(); 3973 3974 if (!isPE) { 3975 dataBufferFlush(); 3976 } 3977 3978 scratch.setPublicId(ids.publicId); 3979 scratch.setSystemId(ids.systemId); 3980 3981 // See if we should skip or substitute the entity. 3982 // If we're not skipping, resolving reports startEntity() 3983 // and updates the (handler's) stack of URIs. 3984 if (doResolve) { 3985 // assert (stream == null && reader == null && encoding == null) 3986 source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri); 3987 if (source == null) { 3988 handler.warn("skipping entity: " + ename); 3989 handler.skippedEntity(ename); 3990 if (isPE) { 3991 skippedPE = true; 3992 } 3993 return; 3994 } 3995 3996 // we might be using alternate IDs/encoding 3997 systemId = source.getSystemId(); 3998 // The following warning and setting systemId was deleted bcause 3999 // the application has the option of not setting systemId 4000 // provided that it has set the characte/byte stream. 
4001 /* 4002 * if (systemId == null) { handler.warn ("missing system ID, using " + 4003 * ids.systemId); systemId = ids.systemId; } 4004 */ 4005 } else { 4006 // "[document]", or "[dtd]" via getExternalSubset() 4007 scratch.setCharacterStream(aReader); 4008 scratch.setByteStream(aStream); 4009 scratch.setEncoding(aEncoding); 4010 source = scratch; 4011 systemId = ids.systemId; 4012 if (handler.stringInterning) { 4013 handler.startExternalEntity(ename, systemId, 4014 "[document]" == ename); 4015 } else { 4016 handler.startExternalEntity(ename, systemId, 4017 "[document]".equals(ename)); 4018 } 4019 } 4020 4021 // Push the existing status. 4022 pushInput(ename); 4023 4024 // Create a new read buffer. 4025 // (Note the four-character margin) 4026 readBuffer = new char[READ_BUFFER_MAX + 4]; 4027 readBufferPos = 0; 4028 readBufferLength = 0; 4029 readBufferOverflow = -1; 4030 is = null; 4031 reader = null; 4032 line = 0; 4033 column = 1; 4034 linePrev = 0; 4035 columnPrev = 1; 4036 nextCharOnNewLine = true; 4037 currentByteCount = 0; 4038 4039 // If there's an explicit character stream, just 4040 // ignore encoding declarations. 4041 if (source.getCharacterStream() != null) { 4042 sourceType = INPUT_READER; 4043 this.reader = source.getCharacterStream(); 4044 // swallow UTF-8 BOM -- 2006-02-03 hsivonen 4045 if ("UTF-8".equalsIgnoreCase(source.getEncoding())) { 4046 char bom = readCh(); 4047 if (bom != '\uFEFF') { 4048 unread(bom); 4049 } 4050 } 4051 tryEncodingDecl(source.getEncoding() == null ? "" 4052 : source.getEncoding()); 4053 return; 4054 } 4055 4056 // Else we handle the conversion, and need to ensure 4057 // it's done right. 4058 if (source.getByteStream() != null) { 4059 is = source.getByteStream(); 4060 } else { 4061 // Stop -- 2006-11-10 hsivonen 4062 fatal("The entity resolver didn't properly resolve the entity."); 4063 } 4064 4065 // If we get to here, there must be 4066 // an InputStream available. 
4067 if (!is.markSupported()) { 4068 is = new BufferedInputStream(is); 4069 } 4070 4071 // Zapped bogus external encoding label code -- 2006-11-10 hsivonen 4072 4073 // if we got an external encoding label, use it ... 4074 if (source.getEncoding() != null) { 4075 draconianInputStreamReader(source.getEncoding(), is, false); 4076 if ("UTF-8".equalsIgnoreCase(source.getEncoding())) { 4077 char bom = readCh(); 4078 if (bom != '\uFEFF') { 4079 unread(bom); 4080 } 4081 } 4082 tryEncodingDecl(source.getEncoding()); 4083 // ... else autodetect from first bytes. 4084 } else { 4085 detectEncoding(); 4086 // Read any XML or text declaration. 4087 String enc = tryEncodingDecl(null); 4088 if (enc == null && "UTF-32" == characterEncoding) { 4089 fatal("UTF-32 was sniffed from the BOM, but there was no matching encoding declaration. The omission of explicit encoding declaration is only allowed with UTF-8 and UTF-16."); 4090 } 4091 } 4092 } 4093 4094 /** 4095 * Check for an encoding declaration. This is the second part of the XML 4096 * encoding autodetection algorithm, relying on detectEncoding to get to the 4097 * point that this part can read any encoding declaration in the document 4098 * (using only US-ASCII characters). 4099 * 4100 * <p> 4101 * Because this part starts to fill parser buffers with this data, it's 4102 * tricky to setup a reader so that Java's built-in decoders can be used for 4103 * the character encodings that aren't built in to this parser (such as 4104 * EUC-JP, KOI8-R, Big5, etc). 4105 * 4106 * @return any encoding in the declaration, uppercased; or null 4107 * @see detectEncoding 4108 */ 4109 private String tryEncodingDecl(String encoding) throws SAXException, 4110 IOException { 4111 // Read the XML/text declaration. 
4112 if (tryRead("<?xml")) { 4113 if (tryWhitespace()) { 4114 if (inputStack.size() > 0) { 4115 return parseTextDecl(encoding); 4116 } else { 4117 return parseXMLDecl(encoding); 4118 } 4119 } else { 4120 // <?xml-stylesheet ...?> or similar 4121 unread('l'); 4122 unread('m'); 4123 unread('x'); 4124 unread('?'); 4125 unread('<'); 4126 } 4127 } 4128 // 2006-02-03 hsivonen 4129 warnAboutLackOfEncodingDecl(encoding); 4130 return null; 4131 } 4132 4133 /** 4134 * @param characterEncoding 4135 * @throws SAXException 4136 */ 4137 private void warnAboutLackOfEncodingDecl(String encoding) 4138 throws SAXException { 4139 if (!(encoding == null || "".equals(encoding) 4140 || "UTF-8".equalsIgnoreCase(encoding) || "UTF-16".equalsIgnoreCase(encoding))) { 4141 handler.warn("External encoding information specified a non-UTF-8/non-UTF-16 encoding (" 4142 + encoding 4143 + "), but there was no matching internal encoding declaration. The well-formedness status of this document may change when decoupled from the external encoding information."); 4144 } 4145 } 4146 4147 /** 4148 * Attempt to detect the encoding of an entity. 4149 * <p> 4150 * The trick here (as suggested in the XML standard) is that any entity not 4151 * in UTF-8, or in UCS-2 with a byte-order mark, <b>must</b> begin with an 4152 * XML declaration or an encoding declaration; we simply have to look for 4153 * "<?xml" in various encodings. 4154 * <p> 4155 * This method has no way to distinguish among 8-bit encodings. Instead, it 4156 * sets up for UTF-8, then (possibly) revises its assumption later in 4157 * setupDecoding (). Any ASCII-derived 8-bit encoding should work, but most 4158 * will be rejected later by setupDecoding (). 
4159 * 4160 * @see #tryEncoding (byte[], byte, byte, byte, byte) 4161 * @see #tryEncoding (byte[], byte, byte) 4162 * @see #setupDecoding 4163 */ 4164 private void detectEncoding() throws SAXException, IOException { 4165 byte[] signature = new byte[4]; 4166 4167 // Read the first four bytes for 4168 // autodetection. 4169 is.mark(4); 4170 is.read(signature); 4171 is.reset(); 4172 4173 // 4174 // FIRST: four byte encodings (who uses these?) 4175 // 4176 if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, (byte) 0x00, 4177 (byte) 0x3c)) { 4178 // UCS-4 must begin with "<?xml" 4179 // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234) 4180 // "UTF-32BE" 4181 draconianInputStreamReader("UTF-32BE", is, false); 4182 } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, 4183 (byte) 0x00, (byte) 0x00)) { 4184 // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321) 4185 // "UTF-32LE" 4186 draconianInputStreamReader("UTF-32LE", is, false); 4187 } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, 4188 (byte) 0x3c, (byte) 0x00)) { 4189 // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143) 4190 fatal("Unsupported 32-bit encoding. (XML processors are only required to support UTF-8 and UTF-16.)"); // 2006-02-03 4191 // hsivonen 4192 } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, 4193 (byte) 0x00, (byte) 0x00)) { 4194 // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421) 4195 fatal("Unsupported 32-bit encoding. 
(XML processors are only required to support UTF-8 and UTF-16.)"); // 2006-02-03 4196 // hsivonen 4197 } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, 4198 (byte) 0xfe, (byte) 0xff)) { 4199 // 00 00 fe ff UCS_4_1234 (with BOM) 4200 is.read(); 4201 is.read(); 4202 is.read(); 4203 is.read(); 4204 draconianInputStreamReader("UTF-32BE", is, false, "UTF-32"); 4205 } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, 4206 (byte) 0x00, (byte) 0x00)) { 4207 // ff fe 00 00 UCS_4_4321 (with BOM) 4208 is.read(); 4209 is.read(); 4210 is.read(); 4211 is.read(); 4212 draconianInputStreamReader("UTF-32LE", is, false, "UTF-32"); 4213 } 4214 // SECOND: two byte encodings 4215 // note ... with 1/14/2000 errata the XML spec identifies some 4216 // more "broken UTF-16" autodetection cases, with no XML decl, 4217 // which we don't handle here (that's legal too). 4218 // 4219 else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff)) { 4220 // UCS-2 with a byte-order marker. (UTF-16) 4221 // 0xfe 0xff: UCS-2, big-endian (12) 4222 is.read(); 4223 is.read(); 4224 draconianInputStreamReader("UTF-16BE", is, false, "UTF-16"); 4225 } else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe)) { 4226 // UCS-2 with a byte-order marker. 
(UTF-16) 4227 // 0xff 0xfe: UCS-2, little-endian (21) 4228 is.read(); 4229 is.read(); 4230 draconianInputStreamReader("UTF-16LE", is, false, "UTF-16"); 4231 } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, 4232 (byte) 0x00, (byte) 0x3f)) { 4233 // UTF-16BE (otherwise, malformed UTF-16) 4234 // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark 4235 fatal("no byte-order mark for UTF-16 entity"); // s/UCS-2/UTF-16/ 4236 // -- 2006-02-03 4237 // hsivonen 4238 } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, 4239 (byte) 0x3f, (byte) 0x00)) { 4240 // UTF-16LE (otherwise, malformed UTF-16) 4241 // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark 4242 fatal("no byte-order mark for UTF-16 entity"); // s/UCS-2/UTF-16/ 4243 // -- 2006-02-03 4244 // hsivonen 4245 } 4246 // 4247 // THIRD: EBCDIC 4248 // 4249 else if (tryEncoding(signature, (byte) 0x4c, (byte) 0x6f, (byte) 0xa7, 4250 (byte) 0x94)) { 4251 // 4c 6f a7 94 ... we don't understand EBCDIC flavors 4252 fatal("Unsupported EBCDIC encoding. (XML processors are only required to support UTF-8 and UTF-16.)"); 4253 } 4254 // 4255 // FOURTH: ASCII-derived encodings, fixed and variable lengths 4256 // 4257 else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f, (byte) 0x78, 4258 (byte) 0x6d)) { 4259 // ASCII derived 4260 // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING) 4261 characterEncoding = null; 4262 prefetchASCIIEncodingDecl(); 4263 } else if (signature[0] == (byte) 0xef && signature[1] == (byte) 0xbb 4264 && signature[2] == (byte) 0xbf) { 4265 // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text) 4266 // this un-needed notion slipped into XML 2nd ed through a 4267 // "non-normative" erratum; now required by MSFT and UDDI, 4268 // and E22 made it normative. 
4269 is.read(); 4270 is.read(); 4271 is.read(); 4272 draconianInputStreamReader("UTF-8", is, false); 4273 } else { 4274 // (default) UTF-8 without encoding/XML declaration 4275 draconianInputStreamReader("UTF-8", is, false); 4276 } 4277 } 4278 4279 /** 4280 * Check for a four-byte signature. 4281 * <p> 4282 * Utility routine for detectEncoding (). 4283 * <p> 4284 * Always looks for some part of "<?XML" in a specific encoding. 4285 * 4286 * @param sig 4287 * The first four bytes read. 4288 * @param b1 4289 * The first byte of the signature 4290 * @param b2 4291 * The second byte of the signature 4292 * @param b3 4293 * The third byte of the signature 4294 * @param b4 4295 * The fourth byte of the signature 4296 * @see #detectEncoding 4297 */ 4298 private static boolean tryEncoding(byte[] sig, byte b1, byte b2, byte b3, 4299 byte b4) { 4300 return (sig[0] == b1 && sig[1] == b2 && sig[2] == b3 && sig[3] == b4); 4301 } 4302 4303 /** 4304 * Check for a two-byte signature. 4305 * <p> 4306 * Looks for a UCS-2 byte-order mark. 4307 * <p> 4308 * Utility routine for detectEncoding (). 4309 * 4310 * @param sig 4311 * The first four bytes read. 4312 * @param b1 4313 * The first byte of the signature 4314 * @param b2 4315 * The second byte of the signature 4316 * @see #detectEncoding 4317 */ 4318 private static boolean tryEncoding(byte[] sig, byte b1, byte b2) { 4319 return ((sig[0] == b1) && (sig[1] == b2)); 4320 } 4321 4322 /** 4323 * This method pushes a string back onto input. 4324 * <p> 4325 * It is useful either as the expansion of an internal entity, or for 4326 * backtracking during the parse. 4327 * <p> 4328 * Call pushCharArray () to do the actual work. 4329 * 4330 * @param s 4331 * The string to push back onto input. 4332 * @see #pushCharArray 4333 */ 4334 private void pushString(String ename, String s) throws SAXException { 4335 char[] ch = s.toCharArray(); 4336 pushCharArray(ename, ch, 0, ch.length); 4337 } 4338 4339 /** 4340 * Push a new internal input source. 
4341 * <p> 4342 * This method is useful for expanding an internal entity, or for unreading 4343 * a string of characters. It creates a new readBuffer containing the 4344 * characters in the array, instead of characters converted from an input 4345 * byte stream. 4346 * 4347 * @param ch 4348 * The char array to push. 4349 * @see #pushString 4350 * @see #pushURL 4351 * @see #readBuffer 4352 * @see #sourceType 4353 * @see #pushInput 4354 */ 4355 private void pushCharArray(String ename, char[] ch, int start, int length) 4356 throws SAXException { 4357 // Push the existing status 4358 pushInput(ename); 4359 if (ename != null && doReport) { 4360 dataBufferFlush(); 4361 handler.startInternalEntity(ename); 4362 } 4363 sourceType = INPUT_INTERNAL; 4364 readBuffer = ch; 4365 readBufferPos = start; 4366 readBufferLength = length; 4367 readBufferOverflow = -1; 4368 } 4369 4370 /** 4371 * Save the current input source onto the stack. 4372 * <p> 4373 * This method saves all of the global variables associated with the current 4374 * input source, so that they can be restored when a new input source has 4375 * finished. It also tests for entity recursion. 4376 * <p> 4377 * The method saves the following global variables onto a stack using a 4378 * fixed-length array: 4379 * <ol> 4380 * <li>sourceType 4381 * <li>externalEntity 4382 * <li>readBuffer 4383 * <li>readBufferPos 4384 * <li>readBufferLength 4385 * <li>line 4386 * <li>characterEncoding 4387 * </ol> 4388 * 4389 * @param ename 4390 * The name of the entity (if any) causing the new input. 4391 * @see #popInput 4392 * @see #sourceType 4393 * @see #externalEntity 4394 * @see #readBuffer 4395 * @see #readBufferPos 4396 * @see #readBufferLength 4397 * @see #line 4398 * @see #characterEncoding 4399 */ 4400 private void pushInput(String ename) throws SAXException { 4401 // Protect against billion laughs -- 2006-12-28 hsivonen 4402 if (entityStack.size() > 16) { 4403 fatal("Entity recursion too deep. 
Stopping to protect against denial of service attacks."); 4404 } 4405 4406 // Check for entity recursion. 4407 if (ename != null) { 4408 Iterator<String> entities = entityStack.iterator(); 4409 while (entities.hasNext()) { 4410 String e = entities.next(); 4411 if (e != null && e == ename) { 4412 fatal("recursive reference to entity", ename, null); 4413 } 4414 } 4415 } 4416 entityStack.addLast(ename); 4417 4418 // Don't bother if there is no current input. 4419 if (sourceType == INPUT_NONE) { 4420 return; 4421 } 4422 4423 // Set up a snapshot of the current 4424 // input source. 4425 Input input = new Input(); 4426 4427 input.sourceType = sourceType; 4428 input.readBuffer = readBuffer; 4429 input.readBufferPos = readBufferPos; 4430 input.readBufferLength = readBufferLength; 4431 input.line = line; 4432 input.linePrev = linePrev; 4433 input.charecterEncoding = characterEncoding; 4434 input.readBufferOverflow = readBufferOverflow; 4435 input.is = is; 4436 input.currentByteCount = currentByteCount; 4437 input.column = column; 4438 input.columnPrev = columnPrev; 4439 input.nextCharOnNewLine = nextCharOnNewLine; 4440 input.reader = reader; 4441 input.prev = prev; 4442 input.normalizationChecker = normalizationChecker; 4443 input.characterHandler = characterHandler; 4444 characterHandler = null; 4445 4446 // Push it onto the stack. 4447 inputStack.addLast(input); 4448 } 4449 4450 /** 4451 * Restore a previous input source. 4452 * <p> 4453 * This method restores all of the global variables associated with the 4454 * current input source. 4455 * 4456 * @exception java.io.EOFException 4457 * If there are no more entries on the input stack. 
4458 * @see #pushInput 4459 * @see #sourceType 4460 * @see #readBuffer 4461 * @see #readBufferPos 4462 * @see #readBufferLength 4463 * @see #line 4464 * @see #characterEncoding 4465 */ 4466 private void popInput() throws SAXException, IOException { 4467 String ename = entityStack.removeLast(); 4468 4469 if (ename != null && doReport) { 4470 dataBufferFlush(); 4471 } 4472 switch (sourceType) { 4473 case INPUT_READER: 4474 handler.endExternalEntity(ename); 4475 reader.close(); 4476 break; 4477 case INPUT_INTERNAL: 4478 if (ename != null && doReport) { 4479 handler.endInternalEntity(ename); 4480 } 4481 break; 4482 } 4483 if (characterHandler != null) { 4484 characterHandler.end(); 4485 } 4486 if (normalizationChecker != null) { 4487 normalizationChecker.end(); 4488 } 4489 4490 // Throw an EOFException if there 4491 // is nothing else to pop. 4492 if (inputStack.isEmpty()) { 4493 throw new EOFException("no more input"); 4494 } 4495 4496 Input input = inputStack.removeLast(); 4497 4498 sourceType = input.sourceType; 4499 readBuffer = input.readBuffer; 4500 readBufferPos = input.readBufferPos; 4501 readBufferLength = input.readBufferLength; 4502 line = input.line; 4503 linePrev = input.linePrev; 4504 characterEncoding = input.charecterEncoding; 4505 readBufferOverflow = input.readBufferOverflow; 4506 is = input.is; 4507 currentByteCount = input.currentByteCount; 4508 column = input.column; 4509 columnPrev = input.columnPrev; 4510 nextCharOnNewLine = input.nextCharOnNewLine; 4511 reader = input.reader; 4512 prev = input.prev; 4513 normalizationChecker = input.normalizationChecker; 4514 characterHandler = input.characterHandler; 4515 } 4516 4517 /** 4518 * Return true if we can read the expected character. 4519 * <p> 4520 * Note that the character will be removed from the input stream on success, 4521 * but will be put back on failure. Do not attempt to read the character 4522 * again if the method succeeds. 
4523 * 4524 * @param delim 4525 * The character that should appear next. For a insensitive 4526 * match, you must supply this in upper-case. 4527 * @return true if the character was successfully read, or false if it was 4528 * not. 4529 * @see #tryRead (String) 4530 */ 4531 private boolean tryRead(char delim) throws SAXException, IOException { 4532 char c; 4533 4534 // Read the character 4535 c = readCh(); 4536 4537 // Test for a match, and push the character 4538 // back if the match fails. 4539 if (c == delim) { 4540 return true; 4541 } else { 4542 unread(c); 4543 return false; 4544 } 4545 } 4546 4547 /** 4548 * Return true if we can read the expected string. 4549 * <p> 4550 * This is simply a convenience method. 4551 * <p> 4552 * Note that the string will be removed from the input stream on success, 4553 * but will be put back on failure. Do not attempt to read the string again 4554 * if the method succeeds. 4555 * <p> 4556 * This method will push back a character rather than an array whenever 4557 * possible (probably the majority of cases). 4558 * 4559 * @param delim 4560 * The string that should appear next. 4561 * @return true if the string was successfully read, or false if it was not. 4562 * @see #tryRead (char) 4563 */ 4564 private boolean tryRead(String delim) throws SAXException, IOException { 4565 return tryRead(delim.toCharArray()); 4566 } 4567 4568 private boolean tryRead(char[] ch) throws SAXException, IOException { 4569 char c; 4570 4571 // Compare the input, character- 4572 // by character. 
4573 int saveLine = line; 4574 int saveColumn = column; 4575 int saveLinePrev = linePrev; 4576 int saveColumnPrev = columnPrev; 4577 boolean saveNextCharOnNewLine = nextCharOnNewLine; 4578 4579 for (int i = 0; i < ch.length; i++) { 4580 c = readCh(); 4581 if (c != ch[i]) { 4582 unread(c); 4583 if (i != 0) { 4584 unread(ch, i); 4585 } 4586 line = saveLine; 4587 column = saveColumn; 4588 linePrev = saveLinePrev; 4589 columnPrev = saveColumnPrev; 4590 nextCharOnNewLine = saveNextCharOnNewLine; 4591 return false; 4592 } 4593 } 4594 return true; 4595 } 4596 4597 /** 4598 * Return true if we can read some whitespace. 4599 * <p> 4600 * This is simply a convenience method. 4601 * <p> 4602 * This method will push back a character rather than an array whenever 4603 * possible (probably the majority of cases). 4604 * 4605 * @return true if whitespace was found. 4606 */ 4607 private boolean tryWhitespace() throws SAXException, IOException { 4608 char c; 4609 c = readCh(); 4610 if (isWhitespace(c)) { 4611 skipWhitespace(); 4612 return true; 4613 } else { 4614 unread(c); 4615 return false; 4616 } 4617 } 4618 4619 private void parseUntil(char[] delim) throws SAXException, IOException { 4620 char c; 4621 int startLine = line; 4622 4623 try { 4624 while (!tryRead(delim)) { 4625 c = readCh(); 4626 dataBufferAppend(c); 4627 } 4628 } catch (EOFException e) { 4629 fatal("end of input while looking for delimiter " 4630 + "(started on line " + startLine + ')', null, new String( 4631 delim)); 4632 } 4633 } 4634 4635 // //////////////////////////////////////////////////////////////////// 4636 // Low-level I/O. 4637 // //////////////////////////////////////////////////////////////////// 4638 4639 /** 4640 * Prefetch US-ASCII XML/text decl from input stream into read buffer. 4641 * Doesn't buffer more than absolutely needed, so that when an encoding decl 4642 * says we need to create an InputStreamReader, we can discard our buffer 4643 * and reset(). 
Caller knows the first chars of the decl exist in the input 4644 * stream. 4645 */ 4646 private void prefetchASCIIEncodingDecl() throws SAXException, IOException { 4647 int ch; 4648 readBufferPos = readBufferLength = 0; 4649 4650 is.mark(readBuffer.length); 4651 while (true) { 4652 ch = is.read(); 4653 readBuffer[readBufferLength++] = (char) ch; 4654 switch (ch) { 4655 case (int) '>': 4656 return; 4657 case -1: 4658 fatal( 4659 "file ends before end of XML or encoding declaration.", 4660 null, "?>"); 4661 } 4662 if (readBuffer.length == readBufferLength) { 4663 fatal("unfinished XML or encoding declaration"); 4664 } 4665 } 4666 } 4667 4668 /** 4669 * Read a chunk of data from an external input source. 4670 * <p>This is simply a front-end that fills the rawReadBuffer 4671 * with bytes, then calls the appropriate encoding handler. 4672 * @see #characterEncoding 4673 * @see #rawReadBuffer 4674 * @see #readBuffer 4675 * @see #filterCR 4676 * @see #copyUtf8ReadBuffer 4677 * @see #copyIso8859_1ReadBuffer 4678 * @see #copyUcs_2ReadBuffer 4679 * @see #copyUcs_4ReadBuffer 4680 */ 4681 private void readDataChunk() throws SAXException, IOException { 4682 int count; 4683 4684 // See if we have any overflow (filterCR sets for CR at end) 4685 if (readBufferOverflow > -1) { 4686 readBuffer[0] = (char) readBufferOverflow; 4687 readBufferOverflow = -1; 4688 readBufferPos = 1; 4689 sawCR = true; 4690 } else { 4691 readBufferPos = 0; 4692 sawCR = false; 4693 } 4694 4695 try { 4696 count = reader.read(readBuffer, readBufferPos, READ_BUFFER_MAX 4697 - readBufferPos); 4698 } catch (CharacterCodingException cce) { 4699 // 2006-04-25 hsivonen 4700 fatal("Input data does not conform to the input encoding. 
The input encoding was " 4701 + characterEncoding + "."); 4702 return; // never happens 4703 } 4704 if (characterHandler != null && count > 0) { 4705 characterHandler.characters(readBuffer, readBufferPos, count); 4706 } 4707 if (normalizationChecker != null && count > 0) { 4708 normalizationChecker.characters(readBuffer, readBufferPos, count); 4709 } 4710 if (count < 0) { 4711 readBufferLength = readBufferPos; 4712 } else { 4713 readBufferLength = readBufferPos + count; 4714 } 4715 if (readBufferLength > 0) { 4716 filterCR(count >= 0); 4717 } 4718 sawCR = false; 4719 } 4720 4721 /** 4722 * Filter carriage returns in the read buffer. 4723 * CRLF becomes LF; CR becomes LF. 4724 * @param moreData true iff more data might come from the same source 4725 * @see #readDataChunk 4726 * @see #readBuffer 4727 * @see #readBufferOverflow 4728 */ 4729 private void filterCR(boolean moreData) { 4730 int i, j; 4731 4732 readBufferOverflow = -1; 4733 4734 loop: for (i = j = readBufferPos; j < readBufferLength; i++, j++) { 4735 switch (readBuffer[j]) { 4736 case '\r': 4737 if (j == readBufferLength - 1) { 4738 if (moreData) { 4739 readBufferOverflow = '\r'; 4740 readBufferLength--; 4741 } else // CR at end of buffer 4742 { 4743 readBuffer[i++] = '\n'; 4744 } 4745 break loop; 4746 } else if (readBuffer[j + 1] == '\n') { 4747 j++; 4748 } 4749 readBuffer[i] = '\n'; 4750 break; 4751 4752 case '\n': 4753 default: 4754 readBuffer[i] = readBuffer[j]; 4755 break; 4756 } 4757 } 4758 readBufferLength = i; 4759 } 4760 4761 private void warnAboutPrivateUseChar() throws SAXException { 4762 if (!alreadyWarnedAboutPrivateUseCharacters) { 4763 handler.warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. 
(Charmod C073)"); 4764 alreadyWarnedAboutPrivateUseCharacters = true; 4765 } 4766 } 4767 4768 // copied from fi.iki.hsivonen.htmlparser 4769 4770 private boolean isPrivateUse(char c) { 4771 return c >= '\uE000' && c <= '\uF8FF'; 4772 } 4773 4774 private boolean isPrivateUse(int c) { 4775 return (c >= 0xE000 && c <= 0xF8FF) || (c >= 0xF0000 && c <= 0xFFFFD) 4776 || (c >= 0x100000 && c <= 0x10FFFD); 4777 } 4778 4779 private boolean isAstralPrivateUse(int c) { 4780 return (c >= 0xF0000 && c <= 0xFFFFD) 4781 || (c >= 0x100000 && c <= 0x10FFFD); 4782 } 4783 4784 private boolean isNonCharacter(int c) { 4785 return (c & 0xFFFE) == 0xFFFE; 4786 } 4787 4788 ////////////////////////////////////////////////////////////////////// 4789 // Local Variables. 4790 ////////////////////////////////////////////////////////////////////// 4791 4792 /** 4793 * Re-initialize the variables for each parse. 4794 * @throws SAXException 4795 */ 4796 private void initializeVariables() throws SAXException { 4797 prev = '\u0000'; 4798 // First line 4799 line = 0; 4800 column = 1; 4801 linePrev = 0; 4802 columnPrev = 1; 4803 nextCharOnNewLine = true; 4804 4805 // Set up the buffers for data and names 4806 dataBufferPos = 0; 4807 dataBuffer = new char[DATA_BUFFER_INITIAL]; 4808 nameBufferPos = 0; 4809 nameBuffer = new char[NAME_BUFFER_INITIAL]; 4810 4811 // Set up the DTD hash tables 4812 elementInfo = new HashMap<String, ElementDecl>(); 4813 entityInfo = new HashMap<String, EntityInfo>(); 4814 notationInfo = new HashMap<String, String>(); 4815 skippedPE = false; 4816 4817 // Set up the variables for the current 4818 // element context. 
4819 currentElement = null; 4820 currentElementContent = CONTENT_UNDECLARED; 4821 4822 // Set up the input variables 4823 sourceType = INPUT_NONE; 4824 inputStack = new LinkedList<Input>(); 4825 entityStack = new LinkedList<String>(); 4826 tagAttributePos = 0; 4827 tagAttributes = new String[100]; 4828 rawReadBuffer = new byte[READ_BUFFER_MAX]; 4829 readBufferOverflow = -1; 4830 4831 inLiteral = false; 4832 expandPE = false; 4833 peIsError = false; 4834 4835 doReport = false; 4836 4837 inCDATA = false; 4838 4839 symbolTable = new Object[SYMBOL_TABLE_LENGTH][]; 4840 4841 if (handler.checkNormalization) { 4842 normalizationChecker = new NormalizationChecker(handler); 4843 normalizationChecker.setErrorHandler(handler.getErrorHandler()); 4844 normalizationChecker.start(); 4845 } else { 4846 normalizationChecker = null; 4847 } 4848 if (handler.characterHandler != null) { 4849 characterHandler = handler.characterHandler; 4850 handler.characterHandler = null; 4851 characterHandler.start(); 4852 } else { 4853 characterHandler = null; 4854 } 4855 } 4856 4857 static class ExternalIdentifiers { 4858 4859 String publicId; 4860 4861 String systemId; 4862 4863 String baseUri; 4864 4865 ExternalIdentifiers() { 4866 } 4867 4868 ExternalIdentifiers(String publicId, String systemId, String baseUri) { 4869 this.publicId = publicId; 4870 this.systemId = systemId; 4871 this.baseUri = baseUri; 4872 } 4873 4874 } 4875 4876 static class EntityInfo { 4877 4878 int type; 4879 4880 ExternalIdentifiers ids; 4881 4882 String value; 4883 4884 String notationName; 4885 4886 } 4887 4888 static class AttributeDecl { 4889 4890 String type; 4891 4892 String value; 4893 4894 int valueType; 4895 4896 String enumeration; 4897 4898 String defaultValue; 4899 4900 } 4901 4902 static class ElementDecl { 4903 4904 int contentType; 4905 4906 String contentModel; 4907 4908 HashMap<String, AttributeDecl> attributes; 4909 4910 } 4911 4912 static class Input { 4913 CharacterHandler characterHandler; 4914 4915 
boolean nextCharOnNewLine; 4916 4917 int columnPrev; 4918 4919 int linePrev; 4920 4921 char prev; 4922 4923 int sourceType; 4924 4925 char[] readBuffer; 4926 4927 int readBufferPos; 4928 4929 int readBufferLength; 4930 4931 int line; 4932 4933 String charecterEncoding; 4934 4935 int readBufferOverflow; 4936 4937 InputStream is; 4938 4939 int currentByteCount; 4940 4941 int column; 4942 4943 Reader reader; 4944 4945 NormalizationChecker normalizationChecker; 4946 } 4947 4948 }