|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object
nu.validator.htmlparser.impl.Tokenizer
public final class Tokenizer
An implementation of
http://www.whatwg.org/specs/web-apps/current-work/multipage/section-tokenisation.html
This class implements the Locator
interface. This is not an
incidental implementation detail: Users of this class are encouraged to make
use of the Locator
nature.
By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
can be configured to treat these conditions as fatal or to coerce the infoset
to something that XML 1.0 allows.
Nested Class Summary | |
---|---|
private static class |
Tokenizer.CommentState
|
Field Summary | |
---|---|
private boolean |
alreadyComplainedAboutNonAscii
Used together with nonAsciiProhibited . |
private boolean |
alreadyWarnedAboutPrivateUseCharacters
Keeps track of PUA warnings. |
private char[] |
astralChar
Buffer for expanding astral NCRs. |
private String |
attributeName
The current attribute name. |
private AttributesImpl |
attributes
The attribute holder. |
private char[] |
bmpChar
Buffer for expanding NCRs falling into the Basic Multilingual Plane. |
private XmlViolationPolicy |
bogusXmlnsPolicy
|
private char[] |
buf
The main input buffer that the tokenizer reads from. |
private static int |
BUFFER_GROW_BY
Buffer growth parameter. |
private int |
bufLen
The number of char s in buf that have
meaning. |
private CharacterHandler[] |
characterHandlers
Used for NFC checking if non- null , source code capture,
etc. |
private int |
col
The current column number in the current resource being tokenized. |
private int |
colPrev
|
private XmlViolationPolicy |
commentPolicy
The policy for comments. |
private String |
contentModelElement
The element whose end tag closes the current CDATA or RCDATA element. |
private ContentModelFlag |
contentModelFlag
http://www.whatwg.org/specs/web-apps/current-work/#content2 |
private XmlViolationPolicy |
contentNonXmlCharPolicy
The policy for non-space non-XML characters. |
private XmlViolationPolicy |
contentSpacePolicy
The policy for vertical tab and form feed. |
private int |
cstart
The index of the first char in buf that is
part of a coalesced run of character tokens or -1 if there
is not a current run being coalesced. |
private String |
doctypeName
The name of the current doctype token. |
private boolean |
endTag
true if tokenizing an end tag |
private ErrorHandler |
errorHandler
The error handler. |
private boolean |
escapeFlag
http://www.whatwg.org/specs/web-apps/current-work/#escape |
private boolean |
html4
true when HTML4-specific additional errors are requested. |
private boolean |
html4ModeCompatibleWithXhtml1Schemata
|
private boolean |
inContent
true when in text content or in attribute value. |
private static int |
LEAD_OFFSET
Magic value for UTF-16 operations. |
private static char[] |
LF
Array version of line feed. |
private int |
line
The current line number in the current resource being parsed. |
private int |
linePrev
|
private char[] |
longStrBuf
Buffer for long strings. |
private int |
longStrBufLen
Number of significant char s in longStrBuf . |
private char |
longStrBufPending
If not U+0000, a pending code unit to be appended to longStrBuf . |
private static char[] |
LT_GT
UTF-16 code unit array containing less than and greater than for emitting those characters on certain parse errors. |
private static char[] |
LT_SOLIDUS
UTF-16 code unit array containing less than and solidus for emitting those characters on certain parse errors. |
private boolean |
mappingLangToXmlLang
|
private boolean |
metaBoundaryPassed
Whether the stream is past the first 512 bytes. |
private XmlViolationPolicy |
namePolicy
|
private static Pattern |
NCNAME_PATTERN
|
private boolean |
nextCharOnNewLine
|
private boolean |
nonAsciiProhibited
Whether non-ASCII causes an error. |
private static char[] |
OCTYPE
"octype" as char[] |
private int |
pos
The index of the last char read from buf . |
private char |
prev
The previous char read from the buffer with infoset
alteration applied except for CR. |
private char[] |
prevFour
Lookbehind buffer for magic RCDATA/CDATA escaping. |
private int |
prevFourPtr
Points to the last char written to prevFour . |
private String |
publicId
The SAX public id for the resource being tokenized. |
private String |
publicIdentifier
The public id of the current doctype token. |
private Reader |
reader
The input UTF-16 code unit stream. |
private static char[] |
REPLACEMENT_CHARACTER
Array version of U+FFFD. |
private boolean |
shouldAddAttributes
If false , addAttribute*() are no-ops. |
private static char[] |
SPACE
Array version of space. |
private char[] |
strBuf
Buffer for short identifiers. |
private int |
strBufLen
Number of significant char s in strBuf . |
private static int |
SURROGATE_OFFSET
Magic value for UTF-16 operations. |
private boolean |
swallowBom
|
private String |
systemId
The SAX system id for the resource being tokenized. |
private String |
systemIdentifier
The system id of the current doctype token. |
private String |
tagName
The current tag token name. |
private TokenHandler |
tokenHandler
The token handler. |
private static char[] |
UBLIC
"ublic" as char[] |
private int |
unreadBuffer
Single code unit buffer for reconsuming an input character. |
private static String[] |
VOID_ELEMENTS
Lexically sorted void element names |
private boolean |
wantsComments
Whether comment tokens are emitted. |
private XmlViolationPolicy |
xmlnsPolicy
|
private static char[] |
YSTEM
"ystem" as char[] |
Constructor Summary | |
---|---|
Tokenizer(TokenHandler tokenHandler)
The constructor. |
Method Summary | |
---|---|
private void |
addAttributeWithoutValue()
|
private void |
addAttributeWithValue()
|
void |
addCharacterHandler(CharacterHandler characterHandler)
|
private boolean |
afterAttributeNameState()
After attribute name state |
private void |
afterDoctypeNameState()
After DOCTYPE name state |
private void |
afterDoctypePublicIdentifierState()
After DOCTYPE public identifier state |
private void |
afterDoctypeSystemIdentifierState()
After DOCTYPE system identifier state |
private void |
appendLongStrBuf(char c)
Appends to the larger buffer. |
private void |
appendLongStrBuf(char[] arr)
Appends to the larger buffer. |
private void |
appendStrBuf(char c)
Appends to the smaller buffer. |
private void |
appendStrBufToLongStrBuf()
Append the contents of the smaller buffer to the larger one. |
private void |
appendToComment(char c)
Appends to the larger buffer when it is used to buffer a comment. |
private void |
attributeNameComplete()
|
private boolean |
attributeNameState()
Attribute name state |
private boolean |
attributeValueDoubleQuotedState()
Attribute value (double-quoted) state |
private boolean |
attributeValueSingleQuotedState()
Attribute value (single-quoted) state |
private boolean |
attributeValueUnquotedState()
Attribute value (unquoted) state |
private void |
beforeAttributeNameState()
This method implements a wrapper loop for the attribute-related states to avoid recursion to an arbitrary depth. |
private boolean |
beforeAttributeNameStateImpl()
Before attribute name state |
private boolean |
beforeAttributeValueState()
Before attribute value state |
private void |
beforeDoctypeNameState()
Before DOCTYPE name state |
private void |
beforeDoctypePublicIdentifierState()
Before DOCTYPE public identifier state |
private void |
beforeDoctypeSystemIdentifierState()
Before DOCTYPE system identifier state |
private void |
bogusCommentState()
Bogus comment state |
private void |
bogusDoctypeState()
Bogus DOCTYPE state |
private void |
clearLongStrBuf()
Clears the larger buffer. |
private void |
clearStrBuf()
Clears the smaller buffer. |
private void |
closeTagOpenState()
Close tag open state |
private void |
commentStates()
Comment start state, Comment start dash state, Comment state, Comment end dash state and Comment end state |
private void |
consumeEntity(boolean inAttribute)
Consume entity. Unlike the definition in the spec, this method does not return a value and never requires the caller to backtrack. |
private void |
consumeNCR(boolean inAttribute)
|
private boolean |
currentIsVoid()
|
private void |
dataState()
Data state |
private CharsetDecoder |
decoderFromExternalDeclaration(String encoding)
Initializes a decoder from external decl. |
private void |
doctypeNameState()
DOCTYPE name state |
private void |
doctypePublicIdentifierDoubleQuotedState()
DOCTYPE public identifier (double-quoted) state |
private void |
doctypePublicIdentifierSingleQuotedState()
DOCTYPE public identifier (single-quoted) state |
private void |
doctypeState()
DOCTYPE state |
private void |
doctypeSystemIdentifierDoubleQuotedState()
DOCTYPE system identifier (double-quoted) state |
private void |
doctypeSystemIdentifierSingleQuotedState()
DOCTYPE system identifier (single-quoted) state |
(package private) void |
dontSwallowBom()
|
private void |
emitComment()
Emits the current comment token. |
private void |
emitCurrentTagToken()
|
private void |
emitOrAppend(char[] val,
boolean inAttribute)
|
private void |
emitStrBuf()
Emits the smaller buffer as character tokens. |
private void |
entityDataState()
Entity data state |
private void |
entityInAttributeValueState()
Entity in attribute value state |
private void |
err(String message)
Reports a Parse Error. |
private void |
fatal(String message)
Reports a condition that would make the infoset incompatible with XML 1.0 as fatal. |
private void |
flushChars()
Flushes coalesced character tokens. |
int |
getColumnNumber()
|
XmlViolationPolicy |
getCommentPolicy()
Returns the commentPolicy. |
XmlViolationPolicy |
getContentNonXmlCharPolicy()
Returns the contentNonXmlCharPolicy. |
XmlViolationPolicy |
getContentSpacePolicy()
Returns the contentSpacePolicy. |
int |
getLineNumber()
|
String |
getPublicId()
|
String |
getSystemId()
|
private void |
handleNCRValue(int value,
boolean inAttribute)
|
private boolean |
isAstralPrivateUse(int c)
Tells if the argument is an astral PUA character. |
boolean |
isCheckingNormalization()
Query if checking normalization. |
boolean |
isMappingLangToXmlLang()
Returns the mappingLangToXmlLang. |
private boolean |
isNcname(String str)
|
private boolean |
isNonCharacter(int c)
Tells if the argument is a non-character (works for BMP and astral). |
private boolean |
isPrivateUse(char c)
Tells if the argument is a BMP PUA character. |
private boolean |
lastHyphHyph()
|
private boolean |
lastLtExclHyph()
|
private String |
longStrBufToString()
The larger buffer as a string. |
private void |
markupDeclarationOpenState()
Markup declaration open state |
(package private) AttributesImpl |
newAttributes()
|
(package private) void |
noEncodingDeclared()
|
(package private) void |
notifyAboutMetaBoundary()
|
private void |
parseErrorUnlessPermittedSlash()
|
private char |
read()
Reads the next UTF-16 code unit. |
private void |
resetAttributes()
|
void |
setBogusXmlnsPolicy(XmlViolationPolicy bogusXmlnsPolicy)
Sets the bogusXmlnsPolicy. |
void |
setCheckingNormalization(boolean enable)
Turns NFC checking on or off. |
void |
setCommentPolicy(XmlViolationPolicy commentPolicy)
Sets the commentPolicy. |
void |
setContentModelFlag(ContentModelFlag contentModelFlag,
String contentModelElement)
Sets the content model flag and the associated element name. |
void |
setContentNonXmlCharPolicy(XmlViolationPolicy contentNonXmlCharPolicy)
Sets the contentNonXmlCharPolicy. |
void |
setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy)
Sets the contentSpacePolicy. |
void |
setErrorHandler(ErrorHandler eh)
Sets the error handler. |
void |
setHtml4ModeCompatibleWithXhtml1Schemata(boolean html4ModeCompatibleWithXhtml1Schemata)
Sets the html4ModeCompatibleWithXhtml1Schemata. |
void |
setMappingLangToXmlLang(boolean mappingLangToXmlLang)
Sets the mappingLangToXmlLang. |
void |
setNamePolicy(XmlViolationPolicy namePolicy)
|
void |
setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy)
Sets the xmlnsPolicy. |
private String |
strBufToElementNameString()
|
private String |
strBufToString()
The smaller buffer as a string. |
private void |
tagNameState()
Tag name state |
private void |
tagOpenState()
Tag open state |
private String |
toAsciiLowerCase(String str)
|
void |
tokenize(InputSource is)
Runs the tokenization. |
(package private) void |
turnOnAdditionalHtml4Errors()
|
private void |
unread(char c)
Unreads a code unit so that it is returned the next time read() is called. |
private void |
warn(String message)
Reports a warning |
private void |
warnAboutPrivateUseChar()
Emits a warning about private use characters if the warning has not been emitted yet. |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
private static final Pattern NCNAME_PATTERN
private static final int LEAD_OFFSET
private static final int SURROGATE_OFFSET
private static final char[] LT_GT
private static final char[] LT_SOLIDUS
private static final char[] REPLACEMENT_CHARACTER
private static final char[] SPACE
private static final char[] LF
private static final int BUFFER_GROW_BY
private static final String[] VOID_ELEMENTS
private static final char[] OCTYPE
char[]
private static final char[] UBLIC
char[]
private static final char[] YSTEM
char[]
private final TokenHandler tokenHandler
private ErrorHandler errorHandler
private Reader reader
HtmlInputStreamReader
.
private char[] buf
reader
.
private int pos
char
read from buf
.
private int cstart
char
in buf
that is
part of a coalesced run of character tokens or -1
if there
is not a current run being coalesced.
private int bufLen
char
s in buf
that have
meaning. (The rest of the array is garbage and should not be examined.)
private char prev
char
read from the buffer with infoset
alteration applied except for CR. Used for CRLF normalization and
surrogate pair checking.
private final char[] prevFour
private int prevFourPtr
char
written to prevFour
.
private int unreadBuffer
-1
the next read()
returns from the real
buffer, otherwise from here.
private int line
private int linePrev
private int col
private int colPrev
private boolean nextCharOnNewLine
private String publicId
private String systemId
private char[] strBuf
private int strBufLen
char
s in strBuf
.
private char[] longStrBuf
private int longStrBufLen
char
s in longStrBuf
.
private char longStrBufPending
longStrBuf
.
private AttributesImpl attributes
private final char[] bmpChar
private final char[] astralChar
private boolean alreadyWarnedAboutPrivateUseCharacters
private ContentModelFlag contentModelFlag
private boolean escapeFlag
private String contentModelElement
private boolean endTag
true
if tokenizing an end tag
private String tagName
private String attributeName
private boolean wantsComments
private boolean shouldAddAttributes
false
, addAttribute*()
are no-ops.
private boolean inContent
true
when in text content or in attribute value.
private boolean html4
true
when HTML4-specific additional errors are requested.
private boolean nonAsciiProhibited
private boolean alreadyComplainedAboutNonAscii
nonAsciiProhibited
.
private boolean metaBoundaryPassed
private String doctypeName
private String publicIdentifier
private String systemIdentifier
private CharacterHandler[] characterHandlers
null
, source code capture,
etc.
private XmlViolationPolicy contentSpacePolicy
private XmlViolationPolicy contentNonXmlCharPolicy
private XmlViolationPolicy commentPolicy
private XmlViolationPolicy xmlnsPolicy
private XmlViolationPolicy namePolicy
private boolean swallowBom
private boolean html4ModeCompatibleWithXhtml1Schemata
private boolean mappingLangToXmlLang
private XmlViolationPolicy bogusXmlnsPolicy
Constructor Detail |
---|
public Tokenizer(TokenHandler tokenHandler)
tokenHandler
- the handler for receiving tokens
Method Detail |
---|
public void setCheckingNormalization(boolean enable)
enable
- true
if checking on
public void addCharacterHandler(CharacterHandler characterHandler)
public boolean isCheckingNormalization()
true
if checking on
public void setErrorHandler(ErrorHandler eh)
XMLReader.setErrorHandler(org.xml.sax.ErrorHandler)
public XmlViolationPolicy getCommentPolicy()
public void setCommentPolicy(XmlViolationPolicy commentPolicy)
commentPolicy
- the commentPolicy to set
public XmlViolationPolicy getContentNonXmlCharPolicy()
public void setContentNonXmlCharPolicy(XmlViolationPolicy contentNonXmlCharPolicy)
contentNonXmlCharPolicy
- the contentNonXmlCharPolicy to set
public XmlViolationPolicy getContentSpacePolicy()
public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy)
contentSpacePolicy
- the contentSpacePolicy to set
public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy)
xmlnsPolicy
- the xmlnsPolicy to set
public void setNamePolicy(XmlViolationPolicy namePolicy)
public void setBogusXmlnsPolicy(XmlViolationPolicy bogusXmlnsPolicy)
bogusXmlnsPolicy
- the bogusXmlnsPolicy to set
public void setHtml4ModeCompatibleWithXhtml1Schemata(boolean html4ModeCompatibleWithXhtml1Schemata)
html4ModeCompatibleWithXhtml1Schemata
- the html4ModeCompatibleWithXhtml1Schemata to set
public void tokenize(InputSource is) throws SAXException, IOException
is
- the input source
SAXException
- on fatal error (if configured to treat XML violations as
fatal) or if the token handler threw
IOException
- if the stream threw
public void setContentModelFlag(ContentModelFlag contentModelFlag, String contentModelElement)
contentModelFlag
- the flag
contentModelElement
- the element causing the flag to be set
public String getPublicId()
getPublicId
in interface Locator
Locator.getPublicId()
public String getSystemId()
getSystemId
in interface Locator
Locator.getSystemId()
public int getLineNumber()
getLineNumber
in interface Locator
Locator.getLineNumber()
public int getColumnNumber()
getColumnNumber
in interface Locator
Locator.getColumnNumber()
void notifyAboutMetaBoundary()
void turnOnAdditionalHtml4Errors()
void dontSwallowBom()
void noEncodingDeclared()
AttributesImpl newAttributes()
private void clearStrBuf()
private void appendStrBuf(char c)
c
- the UTF-16 code unit to append
private String strBufToString()
private void emitStrBuf() throws SAXException
SAXException
- if the token handler threw
private boolean isNcname(String str)
private void clearLongStrBuf()
private void appendLongStrBuf(char c)
c
- the UTF-16 code unit to append
private void appendToComment(char c) throws SAXException
c
- the UTF-16 code unit to append
SAXException
private void appendLongStrBuf(char[] arr)
arr
- the UTF-16 code units to append
private void appendStrBufToLongStrBuf()
private String longStrBufToString()
private void emitComment() throws SAXException
SAXException
private void unread(char c)
read()
is called.
c
- the code unit to unread
private char read() throws SAXException, IOException
SAXException
IOException
private void warnAboutPrivateUseChar() throws SAXException
SAXException
private boolean isPrivateUse(char c)
c
- the UTF-16 code unit to check
true
if PUA character
private boolean isAstralPrivateUse(int c)
c
- the code point to check
true
if astral private use
private boolean isNonCharacter(int c)
c
- the code point to check
true
if non-character
private void flushChars() throws SAXException, IOException
SAXException
IOException
private void fatal(String message) throws SAXException
message
- the message
SAXException
SAXParseException
private void err(String message) throws SAXException
message
- the message
SAXException
private void warn(String message) throws SAXException
message
- the message
SAXException
private CharsetDecoder decoderFromExternalDeclaration(String encoding) throws SAXException
SAXException
private boolean currentIsVoid()
private void dataState() throws SAXException, IOException
IOException
SAXException
private boolean lastHyphHyph()
private boolean lastLtExclHyph()
private void entityDataState() throws SAXException, IOException
IOException
SAXException
private void tagOpenState() throws SAXException, IOException
IOException
SAXException
private void closeTagOpenState() throws SAXException, IOException
IOException
SAXException
private void tagNameState() throws SAXException, IOException
IOException
SAXException
private String strBufToElementNameString()
private void beforeAttributeNameState() throws SAXException, IOException
IOException
SAXException
private void resetAttributes()
private boolean beforeAttributeNameStateImpl() throws SAXException, IOException
IOException
SAXException
private void parseErrorUnlessPermittedSlash() throws SAXException, IOException
SAXException
IOException
private void emitCurrentTagToken() throws SAXException
SAXException
private boolean attributeNameState() throws SAXException, IOException
IOException
SAXException
private void attributeNameComplete() throws SAXException
SAXException
private void addAttributeWithoutValue() throws SAXException
SAXException
private void addAttributeWithValue() throws SAXException
SAXException
private String toAsciiLowerCase(String str)
private boolean afterAttributeNameState() throws SAXException, IOException
IOException
SAXException
private boolean beforeAttributeValueState() throws SAXException, IOException
IOException
SAXException
private boolean attributeValueDoubleQuotedState() throws SAXException, IOException
IOException
SAXException
private boolean attributeValueSingleQuotedState() throws SAXException, IOException
SAXException
IOException
private boolean attributeValueUnquotedState() throws SAXException, IOException
IOException
SAXException
private void entityInAttributeValueState() throws SAXException, IOException
IOException
SAXException
private void bogusCommentState() throws SAXException, IOException
IOException
SAXException
private void markupDeclarationOpenState() throws SAXException, IOException
IOException
SAXException
private void commentStates() throws SAXException, IOException
IOException
SAXException
private void doctypeState() throws SAXException, IOException
IOException
SAXException
private void beforeDoctypeNameState() throws SAXException, IOException
IOException
SAXException
private void doctypeNameState() throws SAXException, IOException
IOException
SAXException
private void afterDoctypeNameState() throws SAXException, IOException
IOException
SAXException
private void beforeDoctypePublicIdentifierState() throws SAXException, IOException
IOException
SAXException
private void doctypePublicIdentifierDoubleQuotedState() throws SAXException, IOException
IOException
SAXException
private void doctypePublicIdentifierSingleQuotedState() throws SAXException, IOException
IOException
SAXException
private void afterDoctypePublicIdentifierState() throws SAXException, IOException
IOException
SAXException
private void beforeDoctypeSystemIdentifierState() throws SAXException, IOException
IOException
SAXException
private void doctypeSystemIdentifierDoubleQuotedState() throws SAXException, IOException
IOException
SAXException
private void doctypeSystemIdentifierSingleQuotedState() throws SAXException, IOException
IOException
SAXException
private void afterDoctypeSystemIdentifierState() throws SAXException, IOException
IOException
SAXException
private void bogusDoctypeState() throws SAXException, IOException
IOException
SAXException
private void consumeEntity(boolean inAttribute) throws SAXException, IOException
IOException
SAXException
private void consumeNCR(boolean inAttribute) throws SAXException, IOException
SAXException
IOException
private void handleNCRValue(int value, boolean inAttribute) throws SAXException, IOException
SAXException
IOException
private void emitOrAppend(char[] val, boolean inAttribute) throws SAXException, IOException
val
-
SAXException
IOException
public boolean isMappingLangToXmlLang()
public void setMappingLangToXmlLang(boolean mappingLangToXmlLang)
mappingLangToXmlLang
- the mappingLangToXmlLang to set
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |