about summary refs log tree commit diff
path: root/node_modules/htmlparser2/lib/Tokenizer.js
diff options
context:
space:
mode:
author Florian Dold <florian.dold@gmail.com> 2017-05-03 15:35:00 +0200
committer Florian Dold <florian.dold@gmail.com> 2017-05-03 15:35:00 +0200
commit de98e0b232509d5f40c135d540a70e415272ff85 (patch)
tree a79222a5b58484ab3b80d18efcaaa7ccc4769b33 /node_modules/htmlparser2/lib/Tokenizer.js
parent e0c9d480a73fa629c1e4a47d3e721f1d2d345406 (diff)
node_modules
Diffstat (limited to 'node_modules/htmlparser2/lib/Tokenizer.js')
-rw-r--r-- node_modules/htmlparser2/lib/Tokenizer.js 876
1 files changed, 876 insertions, 0 deletions
diff --git a/node_modules/htmlparser2/lib/Tokenizer.js b/node_modules/htmlparser2/lib/Tokenizer.js
new file mode 100644
index 000000000..ef98766b8
--- /dev/null
+++ b/node_modules/htmlparser2/lib/Tokenizer.js
@@ -0,0 +1,876 @@
+module.exports = Tokenizer;
+
+var entityMap = require("./entities/entities.json"), //named entities looked up outside xmlMode
+ legacyMap = require("./entities/legacy.json"), //entities that are accepted without a trailing ";"
+ xmlMap = require("./entities/xml.json"), //named entities looked up in xmlMode
+ decodeMap = require("./entities/decode.json"), //code point substitutions applied by decodeCodePoint
+
+ i = 0, //counter that numbers the tokenizer states below
+
+ TEXT = i++, //plain text, also the state a new tokenizer starts in
+ BEFORE_TAG_NAME = i++, //after <
+ IN_TAG_NAME = i++,
+ IN_SELF_CLOSING_TAG = i++, //after a "/" inside an opening tag
+ BEFORE_CLOSING_TAG_NAME = i++, //after </
+ IN_CLOSING_TAG_NAME = i++,
+ AFTER_CLOSING_TAG_NAME = i++,
+
+ //attributes
+ BEFORE_ATTRIBUTE_NAME = i++,
+ IN_ATTRIBUTE_NAME = i++,
+ AFTER_ATTRIBUTE_NAME = i++,
+ BEFORE_ATTRIBUTE_VALUE = i++, //after =
+ IN_ATTRIBUTE_VALUE_DQ = i++, // "
+ IN_ATTRIBUTE_VALUE_SQ = i++, // '
+ IN_ATTRIBUTE_VALUE_NQ = i++, //unquoted value
+
+ //declarations
+ BEFORE_DECLARATION = i++, // !
+ IN_DECLARATION = i++,
+
+ //processing instructions
+ IN_PROCESSING_INSTRUCTION = i++, // ?
+
+ //comments
+ BEFORE_COMMENT = i++, //after "<!-"
+ IN_COMMENT = i++,
+ AFTER_COMMENT_1 = i++, //after one "-" inside a comment
+ AFTER_COMMENT_2 = i++, //after "--" inside a comment
+
+ //cdata
+ BEFORE_CDATA_1 = i++, // [
+ BEFORE_CDATA_2 = i++, // C
+ BEFORE_CDATA_3 = i++, // D
+ BEFORE_CDATA_4 = i++, // A
+ BEFORE_CDATA_5 = i++, // T
+ BEFORE_CDATA_6 = i++, // A
+ IN_CDATA = i++,// [
+ AFTER_CDATA_1 = i++, // ]
+ AFTER_CDATA_2 = i++, // ]
+
+ //special tags
+ BEFORE_SPECIAL = i++, //S (a tag name that starts with "s", outside xmlMode)
+ BEFORE_SPECIAL_END = i++, //S (after "</" while inside script or style content)
+
+ BEFORE_SCRIPT_1 = i++, //C
+ BEFORE_SCRIPT_2 = i++, //R
+ BEFORE_SCRIPT_3 = i++, //I
+ BEFORE_SCRIPT_4 = i++, //P
+ BEFORE_SCRIPT_5 = i++, //T
+ AFTER_SCRIPT_1 = i++, //C
+ AFTER_SCRIPT_2 = i++, //R
+ AFTER_SCRIPT_3 = i++, //I
+ AFTER_SCRIPT_4 = i++, //P
+ AFTER_SCRIPT_5 = i++, //T
+
+ BEFORE_STYLE_1 = i++, //T
+ BEFORE_STYLE_2 = i++, //Y
+ BEFORE_STYLE_3 = i++, //L
+ BEFORE_STYLE_4 = i++, //E
+ AFTER_STYLE_1 = i++, //T
+ AFTER_STYLE_2 = i++, //Y
+ AFTER_STYLE_3 = i++, //L
+ AFTER_STYLE_4 = i++, //E
+
+ BEFORE_ENTITY = i++, //&
+ BEFORE_NUMERIC_ENTITY = i++, //#
+ IN_NAMED_ENTITY = i++,
+ IN_NUMERIC_ENTITY = i++,
+ IN_HEX_ENTITY = i++, //X
+
+ j = 0, //counter that numbers the values of _special
+
+ SPECIAL_NONE = j++, //not inside a script or style element
+ SPECIAL_SCRIPT = j++, //set once an opening "script" tag name was read
+ SPECIAL_STYLE = j++; //set once an opening "style" tag name was read
+
+function whitespace(c){ //true if c is a space, line feed, tab, form feed or carriage return
+    switch(c){ case " ": case "\n": case "\t": case "\f": case "\r": return true; default: return false; }
+}
+
+//builds a one-character state handler: the tokenizer moves to SUCCESS when `c`
+//equals `upper` or its lowercase form, and to FAILURE for any other character
+function ifElseState(upper, SUCCESS, FAILURE){
+    var lower = upper.toLowerCase(),
+        caseless = upper === lower; //e.g. "-" or "#": one comparison is enough
+
+    return caseless ? function(c){
+        this._state = c === lower ? SUCCESS : FAILURE;
+    } : function(c){
+        if(c === lower || c === upper) this._state = SUCCESS;
+        else this._state = FAILURE;
+    };
+}
+
+//returns a handler for one letter of a special tag name ("script"/"style"): a match moves
+//on to NEXT_STATE, anything else turns the name into an ordinary tag name
+function consumeSpecialNameChar(upper, NEXT_STATE){
+    var lower = upper.toLowerCase();
+
+    return function(c){
+        var matches = c === lower || c === upper;
+
+        this._state = matches ? NEXT_STATE : IN_TAG_NAME;
+        if(!matches) this._index--; //consume the token again
+    };
+}
+
+//tokenizer reporting to the handlers in `cbs`; `options.xmlMode` and `options.decodeEntities` are coerced to booleans
+function Tokenizer(options, cbs){
+    var opts = options || {};
+    this._xmlMode = !!opts.xmlMode;
+    this._decodeEntities = !!opts.decodeEntities;
+    this._cbs = cbs;
+    this._running = true;
+    this._buffer = "";
+    this._index = this._sectionStart = 0;
+    this._state = this._baseState = TEXT;
+    this._special = SPECIAL_NONE;
+}
+
+//TEXT: plain character data; "<" opens markup and (when enabled) "&" opens an entity
+Tokenizer.prototype._stateText = function(c){
+    var startsTag = c === "<",
+        startsEntity = !startsTag && c === "&" &&
+            this._decodeEntities && this._special === SPECIAL_NONE;
+
+    if(!startsTag && !startsEntity) return; //ordinary text character
+
+    //hand over the text collected so far before leaving the TEXT state
+    if(this._index > this._sectionStart){
+        this._cbs.ontext(this._getSection());
+    }
+    if(startsEntity) this._baseState = TEXT; //state to return to after the entity
+    this._state = startsTag ? BEFORE_TAG_NAME : BEFORE_ENTITY;
+    this._sectionStart = this._index;
+};
+
+Tokenizer.prototype._stateBeforeTagName = function(c){ //BEFORE_TAG_NAME: decides what the "<" that was just read starts
+ if(c === "/"){
+ this._state = BEFORE_CLOSING_TAG_NAME;
+ } else if(c === ">" || this._special !== SPECIAL_NONE || whitespace(c)) {
+ this._state = TEXT; //not a tag, or inside script/style content; _sectionStart is untouched, so the "<" stays in the section
+ } else if(c === "!"){
+ this._state = BEFORE_DECLARATION;
+ this._sectionStart = this._index + 1; //the section begins after the "!"
+ } else if(c === "?"){
+ this._state = IN_PROCESSING_INSTRUCTION;
+ this._sectionStart = this._index + 1; //the section begins after the "?"
+ } else if(c === "<"){
+ this._cbs.ontext(this._getSection()); //report the section up to this second "<" as text and stay in this state
+ this._sectionStart = this._index;
+ } else {
+ this._state = (!this._xmlMode && (c === "s" || c === "S")) ? //outside xmlMode an "s" may start "script" or "style"
+ BEFORE_SPECIAL : IN_TAG_NAME;
+ this._sectionStart = this._index; //c is the first character of the tag name
+ }
+};
+
+//IN_TAG_NAME: "/", ">" or whitespace ends the name; that character is looked at again afterwards
+Tokenizer.prototype._stateInTagName = function(c){
+    if(c !== "/" && c !== ">" && !whitespace(c)) return; //still inside the name
+    this._emitToken("onopentagname");
+    this._state = BEFORE_ATTRIBUTE_NAME;
+    this._index--;
+};
+
+//BEFORE_CLOSING_TAG_NAME: "</" was read; while _special is set only a name that
+//starts with "s" is followed up, everything else goes back to TEXT
+Tokenizer.prototype._stateBeforeCloseingTagName = function(c){
+    if(whitespace(c)) return; //whitespace after "</" is skipped
+    if(c === ">"){
+        this._state = TEXT;
+    } else if(this._special === SPECIAL_NONE){
+        this._state = IN_CLOSING_TAG_NAME;
+        this._sectionStart = this._index;
+    } else if(c === "s" || c === "S"){
+        this._state = BEFORE_SPECIAL_END;
+    } else {
+        this._state = TEXT;
+        this._index--;
+    }
+};
+
+//IN_CLOSING_TAG_NAME: ">" or whitespace ends the name; that character is looked at again afterwards
+Tokenizer.prototype._stateInCloseingTagName = function(c){
+    if(c !== ">" && !whitespace(c)) return; //still inside the name
+    this._emitToken("onclosetag");
+    this._state = AFTER_CLOSING_TAG_NAME;
+    this._index--;
+};
+
+//AFTER_CLOSING_TAG_NAME: ignores everything up to and including the closing ">"
+Tokenizer.prototype._stateAfterCloseingTagName = function(c){
+    if(c !== ">") return;
+
+    this._state = TEXT;
+    this._sectionStart = this._index + 1; //the next section starts right after the ">"
+};
+
+//BEFORE_ATTRIBUTE_NAME: inside an opening tag, in front of the next attribute (if any)
+Tokenizer.prototype._stateBeforeAttributeName = function(c){
+    var closes = c === ">";
+    if(whitespace(c)) return; //whitespace between attributes is skipped
+    if(c === "/"){
+        this._state = IN_SELF_CLOSING_TAG;
+        return;
+    }
+    if(closes) this._cbs.onopentagend();
+    this._state = closes ? TEXT : IN_ATTRIBUTE_NAME;
+    this._sectionStart = closes ? this._index + 1 : this._index; //after the ">" or at the first name character
+};
+
+//IN_SELF_CLOSING_TAG: a "/" was read inside an opening tag
+Tokenizer.prototype._stateInSelfClosingTag = function(c){
+    var closes = c === ">";
+
+    if(whitespace(c)) return; //whitespace after the "/" is skipped
+    if(closes) this._cbs.onselfclosingtag();
+    this._state = closes ? TEXT : BEFORE_ATTRIBUTE_NAME;
+    if(closes) this._sectionStart = this._index + 1;
+    else this._index--; //the "/" did not close the tag: look at c again
+};
+
+//IN_ATTRIBUTE_NAME: "=", "/", ">" or whitespace ends the name; that character is looked at again afterwards
+Tokenizer.prototype._stateInAttributeName = function(c){
+    var ended = c === "=" || c === "/" || c === ">" || whitespace(c);
+
+    if(!ended) return;
+    if(this._index > this._sectionStart) this._cbs.onattribname(this._getSection());
+    this._sectionStart = -1; //nothing is being collected until the next section starts
+    this._state = AFTER_ATTRIBUTE_NAME;
+    this._index--;
+};
+
+Tokenizer.prototype._stateAfterAttributeName = function(c){ //AFTER_ATTRIBUTE_NAME: decides whether the attribute gets a value
+ if(c === "="){
+ this._state = BEFORE_ATTRIBUTE_VALUE;
+ } else if(c === "/" || c === ">"){
+ this._cbs.onattribend(); //attribute without a value
+ this._state = BEFORE_ATTRIBUTE_NAME;
+ this._index--; //let BEFORE_ATTRIBUTE_NAME handle the "/" or ">"
+ } else if(!whitespace(c)){
+ this._cbs.onattribend(); //attribute without a value; c starts the next attribute name
+ this._state = IN_ATTRIBUTE_NAME;
+ this._sectionStart = this._index;
+ }
+};
+
+//BEFORE_ATTRIBUTE_VALUE: after "="; the first non-whitespace character selects the value state
+Tokenizer.prototype._stateBeforeAttributeValue = function(c){
+    if(whitespace(c)) return;
+
+    if(c === "\"" || c === "'"){
+        this._state = c === "'" ? IN_ATTRIBUTE_VALUE_SQ : IN_ATTRIBUTE_VALUE_DQ;
+        this._sectionStart = this._index + 1; //the quote is not part of the value
+    } else {
+        this._state = IN_ATTRIBUTE_VALUE_NQ;
+        this._sectionStart = this._index; //c already belongs to the value
+    }
+};
+
+Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function(c){ //IN_ATTRIBUTE_VALUE_DQ: the value ends at the next double quote
+    if(c !== "\"" && !(this._decodeEntities && c === "&")) return;
+    this._emitToken("onattribdata"); //report what has been collected so far
+    if(c === "\""){
+        this._cbs.onattribend();
+        this._state = BEFORE_ATTRIBUTE_NAME;
+    } else {
+        this._baseState = this._state; //come back to this state after the entity
+        this._state = BEFORE_ENTITY;
+        this._sectionStart = this._index;
+    }
+};
+
+Tokenizer.prototype._stateInAttributeValueSingleQuotes = function(c){ //IN_ATTRIBUTE_VALUE_SQ: the value ends at the next single quote
+    if(c !== "'" && !(this._decodeEntities && c === "&")) return;
+    this._emitToken("onattribdata"); //report what has been collected so far
+    if(c === "'"){
+        this._cbs.onattribend();
+        this._state = BEFORE_ATTRIBUTE_NAME;
+    } else {
+        this._baseState = this._state; //come back to this state after the entity
+        this._state = BEFORE_ENTITY;
+        this._sectionStart = this._index;
+    }
+};
+
+Tokenizer.prototype._stateInAttributeValueNoQuotes = function(c){ //IN_ATTRIBUTE_VALUE_NQ: whitespace or ">" ends the unquoted value
+    if(!whitespace(c) && c !== ">" && !(this._decodeEntities && c === "&")) return;
+    this._emitToken("onattribdata"); //report what has been collected so far
+    if(c === "&"){
+        this._baseState = this._state; //come back to this state after the entity
+        this._state = BEFORE_ENTITY;
+        this._sectionStart = this._index;
+    } else {
+        this._cbs.onattribend();
+        this._state = BEFORE_ATTRIBUTE_NAME;
+        this._index--; //the terminating character is looked at again
+    }
+};
+
+Tokenizer.prototype._stateBeforeDeclaration = function(c){ //BEFORE_DECLARATION: right after "<!"
+    if(c === "[") this._state = BEFORE_CDATA_1; //possibly "<![CDATA["
+    else if(c === "-") this._state = BEFORE_COMMENT; //possibly "<!--"
+    else this._state = IN_DECLARATION;
+};
+
+//IN_DECLARATION: collects everything up to ">" and reports it through ondeclaration
+Tokenizer.prototype._stateInDeclaration = function(c){
+    if(c !== ">") return;
+    this._cbs.ondeclaration(this._getSection());
+    this._state = TEXT;
+    this._sectionStart = this._index + 1;
+};
+
+//IN_PROCESSING_INSTRUCTION: collects everything up to ">" and reports it through onprocessinginstruction
+Tokenizer.prototype._stateInProcessingInstruction = function(c){
+    if(c !== ">") return;
+    this._cbs.onprocessinginstruction(this._getSection());
+    this._state = TEXT;
+    this._sectionStart = this._index + 1;
+};
+
+//BEFORE_COMMENT: "<!-" was read; only a second "-" makes it a comment,
+//anything else is treated as a declaration
+Tokenizer.prototype._stateBeforeComment = function(c){
+    var isComment = c === "-";
+
+    this._state = isComment ? IN_COMMENT : IN_DECLARATION;
+    if(isComment) this._sectionStart = this._index + 1; //the comment text starts after "<!--"
+};
+
+Tokenizer.prototype._stateInComment = function(c){ //IN_COMMENT: only a "-" is of interest, it may begin the closing "-->"
+ if(c === "-") this._state = AFTER_COMMENT_1;
+};
+
+Tokenizer.prototype._stateAfterComment1 = ifElseState("-", AFTER_COMMENT_2, IN_COMMENT); //a second "-" moves on, anything else resumes the comment
+
+Tokenizer.prototype._stateAfterComment2 = function(c){ //AFTER_COMMENT_2: "--" was read inside a comment
+    if(c === "-") return; //stay in AFTER_COMMENT_2 (`--->`)
+    if(c !== ">"){
+        this._state = IN_COMMENT; //the dashes were part of the comment text
+        return;
+    }
+    //remove 2 trailing chars
+    this._cbs.oncomment(this._buffer.substring(this._sectionStart, this._index - 2));
+    this._state = TEXT;
+    this._sectionStart = this._index + 1;
+};
+
+Tokenizer.prototype._stateBeforeCdata1 = ifElseState("C", BEFORE_CDATA_2, IN_DECLARATION); //the five handlers below match "CDATA" letter by letter in either case
+Tokenizer.prototype._stateBeforeCdata2 = ifElseState("D", BEFORE_CDATA_3, IN_DECLARATION); //any mismatch turns the construct into a declaration
+Tokenizer.prototype._stateBeforeCdata3 = ifElseState("A", BEFORE_CDATA_4, IN_DECLARATION);
+Tokenizer.prototype._stateBeforeCdata4 = ifElseState("T", BEFORE_CDATA_5, IN_DECLARATION);
+Tokenizer.prototype._stateBeforeCdata5 = ifElseState("A", BEFORE_CDATA_6, IN_DECLARATION);
+
+//BEFORE_CDATA_6: "<![CDATA" was read; only a "[" starts the CDATA section,
+//anything else is treated as a declaration
+Tokenizer.prototype._stateBeforeCdata6 = function(c){
+    var opens = c === "[";
+
+    this._state = opens ? IN_CDATA : IN_DECLARATION;
+    if(opens) this._sectionStart = this._index + 1; //the data starts after the "["
+};
+
+Tokenizer.prototype._stateInCdata = function(c){ //IN_CDATA: only a "]" is of interest, it may begin the closing "]]>"
+ if(c === "]") this._state = AFTER_CDATA_1;
+};
+
+Tokenizer.prototype._stateAfterCdata1 = ifElseState("]", AFTER_CDATA_2, IN_CDATA); //a second "]" moves on, anything else resumes the CDATA section
+
+Tokenizer.prototype._stateAfterCdata2 = function(c){ //AFTER_CDATA_2: "]]" was read inside a CDATA section
+    if(c === "]") return; //stay in AFTER_CDATA_2 (`]]]>`)
+    if(c !== ">"){
+        this._state = IN_CDATA; //the brackets were part of the data
+        return;
+    }
+    //remove 2 trailing chars
+    this._cbs.oncdata(this._buffer.substring(this._sectionStart, this._index - 2));
+    this._state = TEXT;
+    this._sectionStart = this._index + 1;
+};
+
+//BEFORE_SPECIAL: the tag name starts with "s"; the second letter tells "script" and "style" apart
+Tokenizer.prototype._stateBeforeSpecial = function(c){
+    switch(c){
+        case "c": case "C": this._state = BEFORE_SCRIPT_1; break;
+        case "t": case "T": this._state = BEFORE_STYLE_1; break;
+        default:
+            this._state = IN_TAG_NAME;
+            this._index--; //consume the token again
+    }
+};
+
+//BEFORE_SPECIAL_END: "</s" was read while _special is set; goes on only if c continues the name of that element
+Tokenizer.prototype._stateBeforeSpecialEnd = function(c){
+    var next = TEXT;
+
+    if(this._special === SPECIAL_SCRIPT){ if(c === "c" || c === "C") next = AFTER_SCRIPT_1; }
+    else if(this._special === SPECIAL_STYLE){ if(c === "t" || c === "T") next = AFTER_STYLE_1; }
+    this._state = next;
+};
+
+Tokenizer.prototype._stateBeforeScript1 = consumeSpecialNameChar("R", BEFORE_SCRIPT_2); //after "sc" the next four handlers match "ript" in either case
+Tokenizer.prototype._stateBeforeScript2 = consumeSpecialNameChar("I", BEFORE_SCRIPT_3); //a mismatch falls back to IN_TAG_NAME and the character is read again
+Tokenizer.prototype._stateBeforeScript3 = consumeSpecialNameChar("P", BEFORE_SCRIPT_4);
+Tokenizer.prototype._stateBeforeScript4 = consumeSpecialNameChar("T", BEFORE_SCRIPT_5);
+
+//BEFORE_SCRIPT_5: "script" was matched; _special is only set if the tag name ends right here
+Tokenizer.prototype._stateBeforeScript5 = function(c){
+    var nameEnds = c === "/" || c === ">" || whitespace(c);
+    if(nameEnds) this._special = SPECIAL_SCRIPT;
+    this._state = IN_TAG_NAME;
+    this._index--; //consume the token again
+};
+
+Tokenizer.prototype._stateAfterScript1 = ifElseState("R", AFTER_SCRIPT_2, TEXT); //after "</sc" the next four handlers match "ript" in either case
+Tokenizer.prototype._stateAfterScript2 = ifElseState("I", AFTER_SCRIPT_3, TEXT); //a mismatch goes back to TEXT
+Tokenizer.prototype._stateAfterScript3 = ifElseState("P", AFTER_SCRIPT_4, TEXT);
+Tokenizer.prototype._stateAfterScript4 = ifElseState("T", AFTER_SCRIPT_5, TEXT);
+
+//AFTER_SCRIPT_5: "</script" was matched; it only counts as the closing tag if the name ends here
+Tokenizer.prototype._stateAfterScript5 = function(c){
+    var nameEnds = c === ">" || whitespace(c);
+    this._state = nameEnds ? IN_CLOSING_TAG_NAME : TEXT;
+    if(!nameEnds) return;
+    this._special = SPECIAL_NONE;
+    this._sectionStart = this._index - 6; //back to the first of the 6 letters of "script"
+    this._index--; //reconsume the token
+};
+
+Tokenizer.prototype._stateBeforeStyle1 = consumeSpecialNameChar("Y", BEFORE_STYLE_2); //after "st" the next three handlers match "yle" in either case
+Tokenizer.prototype._stateBeforeStyle2 = consumeSpecialNameChar("L", BEFORE_STYLE_3); //a mismatch falls back to IN_TAG_NAME and the character is read again
+Tokenizer.prototype._stateBeforeStyle3 = consumeSpecialNameChar("E", BEFORE_STYLE_4);
+
+//BEFORE_STYLE_4: "style" was matched; _special is only set if the tag name ends right here
+Tokenizer.prototype._stateBeforeStyle4 = function(c){
+    var nameEnds = c === "/" || c === ">" || whitespace(c);
+    if(nameEnds) this._special = SPECIAL_STYLE;
+    this._state = IN_TAG_NAME;
+    this._index--; //consume the token again
+};
+
+Tokenizer.prototype._stateAfterStyle1 = ifElseState("Y", AFTER_STYLE_2, TEXT); //after "</st" the next three handlers match "yle" in either case
+Tokenizer.prototype._stateAfterStyle2 = ifElseState("L", AFTER_STYLE_3, TEXT); //a mismatch goes back to TEXT
+Tokenizer.prototype._stateAfterStyle3 = ifElseState("E", AFTER_STYLE_4, TEXT);
+
+//AFTER_STYLE_4: "</style" was matched; it only counts as the closing tag if the name ends here
+Tokenizer.prototype._stateAfterStyle4 = function(c){
+    var nameEnds = c === ">" || whitespace(c);
+    this._state = nameEnds ? IN_CLOSING_TAG_NAME : TEXT;
+    if(!nameEnds) return;
+    this._special = SPECIAL_NONE;
+    this._sectionStart = this._index - 5; //back to the first of the 5 letters of "style"
+    this._index--; //reconsume the token
+};
+
+Tokenizer.prototype._stateBeforeEntity = ifElseState("#", BEFORE_NUMERIC_ENTITY, IN_NAMED_ENTITY); //after "&": "#" starts a numeric entity, anything else a named one
+Tokenizer.prototype._stateBeforeNumericEntity = ifElseState("X", IN_HEX_ENTITY, IN_NUMERIC_ENTITY); //after "&#": "x" or "X" selects hexadecimal digits
+
+//looks up the complete name between "&" and the current position; used for ";"-terminated entities and for entities within attributes
+Tokenizer.prototype._parseNamedEntityStrict = function(){
+ //offset = 1 (the "&" at _sectionStart is skipped)
+ if(this._sectionStart + 1 < this._index){
+ var entity = this._buffer.substring(this._sectionStart + 1, this._index),
+ map = this._xmlMode ? xmlMap : entityMap;
+
+ if(map.hasOwnProperty(entity)){
+ this._emitPartial(map[entity]);
+ this._sectionStart = this._index + 1; //known entity: the next section starts after the current character
+ }
+ }
+};
+
+
+//parses legacy entities (without trailing semicolon)
+Tokenizer.prototype._parseLegacyEntity = function(){
+ var start = this._sectionStart + 1, //first character after the "&"
+ limit = this._index - start; //number of characters read after the "&"
+
+ if(limit > 6) limit = 6; //the max length of legacy entities is 6
+
+ while(limit >= 2){ //the min length of legacy entities is 2
+ var entity = this._buffer.substr(start, limit); //the longest candidate is tried first
+
+ if(legacyMap.hasOwnProperty(entity)){
+ this._emitPartial(legacyMap[entity]);
+ this._sectionStart += limit + 2; //one past "&" plus name; _stateInNamedEntity (non-";" path) and _handleTrailingData decrement it again
+ break;
+ } else {
+ limit--;
+ }
+ }
+};
+
+Tokenizer.prototype._stateInNamedEntity = function(c){ //IN_NAMED_ENTITY: reads the name that follows "&"
+ if(c === ";"){
+ this._parseNamedEntityStrict();
+ if(this._sectionStart + 1 < this._index && !this._xmlMode){ //the strict lookup did not move _sectionStart: try the legacy entities
+ this._parseLegacyEntity();
+ }
+ this._state = this._baseState;
+ } else if((c < "a" || c > "z") && (c < "A" || c > "Z") && (c < "0" || c > "9")){ //c is not an ASCII letter or digit, so the name ends here
+ if(this._xmlMode); //xmlMode: nothing is decoded without a ";"
+ else if(this._baseState !== TEXT){
+ if(c !== "="){ //inside an attribute value a name followed by "=" is left undecoded
+ this._parseNamedEntityStrict();
+ this._sectionStart--; //include the current character in the section
+ }
+ } else {
+ this._parseLegacyEntity();
+ this._sectionStart--;
+ }
+ this._state = this._baseState;
+ this._index--; //c is looked at again by the base state
+ }
+};
+
+// modified version of https://github.com/mathiasbynens/he/blob/master/src/he.js#L94-L119
+//turns the number of a numeric entity into a string; surrogates and values above
+//0x10FFFF become U+FFFD, and code points listed in decodeMap are substituted first
+function decodeCodePoint(codePoint){
+    var isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF,
+        mapped, offset;
+
+    if(isSurrogate || codePoint > 0x10FFFF){
+        return "\uFFFD";
+    }
+
+    mapped = codePoint in decodeMap ? decodeMap[codePoint] : codePoint;
+    if(!(mapped > 0xFFFF)){
+        return "" + String.fromCharCode(mapped);
+    }
+
+    //above 0xFFFF: build the result from a high and a low surrogate
+    offset = mapped - 0x10000;
+    return String.fromCharCode(offset >>> 10 & 0x3FF | 0xD800) +
+        String.fromCharCode(0xDC00 | offset & 0x3FF);
+}
+
+Tokenizer.prototype._decodeNumericEntity = function(offset, base){ //offset: characters to skip from _sectionStart to the first digit, base: radix handed to parseInt
+ var sectionStart = this._sectionStart + offset; //index of the first digit
+
+ if(sectionStart !== this._index){ //at least one character follows the prefix
+ //parse entity
+ var entity = this._buffer.substring(sectionStart, this._index);
+ var parsed = parseInt(entity, base);
+
+ if(parsed === parsed){ //not NaN (TODO: when can this happen?)
+ this._emitPartial(decodeCodePoint(parsed));
+ this._sectionStart = this._index; //the next section starts at the current character
+ }
+ }
+
+ this._state = this._baseState; //the entity is finished whether or not it could be decoded
+};
+
+Tokenizer.prototype._stateInNumericEntity = function(c){ //IN_NUMERIC_ENTITY: decimal digits after "&#"
+ if(c === ";"){
+ this._decodeNumericEntity(2, 10); //2 = length of "&#"
+ this._sectionStart++; //after a successful decode _sectionStart is at the ";": move past it
+ } else if(c < "0" || c > "9"){ //first character that is not a decimal digit
+ if(!this._xmlMode){
+ this._decodeNumericEntity(2, 10); //outside xmlMode the entity is decoded without a ";"
+ } else {
+ this._state = this._baseState; //xmlMode: the entity is left undecoded
+ }
+ this._index--; //c is looked at again by the base state
+ }
+};
+
+Tokenizer.prototype._stateInHexEntity = function(c){ //IN_HEX_ENTITY: hexadecimal digits after "&#x"
+ if(c === ";"){
+ this._decodeNumericEntity(3, 16); //3 = length of "&#x"
+ this._sectionStart++; //after a successful decode _sectionStart is at the ";": move past it
+ } else if((c < "a" || c > "f") && (c < "A" || c > "F") && (c < "0" || c > "9")){ //first character that is not a hexadecimal digit
+ if(!this._xmlMode){
+ this._decodeNumericEntity(3, 16); //outside xmlMode the entity is decoded without a ";"
+ } else {
+ this._state = this._baseState; //xmlMode: the entity is left undecoded
+ }
+ this._index--; //c is looked at again by the base state
+ }
+};
+
+//runs at the end of every write(): drops the part of the buffer that is no longer needed
+//and rebases _index and _sectionStart onto what is kept
+Tokenizer.prototype._cleanup = function () {
+    var start = this._sectionStart;
+
+    //paused in the middle of the buffer: the input from _index on has not been tokenized
+    //yet, so it must be kept instead of being flushed as text or thrown away
+    if(!this._running && this._index < this._buffer.length) return;
+
+    if(start >= 0 && this._state !== TEXT && start !== this._index){
+        //a token is still being collected: keep it, drop everything in front of it
+        this._buffer = this._buffer.substr(start);
+        this._index -= start;
+    } else {
+        if(start >= 0 && this._state === TEXT && start !== this._index){
+            this._cbs.ontext(this._buffer.substr(start)); //flush pending text right away
+        }
+        //no open section, flushed text or a section that just started: nothing to keep
+        this._buffer = "";
+        this._index = 0;
+    }
+
+    if(start >= 0) this._sectionStart = 0;
+};
+
+//TODO make events conditional
+Tokenizer.prototype.write = function(chunk){ //appends chunk to the buffer and runs the state machine until the input is used up or the tokenizer is paused
+    this._buffer += chunk;
+    for(; this._index < this._buffer.length && this._running; this._index++){ //handlers may decrement _index to have a character looked at again
+        var c = this._buffer.charAt(this._index);
+        switch(this._state){
+            case TEXT:
+                this._stateText(c);
+                break;
+            case BEFORE_TAG_NAME:
+                this._stateBeforeTagName(c);
+                break;
+            case IN_TAG_NAME:
+                this._stateInTagName(c);
+                break;
+            case BEFORE_CLOSING_TAG_NAME:
+                this._stateBeforeCloseingTagName(c);
+                break;
+            case IN_CLOSING_TAG_NAME:
+                this._stateInCloseingTagName(c);
+                break;
+            case AFTER_CLOSING_TAG_NAME:
+                this._stateAfterCloseingTagName(c);
+                break;
+            case IN_SELF_CLOSING_TAG:
+                this._stateInSelfClosingTag(c);
+                break;
+            case BEFORE_ATTRIBUTE_NAME: //attributes
+                this._stateBeforeAttributeName(c);
+                break;
+            case IN_ATTRIBUTE_NAME:
+                this._stateInAttributeName(c);
+                break;
+            case AFTER_ATTRIBUTE_NAME:
+                this._stateAfterAttributeName(c);
+                break;
+            case BEFORE_ATTRIBUTE_VALUE:
+                this._stateBeforeAttributeValue(c);
+                break;
+            case IN_ATTRIBUTE_VALUE_DQ:
+                this._stateInAttributeValueDoubleQuotes(c);
+                break;
+            case IN_ATTRIBUTE_VALUE_SQ:
+                this._stateInAttributeValueSingleQuotes(c);
+                break;
+            case IN_ATTRIBUTE_VALUE_NQ:
+                this._stateInAttributeValueNoQuotes(c);
+                break;
+            case BEFORE_DECLARATION: //declarations
+                this._stateBeforeDeclaration(c);
+                break;
+            case IN_DECLARATION:
+                this._stateInDeclaration(c);
+                break;
+            case IN_PROCESSING_INSTRUCTION: //processing instructions
+                this._stateInProcessingInstruction(c);
+                break;
+            case BEFORE_COMMENT: //comments
+                this._stateBeforeComment(c);
+                break;
+            case IN_COMMENT:
+                this._stateInComment(c);
+                break;
+            case AFTER_COMMENT_1:
+                this._stateAfterComment1(c);
+                break;
+            case AFTER_COMMENT_2:
+                this._stateAfterComment2(c);
+                break;
+            case BEFORE_CDATA_1: //cdata
+                this._stateBeforeCdata1(c);
+                break;
+            case BEFORE_CDATA_2:
+                this._stateBeforeCdata2(c);
+                break;
+            case BEFORE_CDATA_3:
+                this._stateBeforeCdata3(c);
+                break;
+            case BEFORE_CDATA_4:
+                this._stateBeforeCdata4(c);
+                break;
+            case BEFORE_CDATA_5:
+                this._stateBeforeCdata5(c);
+                break;
+            case BEFORE_CDATA_6:
+                this._stateBeforeCdata6(c);
+                break;
+            case IN_CDATA:
+                this._stateInCdata(c);
+                break;
+            case AFTER_CDATA_1:
+                this._stateAfterCdata1(c);
+                break;
+            case AFTER_CDATA_2:
+                this._stateAfterCdata2(c);
+                break;
+            case BEFORE_SPECIAL: //special tags
+                this._stateBeforeSpecial(c);
+                break;
+            case BEFORE_SPECIAL_END:
+                this._stateBeforeSpecialEnd(c);
+                break;
+            case BEFORE_SCRIPT_1: //script
+                this._stateBeforeScript1(c);
+                break;
+            case BEFORE_SCRIPT_2:
+                this._stateBeforeScript2(c);
+                break;
+            case BEFORE_SCRIPT_3:
+                this._stateBeforeScript3(c);
+                break;
+            case BEFORE_SCRIPT_4:
+                this._stateBeforeScript4(c);
+                break;
+            case BEFORE_SCRIPT_5:
+                this._stateBeforeScript5(c);
+                break;
+            case AFTER_SCRIPT_1:
+                this._stateAfterScript1(c);
+                break;
+            case AFTER_SCRIPT_2:
+                this._stateAfterScript2(c);
+                break;
+            case AFTER_SCRIPT_3:
+                this._stateAfterScript3(c);
+                break;
+            case AFTER_SCRIPT_4:
+                this._stateAfterScript4(c);
+                break;
+            case AFTER_SCRIPT_5:
+                this._stateAfterScript5(c);
+                break;
+            case BEFORE_STYLE_1: //style
+                this._stateBeforeStyle1(c);
+                break;
+            case BEFORE_STYLE_2:
+                this._stateBeforeStyle2(c);
+                break;
+            case BEFORE_STYLE_3:
+                this._stateBeforeStyle3(c);
+                break;
+            case BEFORE_STYLE_4:
+                this._stateBeforeStyle4(c);
+                break;
+            case AFTER_STYLE_1:
+                this._stateAfterStyle1(c);
+                break;
+            case AFTER_STYLE_2:
+                this._stateAfterStyle2(c);
+                break;
+            case AFTER_STYLE_3:
+                this._stateAfterStyle3(c);
+                break;
+            case AFTER_STYLE_4:
+                this._stateAfterStyle4(c);
+                break;
+            case BEFORE_ENTITY: //entities
+                this._stateBeforeEntity(c);
+                break;
+            case BEFORE_NUMERIC_ENTITY:
+                this._stateBeforeNumericEntity(c);
+                break;
+            case IN_NAMED_ENTITY:
+                this._stateInNamedEntity(c);
+                break;
+            case IN_NUMERIC_ENTITY:
+                this._stateInNumericEntity(c);
+                break;
+            case IN_HEX_ENTITY:
+                this._stateInHexEntity(c);
+                break;
+            default: this._cbs.onerror(Error("unknown _state"), this._state);
+        }
+    }
+    this._cleanup();
+};
+
+Tokenizer.prototype.pause = function(){ //makes the loop in write() stop before the next character
+ this._running = false;
+};
+Tokenizer.prototype.resume = function(){ //only clears the pause flag; the loop in write() runs again on the next write() call
+ this._running = true;
+};
+
+Tokenizer.prototype.end = function(chunk){ //writes an optional last chunk, reports an unfinished section and then calls onend
+ if(chunk) this.write(chunk);
+
+ //if there is remaining data, emit it in a reasonable way
+ if(this._sectionStart < this._index){
+ this._handleTrailingData();
+ }
+
+ this._cbs.onend();
+};
+
+Tokenizer.prototype._handleTrailingData = function(){ //called by end(): reports the unfinished section according to the state the input stopped in
+ var data = this._buffer.substr(this._sectionStart);
+
+ if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){
+ this._cbs.oncdata(data); //CDATA section without a closing "]]>"
+ } else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){
+ this._cbs.oncomment(data); //comment without a closing "-->"
+ } else if(this._state === IN_TAG_NAME){
+ this._cbs.onopentagname(data);
+ } else if(this._state === BEFORE_ATTRIBUTE_NAME || this._state === BEFORE_ATTRIBUTE_VALUE || this._state === AFTER_ATTRIBUTE_NAME){
+ this._cbs.onopentagend(); //the opening tag never got its ">": end it anyway
+ } else if(this._state === IN_ATTRIBUTE_NAME){
+ this._cbs.onattribname(data);
+ } else if(this._state === IN_ATTRIBUTE_VALUE_SQ || this._state === IN_ATTRIBUTE_VALUE_DQ || this._state === IN_ATTRIBUTE_VALUE_NQ){
+ this._cbs.onattribdata(data);
+ this._cbs.onattribend();
+ } else if(this._state === IN_CLOSING_TAG_NAME){
+ this._cbs.onclosetag(data);
+ } else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){
+ this._parseLegacyEntity();
+ if(--this._sectionStart < this._index){ //something is left after the entity: report it the way the base state would
+ this._state = this._baseState;
+ this._handleTrailingData();
+ }
+ } else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){
+ this._decodeNumericEntity(2, 10);
+ if(this._sectionStart < this._index){ //the entity could not be decoded: report the rest the way the base state would
+ this._state = this._baseState;
+ this._handleTrailingData();
+ }
+ } else if(this._state === IN_HEX_ENTITY && !this._xmlMode){
+ this._decodeNumericEntity(3, 16);
+ if(this._sectionStart < this._index){ //the entity could not be decoded: report the rest the way the base state would
+ this._state = this._baseState;
+ this._handleTrailingData();
+ }
+ } else {
+ this._cbs.ontext(data); //every other state: the rest is reported as text
+ }
+};
+
+Tokenizer.prototype.reset = function(){ //re-runs the constructor on this instance, keeping the current options and callbacks
+ Tokenizer.call(this, {xmlMode: this._xmlMode, decodeEntities: this._decodeEntities}, this._cbs);
+};
+
+Tokenizer.prototype._getSection = function(){ //the buffered characters from _sectionStart up to, but not including, the current one
+ return this._buffer.substring(this._sectionStart, this._index);
+};
+
+Tokenizer.prototype._emitToken = function(name){ //name: key of the callback in _cbs that receives the current section
+ this._cbs[name](this._getSection());
+ this._sectionStart = -1; //marks that no section is open; _cleanup empties the buffer in that case
+};
+
+//reports a decoded entity: through ontext when the entity was started from TEXT,
+//through onattribdata when it was started from an attribute value state
+Tokenizer.prototype._emitPartial = function(value){
+    //TODO implement the new event
+    var handler = this._baseState === TEXT ? "ontext" : "onattribdata";
+    this._cbs[handler](value);
+};