diff options
author | Florian Dold <florian.dold@gmail.com> | 2017-05-03 15:35:00 +0200 |
---|---|---|
committer | Florian Dold <florian.dold@gmail.com> | 2017-05-03 15:35:00 +0200 |
commit | de98e0b232509d5f40c135d540a70e415272ff85 (patch) | |
tree | a79222a5b58484ab3b80d18efcaaa7ccc4769b33 /node_modules/htmlparser2/lib/Tokenizer.js | |
parent | e0c9d480a73fa629c1e4a47d3e721f1d2d345406 (diff) |
node_modules
Diffstat (limited to 'node_modules/htmlparser2/lib/Tokenizer.js')
-rw-r--r-- | node_modules/htmlparser2/lib/Tokenizer.js | 876 |
1 files changed, 876 insertions, 0 deletions
// Streaming HTML/XML tokenizer (node_modules/htmlparser2/lib/Tokenizer.js).
//
// A hand-written state machine: input is appended to `_buffer`, consumed one
// character at a time, and reported through the callbacks object passed to
// the constructor. `_sectionStart` marks where the token currently being
// collected begins inside `_buffer`; -1 means "no open section".

module.exports = Tokenizer;

var entityMap = require("./entities/entities.json"),
	legacyMap = require("./entities/legacy.json"),
	xmlMap    = require("./entities/xml.json"),
	decodeMap = require("./entities/decode.json"),

	hasOwn = Object.prototype.hasOwnProperty,

	i = 0,

	// Tokenizer states. The trailing comments name the character that was
	// just consumed when the state is entered.
	TEXT                      = i++,
	BEFORE_TAG_NAME           = i++, //after <
	IN_TAG_NAME               = i++,
	IN_SELF_CLOSING_TAG       = i++,
	BEFORE_CLOSING_TAG_NAME   = i++,
	IN_CLOSING_TAG_NAME       = i++,
	AFTER_CLOSING_TAG_NAME    = i++,

	//attributes
	BEFORE_ATTRIBUTE_NAME     = i++,
	IN_ATTRIBUTE_NAME         = i++,
	AFTER_ATTRIBUTE_NAME      = i++,
	BEFORE_ATTRIBUTE_VALUE    = i++,
	IN_ATTRIBUTE_VALUE_DQ     = i++, // "
	IN_ATTRIBUTE_VALUE_SQ     = i++, // '
	IN_ATTRIBUTE_VALUE_NQ     = i++,

	//declarations
	BEFORE_DECLARATION        = i++, // !
	IN_DECLARATION            = i++,

	//processing instructions
	IN_PROCESSING_INSTRUCTION = i++, // ?

	//comments
	BEFORE_COMMENT            = i++,
	IN_COMMENT                = i++,
	AFTER_COMMENT_1           = i++,
	AFTER_COMMENT_2           = i++,

	//cdata
	BEFORE_CDATA_1            = i++, // [
	BEFORE_CDATA_2            = i++, // C
	BEFORE_CDATA_3            = i++, // D
	BEFORE_CDATA_4            = i++, // A
	BEFORE_CDATA_5            = i++, // T
	BEFORE_CDATA_6            = i++, // A
	IN_CDATA                  = i++, // [
	AFTER_CDATA_1             = i++, // ]
	AFTER_CDATA_2             = i++, // ]

	//special tags
	BEFORE_SPECIAL            = i++, //S
	BEFORE_SPECIAL_END        = i++, //S

	BEFORE_SCRIPT_1           = i++, //C
	BEFORE_SCRIPT_2           = i++, //R
	BEFORE_SCRIPT_3           = i++, //I
	BEFORE_SCRIPT_4           = i++, //P
	BEFORE_SCRIPT_5           = i++, //T
	AFTER_SCRIPT_1            = i++, //C
	AFTER_SCRIPT_2            = i++, //R
	AFTER_SCRIPT_3            = i++, //I
	AFTER_SCRIPT_4            = i++, //P
	AFTER_SCRIPT_5            = i++, //T

	BEFORE_STYLE_1            = i++, //T
	BEFORE_STYLE_2            = i++, //Y
	BEFORE_STYLE_3            = i++, //L
	BEFORE_STYLE_4            = i++, //E
	AFTER_STYLE_1             = i++, //T
	AFTER_STYLE_2             = i++, //Y
	AFTER_STYLE_3             = i++, //L
	AFTER_STYLE_4             = i++, //E

	//entities
	BEFORE_ENTITY             = i++, //&
	BEFORE_NUMERIC_ENTITY     = i++, //#
	IN_NAMED_ENTITY           = i++,
	IN_NUMERIC_ENTITY         = i++,
	IN_HEX_ENTITY             = i++, //X

	j = 0,

	// Values of `_special`: which raw-text element (if any) we are inside.
	SPECIAL_NONE              = j++,
	SPECIAL_SCRIPT            = j++,
	SPECIAL_STYLE             = j++;

// True for the five characters this tokenizer treats as whitespace.
function whitespace(c){
	return c === " " || c === "\n" || c === "\t" || c === "\f" || c === "\r";
}

// Builds a state handler that moves to SUCCESS when the current character
// equals `upper` (compared case-insensitively when `upper` is a letter) and
// to FAILURE otherwise. The character is consumed in both cases.
function ifElseState(upper, SUCCESS, FAILURE){
	var lower = upper.toLowerCase();

	if(upper === lower){
		return function(c){
			this._state = c === lower ? SUCCESS : FAILURE;
		};
	} else {
		return function(c){
			this._state = (c === lower || c === upper) ? SUCCESS : FAILURE;
		};
	}
}

// Builds a state handler for one letter of "script"/"style" in an opening
// tag: on a (case-insensitive) match it advances to NEXT_STATE, otherwise it
// falls back to IN_TAG_NAME and re-processes the same character there.
function consumeSpecialNameChar(upper, NEXT_STATE){
	var lower = upper.toLowerCase();

	return function(c){
		if(c === lower || c === upper){
			this._state = NEXT_STATE;
		} else {
			this._state = IN_TAG_NAME;
			this._index--; //consume the token again
		}
	};
}

// Creates a tokenizer.
//
// options - optional object; `xmlMode` and `decodeEntities` are read and
//           coerced to booleans.
// cbs     - callbacks object. Methods invoked by this file: ontext,
//           onopentagname, onopentagend, onselfclosingtag, onclosetag,
//           onattribname, onattribdata, onattribend, ondeclaration,
//           onprocessinginstruction, oncomment, oncdata, onerror, onend.
function Tokenizer(options, cbs){
	this._state = TEXT;
	this._buffer = "";
	this._sectionStart = 0;
	this._index = 0;
	this._baseState = TEXT;       //state to return to after an entity
	this._special = SPECIAL_NONE;
	this._cbs = cbs;
	this._running = true;         //false while paused
	this._parsing = false;        //true while _parse() is on the call stack
	this._endPending = false;     //end() was called while paused
	this._xmlMode = !!(options && options.xmlMode);
	this._decodeEntities = !!(options && options.decodeEntities);
}

Tokenizer.prototype._stateText = function(c){
	if(c === "<"){
		if(this._index > this._sectionStart){
			this._cbs.ontext(this._getSection());
		}
		this._state = BEFORE_TAG_NAME;
		this._sectionStart = this._index;
	} else if(this._decodeEntities && this._special === SPECIAL_NONE && c === "&"){
		//entities are not decoded inside <script>/<style>
		if(this._index > this._sectionStart){
			this._cbs.ontext(this._getSection());
		}
		this._baseState = TEXT;
		this._state = BEFORE_ENTITY;
		this._sectionStart = this._index;
	}
};

Tokenizer.prototype._stateBeforeTagName = function(c){
	if(c === "/"){
		this._state = BEFORE_CLOSING_TAG_NAME;
	} else if(c === ">" || this._special !== SPECIAL_NONE || whitespace(c)){
		//not a tag after all (or inside script/style): the "<" stays text
		this._state = TEXT;
	} else if(c === "!"){
		this._state = BEFORE_DECLARATION;
		this._sectionStart = this._index + 1;
	} else if(c === "?"){
		this._state = IN_PROCESSING_INSTRUCTION;
		this._sectionStart = this._index + 1;
	} else if(c === "<"){
		//"<<": emit the first "<" as text and restart at the second one
		this._cbs.ontext(this._getSection());
		this._sectionStart = this._index;
	} else {
		//a leading "s" may start "script" or "style" (HTML mode only)
		this._state = (!this._xmlMode && (c === "s" || c === "S")) ?
			BEFORE_SPECIAL : IN_TAG_NAME;
		this._sectionStart = this._index;
	}
};

Tokenizer.prototype._stateInTagName = function(c){
	if(c === "/" || c === ">" || whitespace(c)){
		this._emitToken("onopentagname");
		this._state = BEFORE_ATTRIBUTE_NAME;
		this._index--; //re-process the terminator in the next state
	}
};

Tokenizer.prototype._stateBeforeCloseingTagName = function(c){
	if(whitespace(c)){
		//skip whitespace after "</"
	} else if(c === ">"){
		this._state = TEXT;
	} else if(this._special !== SPECIAL_NONE){
		//inside script/style only the matching end tag is recognised
		if(c === "s" || c === "S"){
			this._state = BEFORE_SPECIAL_END;
		} else {
			this._state = TEXT;
			this._index--;
		}
	} else {
		this._state = IN_CLOSING_TAG_NAME;
		this._sectionStart = this._index;
	}
};

Tokenizer.prototype._stateInCloseingTagName = function(c){
	if(c === ">" || whitespace(c)){
		this._emitToken("onclosetag");
		this._state = AFTER_CLOSING_TAG_NAME;
		this._index--;
	}
};

Tokenizer.prototype._stateAfterCloseingTagName = function(c){
	//skip everything until ">"
	if(c === ">"){
		this._state = TEXT;
		this._sectionStart = this._index + 1;
	}
};

Tokenizer.prototype._stateBeforeAttributeName = function(c){
	if(c === ">"){
		this._cbs.onopentagend();
		this._state = TEXT;
		this._sectionStart = this._index + 1;
	} else if(c === "/"){
		this._state = IN_SELF_CLOSING_TAG;
	} else if(!whitespace(c)){
		this._state = IN_ATTRIBUTE_NAME;
		this._sectionStart = this._index;
	}
};

Tokenizer.prototype._stateInSelfClosingTag = function(c){
	if(c === ">"){
		this._cbs.onselfclosingtag();
		this._state = TEXT;
		this._sectionStart = this._index + 1;
	} else if(!whitespace(c)){
		//the "/" was not followed by ">": treat what follows as attributes
		this._state = BEFORE_ATTRIBUTE_NAME;
		this._index--;
	}
};

Tokenizer.prototype._stateInAttributeName = function(c){
	if(c === "=" || c === "/" || c === ">" || whitespace(c)){
		if(this._index > this._sectionStart){
			this._cbs.onattribname(this._getSection());
		}
		this._sectionStart = -1;
		this._state = AFTER_ATTRIBUTE_NAME;
		this._index--;
	}
};

Tokenizer.prototype._stateAfterAttributeName = function(c){
	if(c === "="){
		this._state = BEFORE_ATTRIBUTE_VALUE;
	} else if(c === "/" || c === ">"){
		this._cbs.onattribend();
		this._state = BEFORE_ATTRIBUTE_NAME;
		this._index--;
	} else if(!whitespace(c)){
		//value-less attribute directly followed by the next attribute name
		this._cbs.onattribend();
		this._state = IN_ATTRIBUTE_NAME;
		this._sectionStart = this._index;
	}
};

Tokenizer.prototype._stateBeforeAttributeValue = function(c){
	if(c === "\""){
		this._state = IN_ATTRIBUTE_VALUE_DQ;
		this._sectionStart = this._index + 1;
	} else if(c === "'"){
		this._state = IN_ATTRIBUTE_VALUE_SQ;
		this._sectionStart = this._index + 1;
	} else if(!whitespace(c)){
		this._state = IN_ATTRIBUTE_VALUE_NQ;
		this._sectionStart = this._index;
	}
};

Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function(c){
	if(c === "\""){
		this._emitToken("onattribdata");
		this._cbs.onattribend();
		this._state = BEFORE_ATTRIBUTE_NAME;
	} else if(this._decodeEntities && c === "&"){
		//flush the value so far, then decode the entity
		this._emitToken("onattribdata");
		this._baseState = this._state;
		this._state = BEFORE_ENTITY;
		this._sectionStart = this._index;
	}
};

Tokenizer.prototype._stateInAttributeValueSingleQuotes = function(c){
	if(c === "'"){
		this._emitToken("onattribdata");
		this._cbs.onattribend();
		this._state = BEFORE_ATTRIBUTE_NAME;
	} else if(this._decodeEntities && c === "&"){
		this._emitToken("onattribdata");
		this._baseState = this._state;
		this._state = BEFORE_ENTITY;
		this._sectionStart = this._index;
	}
};

Tokenizer.prototype._stateInAttributeValueNoQuotes = function(c){
	if(whitespace(c) || c === ">"){
		this._emitToken("onattribdata");
		this._cbs.onattribend();
		this._state = BEFORE_ATTRIBUTE_NAME;
		this._index--;
	} else if(this._decodeEntities && c === "&"){
		this._emitToken("onattribdata");
		this._baseState = this._state;
		this._state = BEFORE_ENTITY;
		this._sectionStart = this._index;
	}
};

Tokenizer.prototype._stateBeforeDeclaration = function(c){
	this._state = c === "[" ? BEFORE_CDATA_1 :
		c === "-" ? BEFORE_COMMENT :
		IN_DECLARATION;
};

Tokenizer.prototype._stateInDeclaration = function(c){
	if(c === ">"){
		this._cbs.ondeclaration(this._getSection());
		this._state = TEXT;
		this._sectionStart = this._index + 1;
	}
};

Tokenizer.prototype._stateInProcessingInstruction = function(c){
	if(c === ">"){
		this._cbs.onprocessinginstruction(this._getSection());
		this._state = TEXT;
		this._sectionStart = this._index + 1;
	}
};

Tokenizer.prototype._stateBeforeComment = function(c){
	if(c === "-"){
		this._state = IN_COMMENT;
		this._sectionStart = this._index + 1;
	} else {
		this._state = IN_DECLARATION;
	}
};

Tokenizer.prototype._stateInComment = function(c){
	if(c === "-") this._state = AFTER_COMMENT_1;
};

Tokenizer.prototype._stateAfterComment1 = ifElseState("-", AFTER_COMMENT_2, IN_COMMENT);

Tokenizer.prototype._stateAfterComment2 = function(c){
	if(c === ">"){
		//remove 2 trailing chars
		this._cbs.oncomment(this._buffer.substring(this._sectionStart, this._index - 2));
		this._state = TEXT;
		this._sectionStart = this._index + 1;
	} else if(c !== "-"){
		this._state = IN_COMMENT;
	}
	// else: stay in AFTER_COMMENT_2 (`--->`)
};

Tokenizer.prototype._stateBeforeCdata1 = ifElseState("C", BEFORE_CDATA_2, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata2 = ifElseState("D", BEFORE_CDATA_3, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata3 = ifElseState("A", BEFORE_CDATA_4, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata4 = ifElseState("T", BEFORE_CDATA_5, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata5 = ifElseState("A", BEFORE_CDATA_6, IN_DECLARATION);

Tokenizer.prototype._stateBeforeCdata6 = function(c){
	if(c === "["){
		this._state = IN_CDATA;
		this._sectionStart = this._index + 1;
	} else {
		this._state = IN_DECLARATION;
	}
};

Tokenizer.prototype._stateInCdata = function(c){
	if(c === "]") this._state = AFTER_CDATA_1;
};

Tokenizer.prototype._stateAfterCdata1 = ifElseState("]", AFTER_CDATA_2, IN_CDATA);

Tokenizer.prototype._stateAfterCdata2 = function(c){
	if(c === ">"){
		//remove 2 trailing chars
		this._cbs.oncdata(this._buffer.substring(this._sectionStart, this._index - 2));
		this._state = TEXT;
		this._sectionStart = this._index + 1;
	} else if(c !== "]"){
		this._state = IN_CDATA;
	}
	//else: stay in AFTER_CDATA_2 (`]]]>`)
};

Tokenizer.prototype._stateBeforeSpecial = function(c){
	if(c === "c" || c === "C"){
		this._state = BEFORE_SCRIPT_1;
	} else if(c === "t" || c === "T"){
		this._state = BEFORE_STYLE_1;
	} else {
		this._state = IN_TAG_NAME;
		this._index--; //consume the token again
	}
};

Tokenizer.prototype._stateBeforeSpecialEnd = function(c){
	if(this._special === SPECIAL_SCRIPT && (c === "c" || c === "C")){
		this._state = AFTER_SCRIPT_1;
	} else if(this._special === SPECIAL_STYLE && (c === "t" || c === "T")){
		this._state = AFTER_STYLE_1;
	}
	else this._state = TEXT;
};

Tokenizer.prototype._stateBeforeScript1 = consumeSpecialNameChar("R", BEFORE_SCRIPT_2);
Tokenizer.prototype._stateBeforeScript2 = consumeSpecialNameChar("I", BEFORE_SCRIPT_3);
Tokenizer.prototype._stateBeforeScript3 = consumeSpecialNameChar("P", BEFORE_SCRIPT_4);
Tokenizer.prototype._stateBeforeScript4 = consumeSpecialNameChar("T", BEFORE_SCRIPT_5);

Tokenizer.prototype._stateBeforeScript5 = function(c){
	//only a complete tag name "script" switches to raw-text mode
	if(c === "/" || c === ">" || whitespace(c)){
		this._special = SPECIAL_SCRIPT;
	}
	this._state = IN_TAG_NAME;
	this._index--; //consume the token again
};

Tokenizer.prototype._stateAfterScript1 = ifElseState("R", AFTER_SCRIPT_2, TEXT);
Tokenizer.prototype._stateAfterScript2 = ifElseState("I", AFTER_SCRIPT_3, TEXT);
Tokenizer.prototype._stateAfterScript3 = ifElseState("P", AFTER_SCRIPT_4, TEXT);
Tokenizer.prototype._stateAfterScript4 = ifElseState("T", AFTER_SCRIPT_5, TEXT);

Tokenizer.prototype._stateAfterScript5 = function(c){
	if(c === ">" || whitespace(c)){
		this._special = SPECIAL_NONE;
		this._state = IN_CLOSING_TAG_NAME;
		this._sectionStart = this._index - 6; //the 6 chars of "script"
		this._index--; //reconsume the token
	}
	else this._state = TEXT;
};

Tokenizer.prototype._stateBeforeStyle1 = consumeSpecialNameChar("Y", BEFORE_STYLE_2);
Tokenizer.prototype._stateBeforeStyle2 = consumeSpecialNameChar("L", BEFORE_STYLE_3);
Tokenizer.prototype._stateBeforeStyle3 = consumeSpecialNameChar("E", BEFORE_STYLE_4);

Tokenizer.prototype._stateBeforeStyle4 = function(c){
	//only a complete tag name "style" switches to raw-text mode
	if(c === "/" || c === ">" || whitespace(c)){
		this._special = SPECIAL_STYLE;
	}
	this._state = IN_TAG_NAME;
	this._index--; //consume the token again
};

Tokenizer.prototype._stateAfterStyle1 = ifElseState("Y", AFTER_STYLE_2, TEXT);
Tokenizer.prototype._stateAfterStyle2 = ifElseState("L", AFTER_STYLE_3, TEXT);
Tokenizer.prototype._stateAfterStyle3 = ifElseState("E", AFTER_STYLE_4, TEXT);

Tokenizer.prototype._stateAfterStyle4 = function(c){
	if(c === ">" || whitespace(c)){
		this._special = SPECIAL_NONE;
		this._state = IN_CLOSING_TAG_NAME;
		this._sectionStart = this._index - 5; //the 5 chars of "style"
		this._index--; //reconsume the token
	}
	else this._state = TEXT;
};

Tokenizer.prototype._stateBeforeEntity = ifElseState("#", BEFORE_NUMERIC_ENTITY, IN_NAMED_ENTITY);
Tokenizer.prototype._stateBeforeNumericEntity = ifElseState("X", IN_HEX_ENTITY, IN_NUMERIC_ENTITY);

// Looks up the text between the "&" at `_sectionStart` and `_index` as a
// complete entity name (used for `;`-terminated entities and for entities
// within attributes). On a match the decoded value is emitted and
// `_sectionStart` is set to `_index + 1`, i.e. the current character is
// skipped.
//
// Returns true when an entity was decoded, false otherwise (in which case
// `_sectionStart` is left untouched).
Tokenizer.prototype._parseNamedEntityStrict = function(){
	//offset = 1
	if(this._sectionStart + 1 < this._index){
		var entity = this._buffer.substring(this._sectionStart + 1, this._index),
			map = this._xmlMode ? xmlMap : entityMap;

		if(hasOwn.call(map, entity)){
			this._emitPartial(map[entity]);
			this._sectionStart = this._index + 1;
			return true;
		}
	}
	return false;
};

// Parses legacy entities (without trailing semicolon): tries the longest
// prefix (6 down to 2 characters) of the text after the "&" at
// `_sectionStart`. On a match the decoded value is emitted and
// `_sectionStart` is moved to the first character after the entity name.
//
// Returns true when an entity was decoded, false otherwise (in which case
// `_sectionStart` is left untouched so the raw text is preserved).
Tokenizer.prototype._parseLegacyEntity = function(){
	var start = this._sectionStart + 1,
		limit = this._index - start;

	if(limit > 6) limit = 6; //the max length of legacy entities is 6

	while(limit >= 2){ //the min length of legacy entities is 2
		var entity = this._buffer.substr(start, limit);

		if(hasOwn.call(legacyMap, entity)){
			this._emitPartial(legacyMap[entity]);
			//skip "&" plus the entity name; nothing else
			this._sectionStart += limit + 1;
			return true;
		} else {
			limit--;
		}
	}
	return false;
};

Tokenizer.prototype._stateInNamedEntity = function(c){
	if(c === ";"){
		//if the strict lookup matched, _sectionStart is now past the ";"
		//and the legacy fallback below is skipped
		this._parseNamedEntityStrict();
		if(this._sectionStart + 1 < this._index && !this._xmlMode){
			this._parseLegacyEntity();
		}
		this._state = this._baseState;
	} else if((c < "a" || c > "z") && (c < "A" || c > "Z") && (c < "0" || c > "9")){
		//the name ended without a semicolon
		if(this._xmlMode){
			//XML requires the semicolon: leave the text undecoded
		} else if(this._baseState !== TEXT){
			//inside an attribute value; "&name=" is never decoded
			if(c !== "=" && this._parseNamedEntityStrict()){
				//the strict parser skipped the current character, but it
				//belongs to the value: include it in the section again.
				//Only done on a match - otherwise _sectionStart still points
				//at the "&" and must not move backwards.
				this._sectionStart--;
			}
		} else {
			//on a match _sectionStart already sits right after the entity;
			//on a miss it still points at "&", so the raw text is kept
			this._parseLegacyEntity();
		}
		this._state = this._baseState;
		this._index--; //re-process the terminating character in the base state
	}
};

// modified version of https://github.com/mathiasbynens/he/blob/master/src/he.js#L94-L119
// Converts a numeric code point into a string. Surrogates and values above
// 0x10FFFF yield U+FFFD; code points listed in decode.json are remapped;
// astral code points are encoded as a surrogate pair.
function decodeCodePoint(codePoint){
	var output = "";

	if((codePoint >= 0xD800 && codePoint <= 0xDFFF) || codePoint > 0x10FFFF){
		return "\uFFFD";
	}

	if(codePoint in decodeMap){
		codePoint = decodeMap[codePoint];
	}

	if(codePoint > 0xFFFF){
		codePoint -= 0x10000;
		output += String.fromCharCode(codePoint >>> 10 & 0x3FF | 0xD800);
		codePoint = 0xDC00 | codePoint & 0x3FF;
	}

	output += String.fromCharCode(codePoint);
	return output;
}

// Decodes the digits between `_sectionStart + offset` and `_index` using the
// given radix, emits the resulting character and returns to the base state.
// `offset` is the length of the prefix to skip (2 for "&#", 3 for "&#x").
Tokenizer.prototype._decodeNumericEntity = function(offset, base){
	var sectionStart = this._sectionStart + offset;

	if(sectionStart !== this._index){
		//parse entity
		var entity = this._buffer.substring(sectionStart, this._index);
		var parsed = parseInt(entity, base);

		if(parsed === parsed){ //not NaN (TODO: when can this happen?)
			this._emitPartial(decodeCodePoint(parsed));
			this._sectionStart = this._index;
		}
	}

	this._state = this._baseState;
};

Tokenizer.prototype._stateInNumericEntity = function(c){
	if(c === ";"){
		this._decodeNumericEntity(2, 10);
		this._sectionStart++;
	} else if(c < "0" || c > "9"){
		if(!this._xmlMode){
			this._decodeNumericEntity(2, 10);
		} else {
			this._state = this._baseState;
		}
		this._index--;
	}
};

Tokenizer.prototype._stateInHexEntity = function(c){
	if(c === ";"){
		this._decodeNumericEntity(3, 16);
		this._sectionStart++;
	} else if((c < "a" || c > "f") && (c < "A" || c > "F") && (c < "0" || c > "9")){
		if(!this._xmlMode){
			this._decodeNumericEntity(3, 16);
		} else {
			this._state = this._baseState;
		}
		this._index--;
	}
};

// Drops the part of `_buffer` that is no longer needed after a parse run and
// flushes pending text. Does nothing while paused: in that case the buffer
// may still contain unparsed input that resume() has to pick up.
Tokenizer.prototype._cleanup = function(){
	if(!this._running){
		return;
	}

	if(this._sectionStart < 0){
		this._buffer = "";
		this._index = 0;
	} else {
		if(this._state === TEXT){
			if(this._sectionStart !== this._index){
				this._cbs.ontext(this._buffer.substr(this._sectionStart));
			}
			this._buffer = "";
			this._index = 0;
		} else if(this._sectionStart === this._index){
			//the section just started
			this._buffer = "";
			this._index = 0;
		} else {
			//remove everything unnecessary
			this._buffer = this._buffer.substr(this._sectionStart);
			this._index -= this._sectionStart;
		}

		this._sectionStart = 0;
	}
};

// Appends `chunk` to the input and tokenizes as much as possible. While the
// tokenizer is paused the chunk is only buffered.
//TODO make events conditional
Tokenizer.prototype.write = function(chunk){
	this._buffer += chunk;
	this._parse();
};

// Runs the state machine over the unread part of `_buffer` until the input is
// exhausted or a callback pauses the tokenizer, then cleans up. If end() was
// requested while paused, the outermost run finishes the stream once all
// input has been consumed.
Tokenizer.prototype._parse = function(){
	var wasParsing = this._parsing;
	this._parsing = true;

	try {
		while(this._index < this._buffer.length && this._running){
			var c = this._buffer.charAt(this._index);
			if(this._state === TEXT){
				this._stateText(c);
			} else if(this._state === BEFORE_TAG_NAME){
				this._stateBeforeTagName(c);
			} else if(this._state === IN_TAG_NAME){
				this._stateInTagName(c);
			} else if(this._state === BEFORE_CLOSING_TAG_NAME){
				this._stateBeforeCloseingTagName(c);
			} else if(this._state === IN_CLOSING_TAG_NAME){
				this._stateInCloseingTagName(c);
			} else if(this._state === AFTER_CLOSING_TAG_NAME){
				this._stateAfterCloseingTagName(c);
			} else if(this._state === IN_SELF_CLOSING_TAG){
				this._stateInSelfClosingTag(c);
			}

			/*
			*	attributes
			*/
			else if(this._state === BEFORE_ATTRIBUTE_NAME){
				this._stateBeforeAttributeName(c);
			} else if(this._state === IN_ATTRIBUTE_NAME){
				this._stateInAttributeName(c);
			} else if(this._state === AFTER_ATTRIBUTE_NAME){
				this._stateAfterAttributeName(c);
			} else if(this._state === BEFORE_ATTRIBUTE_VALUE){
				this._stateBeforeAttributeValue(c);
			} else if(this._state === IN_ATTRIBUTE_VALUE_DQ){
				this._stateInAttributeValueDoubleQuotes(c);
			} else if(this._state === IN_ATTRIBUTE_VALUE_SQ){
				this._stateInAttributeValueSingleQuotes(c);
			} else if(this._state === IN_ATTRIBUTE_VALUE_NQ){
				this._stateInAttributeValueNoQuotes(c);
			}

			/*
			*	declarations
			*/
			else if(this._state === BEFORE_DECLARATION){
				this._stateBeforeDeclaration(c);
			} else if(this._state === IN_DECLARATION){
				this._stateInDeclaration(c);
			}

			/*
			*	processing instructions
			*/
			else if(this._state === IN_PROCESSING_INSTRUCTION){
				this._stateInProcessingInstruction(c);
			}

			/*
			*	comments
			*/
			else if(this._state === BEFORE_COMMENT){
				this._stateBeforeComment(c);
			} else if(this._state === IN_COMMENT){
				this._stateInComment(c);
			} else if(this._state === AFTER_COMMENT_1){
				this._stateAfterComment1(c);
			} else if(this._state === AFTER_COMMENT_2){
				this._stateAfterComment2(c);
			}

			/*
			*	cdata
			*/
			else if(this._state === BEFORE_CDATA_1){
				this._stateBeforeCdata1(c);
			} else if(this._state === BEFORE_CDATA_2){
				this._stateBeforeCdata2(c);
			} else if(this._state === BEFORE_CDATA_3){
				this._stateBeforeCdata3(c);
			} else if(this._state === BEFORE_CDATA_4){
				this._stateBeforeCdata4(c);
			} else if(this._state === BEFORE_CDATA_5){
				this._stateBeforeCdata5(c);
			} else if(this._state === BEFORE_CDATA_6){
				this._stateBeforeCdata6(c);
			} else if(this._state === IN_CDATA){
				this._stateInCdata(c);
			} else if(this._state === AFTER_CDATA_1){
				this._stateAfterCdata1(c);
			} else if(this._state === AFTER_CDATA_2){
				this._stateAfterCdata2(c);
			}

			/*
			*	special tags
			*/
			else if(this._state === BEFORE_SPECIAL){
				this._stateBeforeSpecial(c);
			} else if(this._state === BEFORE_SPECIAL_END){
				this._stateBeforeSpecialEnd(c);
			}

			/*
			*	script
			*/
			else if(this._state === BEFORE_SCRIPT_1){
				this._stateBeforeScript1(c);
			} else if(this._state === BEFORE_SCRIPT_2){
				this._stateBeforeScript2(c);
			} else if(this._state === BEFORE_SCRIPT_3){
				this._stateBeforeScript3(c);
			} else if(this._state === BEFORE_SCRIPT_4){
				this._stateBeforeScript4(c);
			} else if(this._state === BEFORE_SCRIPT_5){
				this._stateBeforeScript5(c);
			}

			else if(this._state === AFTER_SCRIPT_1){
				this._stateAfterScript1(c);
			} else if(this._state === AFTER_SCRIPT_2){
				this._stateAfterScript2(c);
			} else if(this._state === AFTER_SCRIPT_3){
				this._stateAfterScript3(c);
			} else if(this._state === AFTER_SCRIPT_4){
				this._stateAfterScript4(c);
			} else if(this._state === AFTER_SCRIPT_5){
				this._stateAfterScript5(c);
			}

			/*
			*	style
			*/
			else if(this._state === BEFORE_STYLE_1){
				this._stateBeforeStyle1(c);
			} else if(this._state === BEFORE_STYLE_2){
				this._stateBeforeStyle2(c);
			} else if(this._state === BEFORE_STYLE_3){
				this._stateBeforeStyle3(c);
			} else if(this._state === BEFORE_STYLE_4){
				this._stateBeforeStyle4(c);
			}

			else if(this._state === AFTER_STYLE_1){
				this._stateAfterStyle1(c);
			} else if(this._state === AFTER_STYLE_2){
				this._stateAfterStyle2(c);
			} else if(this._state === AFTER_STYLE_3){
				this._stateAfterStyle3(c);
			} else if(this._state === AFTER_STYLE_4){
				this._stateAfterStyle4(c);
			}

			/*
			*	entities
			*/
			else if(this._state === BEFORE_ENTITY){
				this._stateBeforeEntity(c);
			} else if(this._state === BEFORE_NUMERIC_ENTITY){
				this._stateBeforeNumericEntity(c);
			} else if(this._state === IN_NAMED_ENTITY){
				this._stateInNamedEntity(c);
			} else if(this._state === IN_NUMERIC_ENTITY){
				this._stateInNumericEntity(c);
			} else if(this._state === IN_HEX_ENTITY){
				this._stateInHexEntity(c);
			}

			else {
				this._cbs.onerror(Error("unknown _state"), this._state);
			}

			this._index++;
		}
	} finally {
		//restore even when a callback throws, so resume() keeps working
		this._parsing = wasParsing;
	}

	this._cleanup();

	//an end() issued while paused is completed once parsing caught up
	if(!wasParsing && this._endPending && this._running){
		this._finish();
	}
};

// Stops tokenizing after the character currently being processed. Input that
// has not been consumed yet stays buffered until resume() is called.
Tokenizer.prototype.pause = function(){
	this._running = false;
};

// Continues tokenizing the input that was buffered while paused. When called
// from inside a callback of a running parse loop, that loop simply carries
// on; no nested loop is started.
Tokenizer.prototype.resume = function(){
	this._running = true;

	if(!this._parsing){
		this._parse();
	}
};

// Signals the end of the input, optionally writing a final `chunk` first.
// Trailing data is flushed and `onend` is called; while paused this is
// postponed until the tokenizer has been resumed and caught up.
Tokenizer.prototype.end = function(chunk){
	if(chunk) this.write(chunk);

	if(this._running){
		this._finish();
	} else {
		this._endPending = true;
	}
};

// Flushes whatever is left in the open section and reports the end of input.
Tokenizer.prototype._finish = function(){
	this._endPending = false;

	//if there is remaining data, emit it in a reasonable way
	if(this._sectionStart < this._index){
		this._handleTrailingData();
	}

	this._cbs.onend();
};

// Emits the unfinished section at the end of the input through the callback
// matching the state the tokenizer stopped in.
Tokenizer.prototype._handleTrailingData = function(){
	var data = this._buffer.substr(this._sectionStart);

	if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){
		this._cbs.oncdata(data);
	} else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){
		this._cbs.oncomment(data);
	} else if(this._state === IN_TAG_NAME){
		this._cbs.onopentagname(data);
	} else if(this._state === BEFORE_ATTRIBUTE_NAME || this._state === BEFORE_ATTRIBUTE_VALUE || this._state === AFTER_ATTRIBUTE_NAME){
		this._cbs.onopentagend();
	} else if(this._state === IN_ATTRIBUTE_NAME){
		this._cbs.onattribname(data);
	} else if(this._state === IN_ATTRIBUTE_VALUE_SQ || this._state === IN_ATTRIBUTE_VALUE_DQ || this._state === IN_ATTRIBUTE_VALUE_NQ){
		this._cbs.onattribdata(data);
		this._cbs.onattribend();
	} else if(this._state === IN_CLOSING_TAG_NAME){
		this._cbs.onclosetag(data);
	} else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){
		//decode a trailing legacy entity, then emit the rest (or, when
		//nothing matched, the raw "&name") in the base state
		this._parseLegacyEntity();
		if(this._sectionStart < this._index){
			this._state = this._baseState;
			this._handleTrailingData();
		}
	} else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){
		this._decodeNumericEntity(2, 10);
		if(this._sectionStart < this._index){
			this._state = this._baseState;
			this._handleTrailingData();
		}
	} else if(this._state === IN_HEX_ENTITY && !this._xmlMode){
		this._decodeNumericEntity(3, 16);
		if(this._sectionStart < this._index){
			this._state = this._baseState;
			this._handleTrailingData();
		}
	} else {
		this._cbs.ontext(data);
	}
};

// Re-initialises the tokenizer, keeping its options and callbacks.
Tokenizer.prototype.reset = function(){
	Tokenizer.call(this, {xmlMode: this._xmlMode, decodeEntities: this._decodeEntities}, this._cbs);
};

// Returns the text of the currently open section.
Tokenizer.prototype._getSection = function(){
	return this._buffer.substring(this._sectionStart, this._index);
};

// Passes the current section to the callback `name` and closes the section.
Tokenizer.prototype._emitToken = function(name){
	this._cbs[name](this._getSection());
	this._sectionStart = -1;
};

// Emits a decoded entity as attribute data or as text, depending on where
// the entity was encountered.
Tokenizer.prototype._emitPartial = function(value){
	if(this._baseState !== TEXT){
		this._cbs.onattribdata(value); //TODO implement the new event
	} else {
		this._cbs.ontext(value);
	}
};