diff options
author | Florian Dold <florian.dold@gmail.com> | 2017-05-03 15:35:00 +0200 |
---|---|---|
committer | Florian Dold <florian.dold@gmail.com> | 2017-05-03 15:35:00 +0200 |
commit | de98e0b232509d5f40c135d540a70e415272ff85 (patch) | |
tree | a79222a5b58484ab3b80d18efcaaa7ccc4769b33 /node_modules/html-minifier/src/htmlparser.js | |
parent | e0c9d480a73fa629c1e4a47d3e721f1d2d345406 (diff) |
node_modules
Diffstat (limited to 'node_modules/html-minifier/src/htmlparser.js')
-rw-r--r-- | node_modules/html-minifier/src/htmlparser.js | 528 |
1 files changed, 528 insertions, 0 deletions
diff --git a/node_modules/html-minifier/src/htmlparser.js b/node_modules/html-minifier/src/htmlparser.js new file mode 100644 index 000000000..2195347dc --- /dev/null +++ b/node_modules/html-minifier/src/htmlparser.js @@ -0,0 +1,528 @@ +/*! + * HTML Parser By John Resig (ejohn.org) + * Modified by Juriy "kangax" Zaytsev + * Original code by Erik Arvidsson, Mozilla Public License + * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js + */ + +/* + * // Use like so: + * HTMLParser(htmlString, { + * start: function(tag, attrs, unary) {}, + * end: function(tag) {}, + * chars: function(text) {}, + * comment: function(text) {} + * }); + * + * // or to get an XML string: + * HTMLtoXML(htmlString); + * + * // or to get an XML DOM Document + * HTMLtoDOM(htmlString); + * + * // or to inject into an existing document/DOM node + * HTMLtoDOM(htmlString, document); + * HTMLtoDOM(htmlString, document.body); + * + */ + + /* global ActiveXObject, DOMDocument */ + +'use strict'; + +var createMapFromString = require('./utils').createMapFromString; + +function makeMap(values) { + return createMapFromString(values, true); +} + +// Regular Expressions for parsing tags and attributes +var singleAttrIdentifier = /([^\s"'<>/=]+)/, + singleAttrAssigns = [/=/], + singleAttrValues = [ + // attr value double quotes + /"([^"]*)"+/.source, + // attr value, single quotes + /'([^']*)'+/.source, + // attr value, no quotes + /([^ \t\n\f\r"'`=<>]+)/.source + ], + // https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-QName + qnameCapture = (function() { + var ncname = require('ncname').source.slice(1, -1); + return '((?:' + ncname + '\\:)?' + ncname + ')'; + })(), + startTagOpen = new RegExp('^<' + qnameCapture), + startTagClose = /^\s*(\/?)>/, + endTag = new RegExp('^<\\/' + qnameCapture + '[^>]*>'), + doctype = /^<!DOCTYPE [^>]+>/i; + +var IS_REGEX_CAPTURING_BROKEN = false; +'x'.replace(/x(.)?/g, function(m, g) { + IS_REGEX_CAPTURING_BROKEN = g === ''; +}); + +// Empty Elements +var empty = makeMap('area,base,basefont,br,col,embed,frame,hr,img,input,isindex,keygen,link,meta,param,source,track,wbr'); + +// Inline Elements +var inline = makeMap('a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,noscript,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,svg,textarea,tt,u,var'); + +// Elements that you can, intentionally, leave open +// (and which close themselves) +var closeSelf = makeMap('colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr,source'); + +// Attributes that have their values filled in disabled='disabled' +var fillAttrs = makeMap('checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected'); + +// Special Elements (can contain anything) +var special = makeMap('script,style'); + +// HTML5 tags https://html.spec.whatwg.org/multipage/indices.html#elements-3 +// Phrasing Content https://html.spec.whatwg.org/multipage/dom.html#phrasing-content +var nonPhrasing = makeMap('address,article,aside,base,blockquote,body,caption,col,colgroup,dd,details,dialog,div,dl,dt,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,head,header,hgroup,hr,html,legend,li,menuitem,meta,optgroup,option,param,rp,rt,source,style,summary,tbody,td,tfoot,th,thead,title,tr,track'); + +var reCache = {}; + +function attrForHandler(handler) { + var pattern = singleAttrIdentifier.source + + '(?:\\s*(' + joinSingleAttrAssigns(handler) + ')' + + '[ \\t\\n\\f\\r]*(?:' + singleAttrValues.join('|') + '))?'; + if (handler.customAttrSurround) { + var attrClauses = []; + for (var i = handler.customAttrSurround.length - 1; i >= 0; i--) { + attrClauses[i] = '(?:' + + '(' + handler.customAttrSurround[i][0].source + ')\\s*' + + pattern + + '\\s*(' + handler.customAttrSurround[i][1].source + ')' + + ')'; + } + attrClauses.push('(?:' + pattern + ')'); + pattern = '(?:' + attrClauses.join('|') + ')'; + } + return new RegExp('^\\s*' + pattern); +} + +function joinSingleAttrAssigns(handler) { + return singleAttrAssigns.concat( + handler.customAttrAssign || [] + ).map(function(assign) { + return '(?:' + assign.source + ')'; + }).join('|'); +} + +function HTMLParser(html, handler) { + var stack = [], lastTag; + var attribute = attrForHandler(handler); + var last, prevTag, nextTag; + while (html) { + last = html; + // Make sure we're not in a script or style element + if (!lastTag || !special(lastTag)) { + var textEnd = html.indexOf('<'); + if (textEnd === 0) { + // Comment: + if (/^<!--/.test(html)) { + var commentEnd = html.indexOf('-->'); + + if (commentEnd >= 0) { + if (handler.comment) { + handler.comment(html.substring(4, commentEnd)); + } + html = html.substring(commentEnd + 3); + prevTag = ''; + continue; + } + } + + // http://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment + if (/^<!\[/.test(html)) { + var conditionalEnd = html.indexOf(']>'); + + if (conditionalEnd >= 0) { + if (handler.comment) { + handler.comment(html.substring(2, conditionalEnd + 1), true /* non-standard */); + } + html = html.substring(conditionalEnd + 2); + prevTag = ''; + continue; + } + } + + // Doctype: + var doctypeMatch = html.match(doctype); + if (doctypeMatch) { + if (handler.doctype) { + handler.doctype(doctypeMatch[0]); + } + html = html.substring(doctypeMatch[0].length); + prevTag = ''; + continue; + } + + // End tag: + var endTagMatch = html.match(endTag); + if (endTagMatch) { + html = html.substring(endTagMatch[0].length); + endTagMatch[0].replace(endTag, parseEndTag); + prevTag = '/' + endTagMatch[1].toLowerCase(); + continue; + } + + // Start tag: + var startTagMatch = parseStartTag(html); + if (startTagMatch) { + html = startTagMatch.rest; + handleStartTag(startTagMatch); + prevTag = startTagMatch.tagName.toLowerCase(); + continue; + } + } + + var text; + if (textEnd >= 0) { + text = html.substring(0, textEnd); + html = html.substring(textEnd); + } + else { + text = html; + html = ''; + } + + // next tag + var nextTagMatch = parseStartTag(html); + if (nextTagMatch) { + nextTag = nextTagMatch.tagName; + } + else { + nextTagMatch = html.match(endTag); + if (nextTagMatch) { + nextTag = '/' + nextTagMatch[1]; + } + else { + nextTag = ''; + } + } + + if (handler.chars) { + handler.chars(text, prevTag, nextTag); + } + prevTag = ''; + + } + else { + var stackedTag = lastTag.toLowerCase(); + var reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)</' + stackedTag + '[^>]*>', 'i')); + + html = html.replace(reStackedTag, function(all, text) { + if (stackedTag !== 'script' && stackedTag !== 'style' && stackedTag !== 'noscript') { + text = text + .replace(/<!--([\s\S]*?)-->/g, '$1') + .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, '$1'); + } + + if (handler.chars) { + handler.chars(text); + } + + return ''; + }); + + parseEndTag('</' + stackedTag + '>', stackedTag); + } + + if (html === last) { + throw new Error('Parse Error: ' + html); + } + } + + if (!handler.partialMarkup) { + // Clean up any remaining tags + parseEndTag(); + } + + function parseStartTag(input) { + var start = input.match(startTagOpen); + if (start) { + var match = { + tagName: start[1], + attrs: [] + }; + input = input.slice(start[0].length); + var end, attr; + while (!(end = input.match(startTagClose)) && (attr = input.match(attribute))) { + input = input.slice(attr[0].length); + match.attrs.push(attr); + } + if (end) { + match.unarySlash = end[1]; + match.rest = input.slice(end[0].length); + return match; + } + } + } + + function handleStartTag(match) { + var tagName = match.tagName; + var unarySlash = match.unarySlash; + + if (handler.html5 && lastTag === 'p' && nonPhrasing(tagName)) { + parseEndTag('', lastTag); + } + + if (!handler.html5) { + while (lastTag && inline(lastTag)) { + parseEndTag('', lastTag); + } + } + + if (closeSelf(tagName) && lastTag === tagName) { + parseEndTag('', tagName); + } + + var unary = empty(tagName) || tagName === 'html' && lastTag === 'head' || !!unarySlash; + + var attrs = match.attrs.map(function(args) { + var name, value, customOpen, customClose, customAssign, quote; + var ncp = 7; // number of captured parts, scalar + + // hackish work around FF bug https://bugzilla.mozilla.org/show_bug.cgi?id=369778 + if (IS_REGEX_CAPTURING_BROKEN && args[0].indexOf('""') === -1) { + if (args[3] === '') { delete args[3]; } + if (args[4] === '') { delete args[4]; } + if (args[5] === '') { delete args[5]; } + } + + function populate(index) { + customAssign = args[index]; + value = args[index + 1]; + if (typeof value !== 'undefined') { + return '"'; + } + value = args[index + 2]; + if (typeof value !== 'undefined') { + return '\''; + } + value = args[index + 3]; + if (typeof value === 'undefined' && fillAttrs(name)) { + value = name; + } + return ''; + } + + var j = 1; + if (handler.customAttrSurround) { + for (var i = 0, l = handler.customAttrSurround.length; i < l; i++, j += ncp) { + name = args[j + 1]; + if (name) { + quote = populate(j + 2); + customOpen = args[j]; + customClose = args[j + 6]; + break; + } + } + } + + if (!name && (name = args[j])) { + quote = populate(j + 1); + } + + return { + name: name, + value: value, + customAssign: customAssign || '=', + customOpen: customOpen || '', + customClose: customClose || '', + quote: quote || '' + }; + }); + + if (!unary) { + stack.push({ tag: tagName, attrs: attrs }); + lastTag = tagName; + unarySlash = ''; + } + + if (handler.start) { + handler.start(tagName, attrs, unary, unarySlash); + } + } + + function parseEndTag(tag, tagName) { + var pos; + + // Find the closest opened tag of the same type + if (tagName) { + var needle = tagName.toLowerCase(); + for (pos = stack.length - 1; pos >= 0; pos--) { + if (stack[pos].tag.toLowerCase() === needle) { + break; + } + } + } + // If no tag name is provided, clean shop + else { + pos = 0; + } + + if (pos >= 0) { + // Close all the open elements, up the stack + for (var i = stack.length - 1; i >= pos; i--) { + if (handler.end) { + handler.end(stack[i].tag, stack[i].attrs, i > pos || !tag); + } + } + + // Remove the open elements from the stack + stack.length = pos; + lastTag = pos && stack[pos - 1].tag; + } + else if (tagName.toLowerCase() === 'br') { + if (handler.start) { + handler.start(tagName, [], true, ''); + } + } + else if (tagName.toLowerCase() === 'p') { + if (handler.start) { + handler.start(tagName, [], false, '', true); + } + if (handler.end) { + handler.end(tagName, []); + } + } + } +} + +exports.HTMLParser = HTMLParser; +exports.HTMLtoXML = function(html) { + var results = ''; + + new HTMLParser(html, { + start: function(tag, attrs, unary) { + results += '<' + tag; + + for (var i = 0, len = attrs.length; i < len; i++) { + results += ' ' + attrs[i].name + '="' + (attrs[i].value || '').replace(/"/g, '"') + '"'; + } + + results += (unary ? '/' : '') + '>'; + }, + end: function(tag) { + results += '</' + tag + '>'; + }, + chars: function(text) { + results += text; + }, + comment: function(text) { + results += '<!--' + text + '-->'; + }, + ignore: function(text) { + results += text; + } + }); + + return results; +}; + +exports.HTMLtoDOM = function(html, doc) { + // There can be only one of these elements + var one = { + html: true, + head: true, + body: true, + title: true + }; + + // Enforce a structure for the document + var structure = { + link: 'head', + base: 'head' + }; + + if (doc) { + doc = doc.ownerDocument || doc.getOwnerDocument && doc.getOwnerDocument() || doc; + } + else if (typeof DOMDocument !== 'undefined') { + doc = new DOMDocument(); + } + else if (typeof document !== 'undefined' && document.implementation && document.implementation.createDocument) { + doc = document.implementation.createDocument('', '', null); + } + else if (typeof ActiveX !== 'undefined') { + doc = new ActiveXObject('Msxml.DOMDocument'); + } + + var elems = [], + documentElement = doc.documentElement || + doc.getDocumentElement && doc.getDocumentElement(); + + // If we're dealing with an empty document then we + // need to pre-populate it with the HTML document structure + if (!documentElement && doc.createElement) { + (function() { + var html = doc.createElement('html'); + var head = doc.createElement('head'); + head.appendChild(doc.createElement('title')); + html.appendChild(head); + html.appendChild(doc.createElement('body')); + doc.appendChild(html); + })(); + } + + // Find all the unique elements + if (doc.getElementsByTagName) { + for (var i in one) { + one[i] = doc.getElementsByTagName(i)[0]; + } + } + + // If we're working with a document, inject contents into + // the body element + var curParentNode = one.body; + + new HTMLParser(html, { + start: function(tagName, attrs, unary) { + // If it's a pre-built element, then we can ignore + // its construction + if (one[tagName]) { + curParentNode = one[tagName]; + return; + } + + var elem = doc.createElement(tagName); + + for (var attr in attrs) { + elem.setAttribute(attrs[attr].name, attrs[attr].value); + } + + if (structure[tagName] && typeof one[structure[tagName]] !== 'boolean') { + one[structure[tagName]].appendChild(elem); + } + else if (curParentNode && curParentNode.appendChild) { + curParentNode.appendChild(elem); + } + + if (!unary) { + elems.push(elem); + curParentNode = elem; + } + }, + end: function(/* tag */) { + elems.length -= 1; + + // Init the new parentNode + curParentNode = elems[elems.length - 1]; + }, + chars: function(text) { + curParentNode.appendChild(doc.createTextNode(text)); + }, + comment: function(/* text */) { + // create comment node + }, + ignore: function(/* text */) { + // What to do here? + } + }); + + return doc; +}; |