From abd94a7f5a50f43c797a11b53549ae48fff667c3 Mon Sep 17 00:00:00 2001
From: Florian Dold
Date: Mon, 10 Oct 2016 03:43:44 +0200
Subject: add node_modules to address #4364

---
 node_modules/gettext-parser/lib/poparser.js | 525 ++++++++++++++++++++++++++++
 1 file changed, 525 insertions(+)
 create mode 100644 node_modules/gettext-parser/lib/poparser.js

diff --git a/node_modules/gettext-parser/lib/poparser.js b/node_modules/gettext-parser/lib/poparser.js
new file mode 100644
index 000000000..e215bca08
--- /dev/null
+++ b/node_modules/gettext-parser/lib/poparser.js
@@ -0,0 +1,525 @@
+'use strict';
+
+var encoding = require('encoding');
+var sharedFuncs = require('./shared');
+var Transform = require('stream').Transform;
+var util = require('util');
+
+/**
+ * Parses a PO object into a translation table
+ *
+ * @param {Buffer|String} buffer PO object
+ * @param {String} [defaultCharset] Default charset to use
+ * @return {Object} Translation object
+ */
+module.exports.parse = function(buffer, defaultCharset) {
+    var parser = new Parser(buffer, defaultCharset);
+    return parser.parse();
+};
+
+/**
+ * Parses a PO stream, emits the translation table in object mode
+ *
+ * @param {String} [defaultCharset] Default charset to use
+ * @param {Object} [options] Stream options
+ * @return {Stream} Transform stream
+ */
+module.exports.stream = function(defaultCharset, options) {
+    return new PoParserTransform(defaultCharset, options);
+};
+
+/**
+ * Creates a PO parser object. If the PO object is a string,
+ * UTF-8 will be used as the charset
+ *
+ * @constructor
+ * @param {Buffer|String} fileContents PO object
+ * @param {String} [defaultCharset] Default charset to use
+ */
+function Parser(fileContents, defaultCharset) {
+
+    this._charset = defaultCharset || 'iso-8859-1';
+
+    this._lex = [];
+    this._escaped = false;
+    this._node = null;
+    this._state = this.states.none;
+
+    if (typeof fileContents === 'string') {
+        this._charset = 'utf-8';
+        this._fileContents = fileContents;
+    } else {
+        this._handleCharset(fileContents);
+    }
+}
+
+/**
+ * Parses the PO object and returns the translation table
+ *
+ * @return {Object} Translation table
+ */
+Parser.prototype.parse = function() {
+    this._lexer(this._fileContents);
+    return this._finalize(this._lex);
+};
+
+/**
+ * Detects the charset for the PO file from its header block
+ *
+ * @param {Buffer} buf PO file contents
+ */
+Parser.prototype._handleCharset = function(buf) {
+    var str = (buf || '').toString(),
+        pos, headers = '',
+        match;
+
+    if ((pos = str.search(/^\s*msgid/im)) >= 0) {
+        if ((pos = pos + str.substr(pos + 5).search(/^\s*(msgid|msgctxt)/im))) {
+            headers = str.substr(0, pos);
+        }
+    }
+
+    if ((match = headers.match(/[; ]charset\s*=\s*([\w\-]+)(?:[\s;]|\\n)*"\s*$/mi))) {
+        this._charset = sharedFuncs.formatCharset(match[1], this._charset);
+    }
+
+    if (this._charset === 'utf-8') {
+        this._fileContents = str;
+    } else {
+        this._fileContents = this._toString(buf);
+    }
+};
+
+Parser.prototype._toString = function(buf) {
+    return encoding.convert(buf, 'utf-8', this._charset).toString('utf-8');
+};
+
+/**
+ * State constants for the parsing FSM
+ */
+Parser.prototype.states = {
+    none: 0x01,
+    comments: 0x02,
+    key: 0x03,
+    string: 0x04
+};
+
+/**
+ * Value types for the lexer
+ */
+Parser.prototype.types = {
+    comments: 0x01,
+    key: 0x02,
+    string: 0x03
+};
+
+/**
+ * String matches for the lexer
+ */
+Parser.prototype.symbols = {
+    quotes: /["']/,
+    comments: /\#/,
+    whitespace: /\s/,
+    key: /[\w\-\[\]]/
+};
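The two exports above give a synchronous buffer/string API and a streaming API. A minimal usage sketch of parse() follows (illustrative only, not part of the patch; the require path assumes the vendored location this commit adds, and the PO content and the src/app.js:1 reference are made-up examples):

    var poParser = require('./node_modules/gettext-parser/lib/poparser');

    var po = [
        'msgid ""',
        'msgstr "Content-Type: text/plain; charset=UTF-8\\n"',
        '',
        '#: src/app.js:1',
        'msgid "Hello"',
        'msgstr "Hallo"'
    ].join('\n');

    var table = poParser.parse(po);
    console.log(table.translations['']['Hello'].msgstr); // [ 'Hallo' ]

Because the input is a string, the parser skips charset detection and assumes UTF-8, as the constructor below shows.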
+
+/**
+ * Token parser. Parsed state can be found in this._lex
+ *
+ * @param {String} chunk String
+ */
+Parser.prototype._lexer = function(chunk) {
+    var chr;
+
+    for (var i = 0, len = chunk.length; i < len; i++) {
+        chr = chunk.charAt(i);
+        switch (this._state) {
+            case this.states.none:
+                if (chr.match(this.symbols.quotes)) {
+                    this._node = {
+                        type: this.types.string,
+                        value: '',
+                        quote: chr
+                    };
+                    this._lex.push(this._node);
+                    this._state = this.states.string;
+                } else if (chr.match(this.symbols.comments)) {
+                    this._node = {
+                        type: this.types.comments,
+                        value: ''
+                    };
+                    this._lex.push(this._node);
+                    this._state = this.states.comments;
+                } else if (!chr.match(this.symbols.whitespace)) {
+                    this._node = {
+                        type: this.types.key,
+                        value: chr
+                    };
+                    this._lex.push(this._node);
+                    this._state = this.states.key;
+                }
+                break;
+            case this.states.comments:
+                if (chr === '\n') {
+                    this._state = this.states.none;
+                } else if (chr !== '\r') {
+                    this._node.value += chr;
+                }
+                break;
+            case this.states.string:
+                if (this._escaped) {
+                    switch (chr) {
+                        case 't':
+                            this._node.value += '\t';
+                            break;
+                        case 'n':
+                            this._node.value += '\n';
+                            break;
+                        case 'r':
+                            this._node.value += '\r';
+                            break;
+                        default:
+                            this._node.value += chr;
+                    }
+                    this._escaped = false;
+                } else {
+                    if (chr === this._node.quote) {
+                        this._state = this.states.none;
+                    } else if (chr === '\\') {
+                        this._escaped = true;
+                        break;
+                    } else {
+                        this._node.value += chr;
+                    }
+                    this._escaped = false;
+                }
+                break;
+            case this.states.key:
+                if (!chr.match(this.symbols.key)) {
+                    this._state = this.states.none;
+                    i--;
+                } else {
+                    this._node.value += chr;
+                }
+                break;
+        }
+    }
+};
+
+/**
+ * Joins multi-line strings
+ *
+ * @param {Object} tokens Parsed tokens
+ * @return {Object} Parsed tokens, with multi-line strings joined into one
+ */
+Parser.prototype._joinStringValues = function(tokens) {
+    var lastNode,
+        response = [];
+
+    for (var i = 0, len = tokens.length; i < len; i++) {
+        if (lastNode && tokens[i].type === this.types.string && lastNode.type === this.types.string) {
+            lastNode.value += tokens[i].value;
+        } else if (lastNode && tokens[i].type === this.types.comments && lastNode.type === this.types.comments) {
+            lastNode.value += '\n' + tokens[i].value;
+        } else {
+            response.push(tokens[i]);
+            lastNode = tokens[i];
+        }
+    }
+
+    return response;
+};
+
+/**
+ * Parses comments into separate comment blocks
+ *
+ * @param {Object} tokens Parsed tokens
+ */
+Parser.prototype._parseComments = function(tokens) {
+    // parse comments
+    tokens.forEach((function(node) {
+        var comment, lines;
+
+        if (node && node.type === this.types.comments) {
+            comment = {
+                translator: [],
+                extracted: [],
+                reference: [],
+                flag: [],
+                previous: []
+            };
+            lines = (node.value || '').split(/\n/);
+            lines.forEach(function(line) {
+                switch (line.charAt(0) || '') {
+                    case ':':
+                        comment.reference.push(line.substr(1).trim());
+                        break;
+                    case '.':
+                        comment.extracted.push(line.substr(1).replace(/^\s+/, ''));
+                        break;
+                    case ',':
+                        comment.flag.push(line.substr(1).replace(/^\s+/, ''));
+                        break;
+                    case '|':
+                        comment.previous.push(line.substr(1).replace(/^\s+/, ''));
+                        break;
+                    default:
+                        comment.translator.push(line.replace(/^\s+/, ''));
+                }
+            });
+
+            node.value = {};
+
+            Object.keys(comment).forEach(function(key) {
+                if (comment[key] && comment[key].length) {
+                    node.value[key] = comment[key].join('\n');
+                }
+            });
+        }
+    }).bind(this));
+};
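To make the lexer's output concrete, here is the token stream it would build for a small three-line entry (an illustrative trace, not part of the patch; the type codes refer to Parser.prototype.types above):

    // PO input:
    //   #, fuzzy
    //   msgid "Hi"
    //   msgstr "Hei"
    //
    // Resulting this._lex, before _joinStringValues/_parseComments run:
    var tokens = [
        { type: 0x01, value: ', fuzzy' },          // comments
        { type: 0x02, value: 'msgid' },            // key
        { type: 0x03, value: 'Hi', quote: '"' },   // string
        { type: 0x02, value: 'msgstr' },           // key
        { type: 0x03, value: 'Hei', quote: '"' }   // string
    ];

_parseComments then rewrites the comment node's value into an object, here { flag: 'fuzzy' }.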
+
+/**
+ * Joins gettext keys with values
+ *
+ * @param {Object} tokens Parsed tokens
+ * @return {Object} Tokens
+ */
+Parser.prototype._handleKeys = function(tokens) {
+    var response = [],
+        lastNode;
+
+    for (var i = 0, len = tokens.length; i < len; i++) {
+        if (tokens[i].type === this.types.key) {
+            lastNode = {
+                key: tokens[i].value
+            };
+            if (i && tokens[i - 1].type === this.types.comments) {
+                lastNode.comments = tokens[i - 1].value;
+            }
+            lastNode.value = '';
+            response.push(lastNode);
+        } else if (tokens[i].type === this.types.string && lastNode) {
+            lastNode.value += tokens[i].value;
+        }
+    }
+
+    return response;
+};
+
+/**
+ * Separates different values into individual translation objects
+ *
+ * @param {Object} tokens Parsed tokens
+ * @return {Object} Tokens
+ */
+Parser.prototype._handleValues = function(tokens) {
+    var response = [],
+        lastNode, curContext, curComments;
+
+    for (var i = 0, len = tokens.length; i < len; i++) {
+        if (tokens[i].key.toLowerCase() === 'msgctxt') {
+            curContext = tokens[i].value;
+            curComments = tokens[i].comments;
+        } else if (tokens[i].key.toLowerCase() === 'msgid') {
+            lastNode = {
+                msgid: tokens[i].value
+            };
+
+            if (curContext) {
+                lastNode.msgctxt = curContext;
+            }
+
+            if (curComments) {
+                lastNode.comments = curComments;
+            }
+
+            if (tokens[i].comments && !lastNode.comments) {
+                lastNode.comments = tokens[i].comments;
+            }
+
+            curContext = false;
+            curComments = false;
+            response.push(lastNode);
+        } else if (tokens[i].key.toLowerCase() === 'msgid_plural') {
+            if (lastNode) {
+                lastNode.msgid_plural = tokens[i].value;
+
+                if (tokens[i].comments && !lastNode.comments) {
+                    lastNode.comments = tokens[i].comments;
+                }
+            }
+
+            curContext = false;
+            curComments = false;
+        } else if (tokens[i].key.substr(0, 6).toLowerCase() === 'msgstr') {
+            if (lastNode) {
+                lastNode.msgstr = (lastNode.msgstr || []).concat(tokens[i].value);
+
+                if (tokens[i].comments && !lastNode.comments) {
+                    lastNode.comments = tokens[i].comments;
+                }
+            }
+
+            curContext = false;
+            curComments = false;
+        }
+    }
+
+    return response;
+};
+
+/**
+ * Composes a translation table from the tokens object
+ *
+ * @param {Object} tokens Parsed tokens
+ * @return {Object} Translation table
+ */
+Parser.prototype._normalize = function(tokens) {
+    var msgctxt,
+        table = {
+            charset: this._charset,
+            headers: undefined,
+            translations: {}
+        };
+
+    for (var i = 0, len = tokens.length; i < len; i++) {
+        msgctxt = tokens[i].msgctxt || '';
+
+        if (!table.translations[msgctxt]) {
+            table.translations[msgctxt] = {};
+        }
+
+        if (!table.headers && !msgctxt && !tokens[i].msgid) {
+            table.headers = sharedFuncs.parseHeader(tokens[i].msgstr[0]);
+        }
+
+        table.translations[msgctxt][tokens[i].msgid] = tokens[i];
+    }
+
+    return table;
+};
+
+/**
+ * Converts parsed tokens to a translation table
+ *
+ * @param {Object} tokens Parsed tokens
+ * @return {Object} Translation table
+ */
+Parser.prototype._finalize = function(tokens) {
+    var data = this._joinStringValues(tokens);
+
+    this._parseComments(data);
+
+    data = this._handleKeys(data);
+    data = this._handleValues(data);
+
+    return this._normalize(data);
+};
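Putting the pipeline together, the table returned by _finalize() has the following shape (a sketch; the values shown are examples, not part of the patch):

    var table = {
        charset: 'utf-8',
        // headers are produced by sharedFuncs.parseHeader from the msgid "" entry
        headers: { /* e.g. a content-type header */ },
        translations: {
            '': {                       // default (empty) msgctxt
                'Hello': {
                    msgid: 'Hello',
                    msgstr: ['Hallo'],
                    comments: { reference: 'src/app.js:1' }
                }
            }
        }
    };

Entries are keyed first by msgctxt, then by msgid, so a lookup is table.translations[ctx || ''][id].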
+
+/**
+ * Creates a transform stream for parsing PO input
+ *
+ * @constructor
+ * @param {String} [defaultCharset] Default charset to use
+ * @param {Object} [options] Stream options
+ */
+function PoParserTransform(defaultCharset, options) {
+    if (!options && defaultCharset && typeof defaultCharset === 'object') {
+        options = defaultCharset;
+        defaultCharset = undefined;
+    }
+    // allow calling without any options
+    options = options || {};
+
+    this.defaultCharset = defaultCharset;
+    this._parser = false;
+    this._tokens = {};
+
+    this._cache = [];
+    this._cacheSize = 0;
+
+    this.initialTreshold = options.initialTreshold || 2 * 1024;
+
+    Transform.call(this, options);
+    this._writableState.objectMode = false;
+    this._readableState.objectMode = true;
+}
+util.inherits(PoParserTransform, Transform);
+
+/**
+ * Processes a chunk of the input stream
+ */
+PoParserTransform.prototype._transform = function(chunk, encoding, done) {
+    var i, len = 0;
+
+    if (!chunk || !chunk.length) {
+        return done();
+    }
+
+    if (!this._parser) {
+        this._cache.push(chunk);
+        this._cacheSize += chunk.length;
+
+        // wait for the first initialTreshold bytes (2 KiB by default)
+        // before parsing the headers for the charset declaration
+        if (this._cacheSize < this.initialTreshold) {
+            return setImmediate(done);
+        } else if (this._cacheSize) {
+            chunk = Buffer.concat(this._cache, this._cacheSize);
+            this._cacheSize = 0;
+            this._cache = [];
+        }
+
+        this._parser = new Parser(chunk, this.defaultCharset);
+    } else if (this._cacheSize) {
+        // this only happens if we had an incomplete 8bit sequence from the last iteration
+        this._cache.push(chunk);
+        this._cacheSize += chunk.length;
+        chunk = Buffer.concat(this._cache, this._cacheSize);
+        this._cacheSize = 0;
+        this._cache = [];
+    }
+
+    // cache 8bit bytes from the end of the chunk;
+    // this helps if the chunk ends in the middle of a UTF-8 sequence
+    for (i = chunk.length - 1; i >= 0; i--) {
+        if (chunk[i] >= 0x80) {
+            len++;
+            continue;
+        }
+        break;
+    }
+    // we found some 8bit bytes at the end of the chunk, so cache them
+    if (len) {
+        this._cache = [chunk.slice(chunk.length - len)];
+        this._cacheSize = this._cache[0].length;
+        chunk = chunk.slice(0, chunk.length - len);
+    }
+
+    // the chunk might be empty if it consisted only of 8bit bytes that were all cached
+    if (chunk.length) {
+        this._parser._lexer(this._parser._toString(chunk));
+    }
+
+    setImmediate(done);
+};
+
+/**
+ * Once all input has been processed, emits the parsed translation table as an object
+ */
+PoParserTransform.prototype._flush = function(done) {
+    var chunk;
+
+    if (this._cacheSize) {
+        chunk = Buffer.concat(this._cache, this._cacheSize);
+    }
+
+    if (!this._parser && chunk) {
+        this._parser = new Parser(chunk, this.defaultCharset);
+    }
+
+    if (chunk) {
+        this._parser._lexer(this._parser._toString(chunk));
+    }
+
+    if (this._parser) {
+        this.push(this._parser._finalize(this._parser._lex));
+    }
+
+    setImmediate(done);
+};
\ No newline at end of file
-- 
cgit v1.2.3
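Finally, a usage sketch for the streaming interface (illustrative only; the require path assumes the vendored location this commit adds, and messages.po is a placeholder filename):

    var fs = require('fs');
    var poParser = require('./node_modules/gettext-parser/lib/poparser');

    fs.createReadStream('messages.po')
        .pipe(poParser.stream())
        .on('data', function(table) {
            // the readable side is in object mode: the complete translation
            // table is emitted once, after _flush has run
            console.log(table.charset, Object.keys(table.translations['']));
        });

The writable side stays in byte mode, so any Buffer stream can be piped in; the charset is sniffed from the PO header once initialTreshold bytes have arrived, or at the end of input for shorter files.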