525 lines
15 KiB
JavaScript
525 lines
15 KiB
JavaScript
|
'use strict';
|
||
|
|
||
|
var encoding = require('encoding');
|
||
|
var sharedFuncs = require('./shared');
|
||
|
var Transform = require('stream').Transform;
|
||
|
var util = require('util');
|
||
|
|
||
|
/**
|
||
|
* Parses a PO object into translation table
|
||
|
*
|
||
|
* @param {Buffer|String} buffer PO object
|
||
|
* @param {String} [defaultCharset] Default charset to use
|
||
|
* @return {Object} Translation object
|
||
|
*/
|
||
|
module.exports.parse = function(buffer, defaultCharset) {
|
||
|
var parser = new Parser(buffer, defaultCharset);
|
||
|
return parser.parse();
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Parses a PO stream, emits translation table in object mode
|
||
|
*
|
||
|
* @param {String} [defaultCharset] Default charset to use
|
||
|
* @param {String} [options] Stream options
|
||
|
* @return {Stream} Transform stream
|
||
|
*/
|
||
|
module.exports.stream = function(defaultCharset, options) {
|
||
|
return new PoParserTransform(defaultCharset, options);
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Creates a PO parser object. If PO object is a string,
|
||
|
* UTF-8 will be used as the charset
|
||
|
*
|
||
|
* @constructor
|
||
|
* @param {Buffer|String} fileContents PO object
|
||
|
* @param {String} [defaultCharset] Default charset to use
|
||
|
*/
|
||
|
function Parser(fileContents, defaultCharset) {
|
||
|
|
||
|
this._charset = defaultCharset || 'iso-8859-1';
|
||
|
|
||
|
this._lex = [];
|
||
|
this._escaped = false;
|
||
|
this._node;
|
||
|
this._state = this.states.none;
|
||
|
|
||
|
if (typeof fileContents === 'string') {
|
||
|
this._charset = 'utf-8';
|
||
|
this._fileContents = fileContents;
|
||
|
} else {
|
||
|
this._handleCharset(fileContents);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Parses the PO object and returns translation table
|
||
|
*
|
||
|
* @return {Object} Translation table
|
||
|
*/
|
||
|
Parser.prototype.parse = function() {
|
||
|
this._lexer(this._fileContents);
|
||
|
return this._finalize(this._lex);
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Detects charset for PO strings from the header
|
||
|
*
|
||
|
* @param {Buffer} headers Header value
|
||
|
*/
|
||
|
Parser.prototype._handleCharset = function(buf) {
|
||
|
var str = (buf || '').toString(),
|
||
|
pos, headers = '',
|
||
|
match;
|
||
|
|
||
|
if ((pos = str.search(/^\s*msgid/im)) >= 0) {
|
||
|
if ((pos = pos + str.substr(pos + 5).search(/^\s*(msgid|msgctxt)/im))) {
|
||
|
headers = str.substr(0, pos);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if ((match = headers.match(/[; ]charset\s*=\s*([\w\-]+)(?:[\s;]|\\n)*"\s*$/mi))) {
|
||
|
this._charset = sharedFuncs.formatCharset(match[1], this._charset);
|
||
|
}
|
||
|
|
||
|
if (this._charset === 'utf-8') {
|
||
|
this._fileContents = str;
|
||
|
} else {
|
||
|
this._fileContents = this._toString(buf);
|
||
|
}
|
||
|
};
|
||
|
|
||
|
Parser.prototype._toString = function(buf) {
|
||
|
return encoding.convert(buf, 'utf-8', this._charset).toString('utf-8');
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* State constants for parsing FSM
|
||
|
*/
|
||
|
Parser.prototype.states = {
|
||
|
none: 0x01,
|
||
|
comments: 0x02,
|
||
|
key: 0x03,
|
||
|
string: 0x04
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Value types for lexer
|
||
|
*/
|
||
|
Parser.prototype.types = {
|
||
|
comments: 0x01,
|
||
|
key: 0x02,
|
||
|
string: 0x03
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* String matches for lexer
|
||
|
*/
|
||
|
Parser.prototype.symbols = {
|
||
|
quotes: /["']/,
|
||
|
comments: /\#/,
|
||
|
whitespace: /\s/,
|
||
|
key: /[\w\-\[\]]/
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Token parser. Parsed state can be found from this._lex
|
||
|
*
|
||
|
* @param {String} chunk String
|
||
|
*/
|
||
|
Parser.prototype._lexer = function(chunk) {
|
||
|
var chr;
|
||
|
|
||
|
for (var i = 0, len = chunk.length; i < len; i++) {
|
||
|
chr = chunk.charAt(i);
|
||
|
switch (this._state) {
|
||
|
case this.states.none:
|
||
|
if (chr.match(this.symbols.quotes)) {
|
||
|
this._node = {
|
||
|
type: this.types.string,
|
||
|
value: '',
|
||
|
quote: chr
|
||
|
};
|
||
|
this._lex.push(this._node);
|
||
|
this._state = this.states.string;
|
||
|
} else if (chr.match(this.symbols.comments)) {
|
||
|
this._node = {
|
||
|
type: this.types.comments,
|
||
|
value: ''
|
||
|
};
|
||
|
this._lex.push(this._node);
|
||
|
this._state = this.states.comments;
|
||
|
} else if (!chr.match(this.symbols.whitespace)) {
|
||
|
this._node = {
|
||
|
type: this.types.key,
|
||
|
value: chr
|
||
|
};
|
||
|
this._lex.push(this._node);
|
||
|
this._state = this.states.key;
|
||
|
}
|
||
|
break;
|
||
|
case this.states.comments:
|
||
|
if (chr === '\n') {
|
||
|
this._state = this.states.none;
|
||
|
} else if (chr !== '\r') {
|
||
|
this._node.value += chr;
|
||
|
}
|
||
|
break;
|
||
|
case this.states.string:
|
||
|
if (this._escaped) {
|
||
|
switch (chr) {
|
||
|
case 't':
|
||
|
this._node.value += '\t';
|
||
|
break;
|
||
|
case 'n':
|
||
|
this._node.value += '\n';
|
||
|
break;
|
||
|
case 'r':
|
||
|
this._node.value += '\r';
|
||
|
break;
|
||
|
default:
|
||
|
this._node.value += chr;
|
||
|
}
|
||
|
this._escaped = false;
|
||
|
} else {
|
||
|
if (chr === this._node.quote) {
|
||
|
this._state = this.states.none;
|
||
|
} else if (chr === '\\') {
|
||
|
this._escaped = true;
|
||
|
break;
|
||
|
} else {
|
||
|
this._node.value += chr;
|
||
|
}
|
||
|
this._escaped = false;
|
||
|
}
|
||
|
break;
|
||
|
case this.states.key:
|
||
|
if (!chr.match(this.symbols.key)) {
|
||
|
this._state = this.states.none;
|
||
|
i--;
|
||
|
} else {
|
||
|
this._node.value += chr;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Join multi line strings
|
||
|
*
|
||
|
* @param {Object} tokens Parsed tokens
|
||
|
* @return {Object} Parsed tokens, with multi line strings joined into one
|
||
|
*/
|
||
|
Parser.prototype._joinStringValues = function(tokens) {
|
||
|
var lastNode, response = [];
|
||
|
|
||
|
for (var i = 0, len = tokens.length; i < len; i++) {
|
||
|
if (lastNode && tokens[i].type === this.types.string && lastNode.type === this.types.string) {
|
||
|
lastNode.value += tokens[i].value;
|
||
|
} else if (lastNode && tokens[i].type === this.types.comments && lastNode.type === this.types.comments) {
|
||
|
lastNode.value += '\n' + tokens[i].value;
|
||
|
} else {
|
||
|
response.push(tokens[i]);
|
||
|
lastNode = tokens[i];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return response;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Parse comments into separate comment blocks
|
||
|
*
|
||
|
* @param {Object} tokens Parsed tokens
|
||
|
*/
|
||
|
Parser.prototype._parseComments = function(tokens) {
|
||
|
// parse comments
|
||
|
tokens.forEach((function(node) {
|
||
|
var comment, lines;
|
||
|
|
||
|
if (node && node.type === this.types.comments) {
|
||
|
comment = {
|
||
|
translator: [],
|
||
|
extracted: [],
|
||
|
reference: [],
|
||
|
flag: [],
|
||
|
previous: []
|
||
|
};
|
||
|
lines = (node.value || '').split(/\n/);
|
||
|
lines.forEach(function(line) {
|
||
|
switch (line.charAt(0) || '') {
|
||
|
case ':':
|
||
|
comment.reference.push(line.substr(1).trim());
|
||
|
break;
|
||
|
case '.':
|
||
|
comment.extracted.push(line.substr(1).replace(/^\s+/, ''));
|
||
|
break;
|
||
|
case ',':
|
||
|
comment.flag.push(line.substr(1).replace(/^\s+/, ''));
|
||
|
break;
|
||
|
case '|':
|
||
|
comment.previous.push(line.substr(1).replace(/^\s+/, ''));
|
||
|
break;
|
||
|
default:
|
||
|
comment.translator.push(line.replace(/^\s+/, ''));
|
||
|
}
|
||
|
});
|
||
|
|
||
|
node.value = {};
|
||
|
|
||
|
Object.keys(comment).forEach(function(key) {
|
||
|
if (comment[key] && comment[key].length) {
|
||
|
node.value[key] = comment[key].join('\n');
|
||
|
}
|
||
|
});
|
||
|
}
|
||
|
}).bind(this));
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Join gettext keys with values
|
||
|
*
|
||
|
* @param {Object} tokens Parsed tokens
|
||
|
* @return {Object} Tokens
|
||
|
*/
|
||
|
Parser.prototype._handleKeys = function(tokens) {
|
||
|
var response = [],
|
||
|
lastNode;
|
||
|
|
||
|
for (var i = 0, len = tokens.length; i < len; i++) {
|
||
|
if (tokens[i].type === this.types.key) {
|
||
|
lastNode = {
|
||
|
key: tokens[i].value
|
||
|
};
|
||
|
if (i && tokens[i - 1].type === this.types.comments) {
|
||
|
lastNode.comments = tokens[i - 1].value;
|
||
|
}
|
||
|
lastNode.value = '';
|
||
|
response.push(lastNode);
|
||
|
} else if (tokens[i].type === this.types.string && lastNode) {
|
||
|
lastNode.value += tokens[i].value;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return response;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Separate different values into individual translation objects
|
||
|
*
|
||
|
* @param {Object} tokens Parsed tokens
|
||
|
* @return {Object} Tokens
|
||
|
*/
|
||
|
Parser.prototype._handleValues = function(tokens) {
|
||
|
var response = [],
|
||
|
lastNode, curContext, curComments;
|
||
|
|
||
|
for (var i = 0, len = tokens.length; i < len; i++) {
|
||
|
if (tokens[i].key.toLowerCase() === 'msgctxt') {
|
||
|
curContext = tokens[i].value;
|
||
|
curComments = tokens[i].comments;
|
||
|
} else if (tokens[i].key.toLowerCase() === 'msgid') {
|
||
|
lastNode = {
|
||
|
msgid: tokens[i].value
|
||
|
};
|
||
|
|
||
|
if (curContext) {
|
||
|
lastNode.msgctxt = curContext;
|
||
|
}
|
||
|
|
||
|
if (curComments) {
|
||
|
lastNode.comments = curComments;
|
||
|
}
|
||
|
|
||
|
if (tokens[i].comments && !lastNode.comments) {
|
||
|
lastNode.comments = tokens[i].comments;
|
||
|
}
|
||
|
|
||
|
curContext = false;
|
||
|
curComments = false;
|
||
|
response.push(lastNode);
|
||
|
} else if (tokens[i].key.toLowerCase() === 'msgid_plural') {
|
||
|
if (lastNode) {
|
||
|
lastNode.msgid_plural = tokens[i].value;
|
||
|
}
|
||
|
|
||
|
if (tokens[i].comments && !lastNode.comments) {
|
||
|
lastNode.comments = tokens[i].comments;
|
||
|
}
|
||
|
|
||
|
curContext = false;
|
||
|
curComments = false;
|
||
|
} else if (tokens[i].key.substr(0, 6).toLowerCase() === 'msgstr') {
|
||
|
if (lastNode) {
|
||
|
lastNode.msgstr = (lastNode.msgstr || []).concat(tokens[i].value);
|
||
|
}
|
||
|
|
||
|
if (tokens[i].comments && !lastNode.comments) {
|
||
|
lastNode.comments = tokens[i].comments;
|
||
|
}
|
||
|
|
||
|
curContext = false;
|
||
|
curComments = false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return response;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Compose a translation table from tokens object
|
||
|
*
|
||
|
* @param {Object} tokens Parsed tokens
|
||
|
* @return {Object} Translation table
|
||
|
*/
|
||
|
Parser.prototype._normalize = function(tokens) {
|
||
|
var msgctxt,
|
||
|
table = {
|
||
|
charset: this._charset,
|
||
|
headers: undefined,
|
||
|
translations: {}
|
||
|
};
|
||
|
|
||
|
for (var i = 0, len = tokens.length; i < len; i++) {
|
||
|
msgctxt = tokens[i].msgctxt || '';
|
||
|
|
||
|
if (!table.translations[msgctxt]) {
|
||
|
table.translations[msgctxt] = {};
|
||
|
}
|
||
|
|
||
|
if (!table.headers && !msgctxt && !tokens[i].msgid) {
|
||
|
table.headers = sharedFuncs.parseHeader(tokens[i].msgstr[0]);
|
||
|
}
|
||
|
|
||
|
table.translations[msgctxt][tokens[i].msgid] = tokens[i];
|
||
|
}
|
||
|
|
||
|
return table;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Converts parsed tokens to a translation table
|
||
|
*
|
||
|
* @param {Object} tokens Parsed tokens
|
||
|
* @returns {Object} Translation table
|
||
|
*/
|
||
|
Parser.prototype._finalize = function(tokens) {
|
||
|
var data = this._joinStringValues(tokens);
|
||
|
this._parseComments(data);
|
||
|
data = this._handleKeys(data);
|
||
|
data = this._handleValues(data);
|
||
|
|
||
|
return this._normalize(data);
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Creates a transform stream for parsing PO input
|
||
|
*
|
||
|
* @constructor
|
||
|
* @param {String} [defaultCharset] Default charset to use
|
||
|
* @param {String} [options] Stream options
|
||
|
*/
|
||
|
function PoParserTransform(defaultCharset, options) {
|
||
|
if (!options && defaultCharset && typeof defaultCharset === 'object') {
|
||
|
options = defaultCharset;
|
||
|
defaultCharset = undefined;
|
||
|
}
|
||
|
|
||
|
this.defaultCharset = defaultCharset;
|
||
|
this._parser = false;
|
||
|
this._tokens = {};
|
||
|
|
||
|
this._cache = [];
|
||
|
this._cacheSize = 0;
|
||
|
|
||
|
this.initialTreshold = options.initialTreshold || 2 * 1024;
|
||
|
|
||
|
Transform.call(this, options);
|
||
|
this._writableState.objectMode = false;
|
||
|
this._readableState.objectMode = true;
|
||
|
}
|
||
|
util.inherits(PoParserTransform, Transform);
|
||
|
|
||
|
/**
|
||
|
* Processes a chunk of the input stream
|
||
|
*/
|
||
|
PoParserTransform.prototype._transform = function(chunk, encoding, done) {
|
||
|
var i, len = 0;
|
||
|
|
||
|
if (!chunk || !chunk.length) {
|
||
|
return done();
|
||
|
}
|
||
|
|
||
|
if (!this._parser) {
|
||
|
this._cache.push(chunk);
|
||
|
this._cacheSize += chunk.length;
|
||
|
|
||
|
// wait until the first 1kb before parsing headers for charset
|
||
|
if (this._cacheSize < this.initialTreshold) {
|
||
|
return setImmediate(done);
|
||
|
} else if (this._cacheSize) {
|
||
|
chunk = Buffer.concat(this._cache, this._cacheSize);
|
||
|
this._cacheSize = 0;
|
||
|
this._cache = [];
|
||
|
}
|
||
|
|
||
|
this._parser = new Parser(chunk, this.defaultCharset);
|
||
|
} else if (this._cacheSize) {
|
||
|
// this only happens if we had an uncompleted 8bit sequence from the last iteration
|
||
|
this._cache.push(chunk);
|
||
|
this._cacheSize += chunk.length;
|
||
|
chunk = Buffer.concat(this._cache, this._cacheSize);
|
||
|
this._cacheSize = 0;
|
||
|
this._cache = [];
|
||
|
}
|
||
|
|
||
|
// cache 8bit bytes from the end of the chunk
|
||
|
// helps if the chunk ends in the middle of an utf-8 sequence
|
||
|
for (i = chunk.length - 1; i >= 0; i--) {
|
||
|
if (chunk[i] >= 0x80) {
|
||
|
len++;
|
||
|
continue;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
// it seems we found some 8bit bytes from the end of the string, so let's cache these
|
||
|
if (len) {
|
||
|
this._cache = [chunk.slice(chunk.length - len)];
|
||
|
this._cacheSize = this._cache[0].length;
|
||
|
chunk = chunk.slice(0, chunk.length - len);
|
||
|
}
|
||
|
|
||
|
// chunk might be empty if it only contined of 8bit bytes and these were all cached
|
||
|
if (chunk.length) {
|
||
|
this._parser._lexer(this._parser._toString(chunk));
|
||
|
}
|
||
|
|
||
|
setImmediate(done);
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Once all input has been processed emit the parsed translation table as an object
|
||
|
*/
|
||
|
PoParserTransform.prototype._flush = function(done) {
|
||
|
var chunk;
|
||
|
|
||
|
if (this._cacheSize) {
|
||
|
chunk = Buffer.concat(this._cache, this._cacheSize);
|
||
|
}
|
||
|
|
||
|
if (!this._parser && chunk) {
|
||
|
this._parser = new Parser(chunk, this.defaultCharset);
|
||
|
}
|
||
|
|
||
|
if (chunk) {
|
||
|
this._parser._lexer(this._parser._toString(chunk));
|
||
|
}
|
||
|
|
||
|
if (this._parser) {
|
||
|
this.push(this._parser._finalize(this._parser._lex));
|
||
|
}
|
||
|
|
||
|
setImmediate(done);
|
||
|
};
|