wallet-core/node_modules/gettext-parser/lib/poparser.js

'use strict';

var encoding = require('encoding');
var sharedFuncs = require('./shared');
var Transform = require('stream').Transform;
var util = require('util');

/**
 * Parses a complete PO document into a translation table.
 *
 * A fresh Parser is created for every call, so no state is shared
 * between invocations.
 *
 * @param {Buffer|String} buffer PO file contents
 * @param {String} [defaultCharset] Charset to fall back to when parsing a Buffer
 * @return {Object} Translation table
 */
module.exports.parse = function(buffer, defaultCharset) {
    return new Parser(buffer, defaultCharset).parse();
};

/**
 * Creates a transform stream that consumes PO input and emits the parsed
 * translation table in object mode.
 *
 * @param {String} [defaultCharset] Default charset to use
 * @param {Object} [options] Stream options, handed to the PoParserTransform constructor
 * @return {Stream} Transform stream
 */
module.exports.stream = function(defaultCharset, options) {
    var transform = new PoParserTransform(defaultCharset, options);

    return transform;
};

/**
 * Creates a PO parser object and resets the lexer state.
 *
 * String input is taken as-is and its charset is recorded as UTF-8.
 * Any other input is handed to `_handleCharset`, which is responsible for
 * filling `this._fileContents`. `this._node` stays unset until the lexer
 * creates its first token.
 *
 * @constructor
 * @param {Buffer|String} fileContents PO object
 * @param {String} [defaultCharset] Default charset to use ('iso-8859-1' when omitted)
 */
function Parser(fileContents, defaultCharset) {
    var isText = typeof fileContents === 'string';

    this._charset = defaultCharset || 'iso-8859-1';

    // lexer state
    this._lex = [];
    this._escaped = false;
    this._state = this.states.none;

    if (!isText) {
        this._handleCharset(fileContents);
        return;
    }

    this._charset = 'utf-8';
    this._fileContents = fileContents;
}

/**
 * Runs the lexer over the stored file contents and converts the
 * resulting tokens into a translation table.
 *
 * @return {Object} Translation table
 */
Parser.prototype.parse = function() {
    var source = this._fileContents;

    this._lexer(source);

    // read the token list only after the lexer has run
    var tokens = this._lex;
    return this._finalize(tokens);
};

/**
 * Detects the charset of a PO buffer from its header entry and stores the
 * decoded file contents in `this._fileContents`.
 *
 * The header is taken to be everything up to the second `msgid`/`msgctxt`
 * keyword. If the file holds only a single entry, the whole file is
 * searched for the charset declaration. When no declaration is found,
 * `this._charset` keeps the value it had before the call.
 *
 * @param {Buffer} buf PO file contents
 */
Parser.prototype._handleCharset = function(buf) {
    var str = (buf || '').toString(),
        headers = '',
        firstEntry = /^\s*msgid/im.exec(str),
        searchFrom, nextEntry, match;

    if (firstEntry) {
        // Continue right after the matched keyword. The match may start with
        // leading whitespace, so a fixed offset from its index is not enough.
        searchFrom = firstEntry.index + firstEntry[0].length;
        nextEntry = str.slice(searchFrom).search(/^\s*(msgid|msgctxt)/im);

        // search() is relative to the sliced string; -1 means the header
        // entry is the only entry in the file
        headers = nextEntry >= 0 ? str.slice(0, searchFrom + nextEntry) : str;
    }

    if ((match = headers.match(/[; ]charset\s*=\s*([\w\-]+)(?:[\s;]|\\n)*"\s*$/mi))) {
        this._charset = sharedFuncs.formatCharset(match[1], this._charset);
    }

    if (this._charset === 'utf-8') {
        // toString() above already decoded the buffer as UTF-8
        this._fileContents = str;
    } else {
        this._fileContents = this._toString(buf);
    }
};

/**
 * Decodes a buffer from the parser's current charset into a string.
 *
 * @param {Buffer} buf Bytes encoded in `this._charset`
 * @return {String} Decoded string
 */
Parser.prototype._toString = function(buf) {
    var utf8Bytes = encoding.convert(buf, 'utf-8', this._charset);

    return utf8Bytes.toString('utf-8');
};

/**
 * States of the lexer's finite state machine (see `_lexer`)
 */
Parser.prototype.states = {
    // between tokens
    none: 1,
    // inside a "#" comment, up to the end of the line
    comments: 2,
    // inside a keyword
    key: 3,
    // inside a quoted string
    string: 4
};

/**
 * Token types produced by the lexer
 */
Parser.prototype.types = {
    // a "#" comment line
    comments: 1,
    // a keyword token
    key: 2,
    // a quoted string
    string: 3
};

/**
 * Single-character patterns the lexer uses to classify input
 */
Parser.prototype.symbols = {
    // opens a string token; the same character closes it again
    quotes: /["']/,
    // opens a comment token
    comments: /\#/,
    // skipped between tokens
    whitespace: /\s/,
    // characters that continue a key token once it has been started
    key: /[\w\-\[\]]/
};

/**
 * Token parser. Appends the tokens found in `chunk` to `this._lex`.
 *
 * All lexer state (`_state`, `_node`, `_escaped`) lives on the parser
 * instance, so input may be fed in several consecutive chunks.
 *
 * @param {String} chunk String
 */
Parser.prototype._lexer = function(chunk) {
    var self = this,
        states = this.states,
        types = this.types,
        symbols = this.symbols,
        // escape sequences with a special meaning; any other escaped
        // character is copied through literally
        escapes = {
            t: '\t',
            n: '\n',
            r: '\r'
        },
        total = chunk.length,
        pos = 0,
        chr;

    // registers a new token and switches the state machine
    function begin(node, nextState) {
        self._node = node;
        self._lex.push(node);
        self._state = nextState;
    }

    while (pos < total) {
        chr = chunk.charAt(pos);
        pos++;

        if (this._state === states.none) {
            if (symbols.quotes.test(chr)) {
                begin({
                    type: types.string,
                    value: '',
                    quote: chr
                }, states.string);
            } else if (symbols.comments.test(chr)) {
                begin({
                    type: types.comments,
                    value: ''
                }, states.comments);
            } else if (!symbols.whitespace.test(chr)) {
                begin({
                    type: types.key,
                    value: chr
                }, states.key);
            }
        } else if (this._state === states.comments) {
            // a comment runs to the end of the line; carriage returns are dropped
            if (chr === '\n') {
                this._state = states.none;
            } else if (chr !== '\r') {
                this._node.value += chr;
            }
        } else if (this._state === states.string) {
            if (this._escaped) {
                this._node.value += escapes.hasOwnProperty(chr) ? escapes[chr] : chr;
                this._escaped = false;
            } else if (chr === this._node.quote) {
                // only the quote character that opened the string closes it
                this._state = states.none;
            } else if (chr === '\\') {
                this._escaped = true;
            } else {
                this._node.value += chr;
            }
        } else if (this._state === states.key) {
            if (symbols.key.test(chr)) {
                this._node.value += chr;
            } else {
                // the key has ended; look at this character again in the `none` state
                this._state = states.none;
                pos--;
            }
        }
    }
};

/**
 * Merges runs of adjacent tokens of the same kind: consecutive strings
 * are concatenated and consecutive comments are joined with a newline.
 *
 * The first token of each run is modified in place and reused in the result.
 *
 * @param {Object} tokens Parsed tokens
 * @return {Object} Parsed tokens, with multi line strings joined into one
 */
Parser.prototype._joinStringValues = function(tokens) {
    var types = this.types,
        merged = [];

    tokens.forEach(function(token) {
        var previous = merged[merged.length - 1];

        if (previous && token.type === types.string && previous.type === types.string) {
            previous.value += token.value;
            return;
        }

        if (previous && token.type === types.comments && previous.type === types.comments) {
            previous.value += '\n' + token.value;
            return;
        }

        merged.push(token);
    });

    return merged;
};

/**
 * Splits the raw text of every comment token into separate comment blocks.
 *
 * The character following "#" selects the block: ":" reference,
 * "." extracted, "," flag, "|" previous; everything else is a translator
 * comment. The token's `value` is replaced in place by an object that
 * holds only the non-empty blocks, each joined with newlines.
 *
 * @param {Object} tokens Parsed tokens
 */
Parser.prototype._parseComments = function(tokens) {
    var commentType = this.types.comments,
        // also the key order of the resulting value object
        sections = ['translator', 'extracted', 'reference', 'flag', 'previous'],
        markers = {
            ':': 'reference',
            '.': 'extracted',
            ',': 'flag',
            '|': 'previous'
        };

    function stripLeading(text) {
        return text.replace(/^\s+/, '');
    }

    tokens.forEach(function(node) {
        var buckets, parsed;

        if (!node || node.type !== commentType) {
            return;
        }

        buckets = {};
        sections.forEach(function(name) {
            buckets[name] = [];
        });

        (node.value || '').split(/\n/).forEach(function(line) {
            var marker = line.charAt(0),
                section = markers.hasOwnProperty(marker) ? markers[marker] : 'translator';

            if (section === 'translator') {
                // no marker character to cut off
                buckets.translator.push(stripLeading(line));
            } else if (section === 'reference') {
                // references are trimmed on both sides
                buckets.reference.push(line.substr(1).trim());
            } else {
                buckets[section].push(stripLeading(line.substr(1)));
            }
        });

        parsed = {};
        sections.forEach(function(name) {
            if (buckets[name].length) {
                parsed[name] = buckets[name].join('\n');
            }
        });

        node.value = parsed;
    });
};

/**
 * Pairs every key token with the string tokens that follow it.
 *
 * A comment token directly preceding a key is attached to that key's
 * entry. Strings that appear before the first key are dropped.
 *
 * @param {Object} tokens Parsed tokens
 * @return {Object} Tokens
 */
Parser.prototype._handleKeys = function(tokens) {
    var types = this.types,
        entries = [],
        current;

    tokens.forEach(function(token, index) {
        var preceding;

        if (token.type === types.key) {
            current = {
                key: token.value
            };

            preceding = index ? tokens[index - 1] : undefined;
            if (preceding && preceding.type === types.comments) {
                current.comments = preceding.value;
            }

            current.value = '';
            entries.push(current);
        } else if (token.type === types.string && current) {
            current.value += token.value;
        }
    });

    return entries;
};

/**
 * Groups key/value pairs into individual translation objects.
 *
 * A `msgid` key opens a new translation; `msgid_plural` and `msgstr*` keys
 * are added to the translation opened last. A `msgctxt` key (and its
 * comments) is remembered and applied to the next `msgid`. Keys other than
 * these are ignored.
 *
 * `msgid_plural` and `msgstr*` entries that appear before any `msgid` have
 * no translation to belong to and are skipped, together with their comments.
 *
 * @param {Object} tokens Parsed tokens
 * @return {Object} Tokens
 */
Parser.prototype._handleValues = function(tokens) {
    var response = [],
        lastNode, curContext, curComments, token, key;

    for (var i = 0, len = tokens.length; i < len; i++) {
        token = tokens[i];
        key = token.key.toLowerCase();

        if (key === 'msgctxt') {
            // held back until the msgid it belongs to shows up
            curContext = token.value;
            curComments = token.comments;
            continue;
        }

        if (key === 'msgid') {
            lastNode = {
                msgid: token.value
            };

            if (curContext) {
                lastNode.msgctxt = curContext;
            }

            if (curComments) {
                lastNode.comments = curComments;
            }

            response.push(lastNode);
        } else if (key === 'msgid_plural') {
            if (lastNode) {
                lastNode.msgid_plural = token.value;
            }
        } else if (token.key.substr(0, 6).toLowerCase() === 'msgstr') {
            // covers "msgstr" as well as the indexed "msgstr[N]" forms
            if (lastNode) {
                lastNode.msgstr = (lastNode.msgstr || []).concat(token.value);
            }
        } else {
            // unknown key: leave a pending context untouched
            continue;
        }

        // The first comment block seen for a translation wins. lastNode is
        // undefined while no msgid has been seen yet.
        if (lastNode && token.comments && !lastNode.comments) {
            lastNode.comments = token.comments;
        }

        curContext = false;
        curComments = false;
    }

    return response;
};

/**
 * Compose a translation table from tokens object.
 *
 * Translations are grouped by context (`msgctxt`, '' when absent) and keyed
 * by `msgid`. The first entry with neither a context nor a msgid is parsed
 * as the headers block.
 *
 * Contexts and msgids come straight from the PO file, so both are stored as
 * own properties. Names such as "constructor", "toString" or "__proto__"
 * therefore neither collide with members inherited from Object.prototype
 * nor modify it.
 *
 * @param {Object} tokens Parsed tokens
 * @return {Object} Translation table
 */
Parser.prototype._normalize = function(tokens) {
    var hasOwn = Object.prototype.hasOwnProperty,
        table = {
            charset: this._charset,
            headers: undefined,
            translations: {}
        },
        token, msgctxt;

    // Plain assignment to "__proto__" would swap the prototype instead of
    // creating a property, hence defineProperty.
    function setOwn(target, key, value) {
        Object.defineProperty(target, key, {
            value: value,
            writable: true,
            enumerable: true,
            configurable: true
        });
    }

    for (var i = 0, len = tokens.length; i < len; i++) {
        token = tokens[i];
        msgctxt = token.msgctxt || '';

        if (!hasOwn.call(table.translations, msgctxt)) {
            setOwn(table.translations, msgctxt, {});
        }

        if (!table.headers && !msgctxt && !token.msgid) {
            // a header entry without msgstr is treated as an empty header block
            table.headers = sharedFuncs.parseHeader(token.msgstr ? token.msgstr[0] : '');
        }

        setOwn(table.translations[msgctxt], token.msgid, token);
    }

    return table;
};

/**
 * Runs the post-lexing pipeline that turns raw tokens into a translation
 * table: join adjacent tokens, parse comments, pair keys with values,
 * group the values into translations and normalize the result.
 *
 * @param {Object} tokens Parsed tokens
 * @returns {Object} Translation table
 */
Parser.prototype._finalize = function(tokens) {
    var joined = this._joinStringValues(tokens);

    // works in place on the joined tokens, its return value is not used
    this._parseComments(joined);

    var entries = this._handleValues(this._handleKeys(joined));

    return this._normalize(entries);
};

/**
 * Creates a transform stream for parsing PO input.
 *
 * The writable side accepts raw bytes, the readable side runs in object
 * mode. Both arguments are optional; an options object may also be passed
 * as the only argument.
 *
 * @constructor
 * @param {String} [defaultCharset] Default charset to use
 * @param {Object} [options] Stream options, also passed on to Transform
 * @param {Number} [options.initialTreshold] Number of bytes to buffer before
 *        the charset is detected and parsing starts (2048 when omitted)
 */
function PoParserTransform(defaultCharset, options) {
    if (!options && defaultCharset && typeof defaultCharset === 'object') {
        options = defaultCharset;
        defaultCharset = undefined;
    }

    // options may be omitted altogether
    options = options || {};

    this.defaultCharset = defaultCharset;
    this._parser = false;
    this._tokens = {};

    this._cache = [];
    this._cacheSize = 0;

    this.initialTreshold = options.initialTreshold || 2 * 1024;

    Transform.call(this, options);

    // bytes in, a single parsed object out - regardless of the options given
    this._writableState.objectMode = false;
    this._readableState.objectMode = true;
}
util.inherits(PoParserTransform, Transform);

/**
 * Processes a chunk of the input stream.
 *
 * Input is buffered until `this.initialTreshold` bytes have arrived; the
 * buffered bytes are then used to create the parser. From then on every
 * chunk is decoded and fed to the lexer, except for a trailing run of
 * bytes >= 0x80, which is held back and prepended to the next chunk.
 *
 * @param {Buffer} chunk Input bytes
 * @param {String} encoding Not used
 * @param {Function} done Called once the chunk has been processed
 */
PoParserTransform.prototype._transform = function(chunk, encoding, done) {
    var hasParser, tail;

    if (!chunk || !chunk.length) {
        return done();
    }

    hasParser = Boolean(this._parser);

    // Bytes are pending either because the parser does not exist yet or
    // because the previous chunk ended with 8bit bytes.
    if (!hasParser || this._cacheSize) {
        this._cache.push(chunk);
        this._cacheSize += chunk.length;

        // wait for the initial threshold before parsing headers for charset
        if (!hasParser && this._cacheSize < this.initialTreshold) {
            return setImmediate(done);
        }

        chunk = Buffer.concat(this._cache, this._cacheSize);
        this._cacheSize = 0;
        this._cache = [];

        if (!hasParser) {
            this._parser = new Parser(chunk, this.defaultCharset);
        }
    }

    // find where the trailing run of 8bit bytes starts;
    // the chunk might end in the middle of a utf-8 sequence
    tail = chunk.length;
    while (tail > 0 && chunk[tail - 1] >= 0x80) {
        tail--;
    }

    // keep the trailing 8bit bytes for the next round
    if (tail < chunk.length) {
        this._cache = [chunk.slice(tail)];
        this._cacheSize = this._cache[0].length;
        chunk = chunk.slice(0, tail);
    }

    // chunk might be empty if it consisted only of 8bit bytes and these were all cached
    if (chunk.length) {
        this._parser._lexer(this._parser._toString(chunk));
    }

    setImmediate(done);
};

/**
 * Once all input has been processed emit the parsed translation table as an object.
 *
 * Bytes still held in the cache are lexed first; if the parser was never
 * created, it is created from these bytes. Nothing is emitted when there
 * was no input at all.
 *
 * @param {Function} done Called once flushing has finished
 */
PoParserTransform.prototype._flush = function(done) {
    var pending = this._cacheSize ? Buffer.concat(this._cache, this._cacheSize) : undefined;

    if (pending) {
        // the input never reached the initial threshold
        if (!this._parser) {
            this._parser = new Parser(pending, this.defaultCharset);
        }

        this._parser._lexer(this._parser._toString(pending));
    }

    if (this._parser) {
        this.push(this._parser._finalize(this._parser._lex));
    }

    setImmediate(done);
};