wallet-core/node_modules/fbjs/lib/UnicodeUtils.js
2017-10-14 18:40:54 +02:00

212 lines
6.0 KiB
JavaScript

/**
* Copyright (c) 2013-present, Facebook, Inc.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*
* @typechecks
*/
/**
* Unicode-enabled replacements for basic String functions.
*
* All the functions in this module assume that the input string is a valid
* UTF-16 encoding of a Unicode sequence. If it's not the case, the behavior
* will be undefined.
*
* WARNING: Since this module is typechecks-enforced, you may find new bugs
* when replacing normal String functions with ones provided here.
*/
'use strict';
var invariant = require('./invariant');
// Boundaries of the UTF-16 surrogate code-unit ranges.
// These two ranges are consecutive so anything in [HIGH_START, LOW_END] is a
// surrogate code unit.
var SURROGATE_HIGH_START = 0xD800; // first high (lead) surrogate
var SURROGATE_HIGH_END = 0xDBFF; // last high (lead) surrogate
var SURROGATE_LOW_START = 0xDC00; // first low (trail) surrogate
var SURROGATE_LOW_END = 0xDFFF; // last low (trail) surrogate
// Matches one code unit anywhere in the combined surrogate range. The regex
// has no `g` flag, so repeated `.test()` calls keep no `lastIndex` state.
var SURROGATE_UNITS_REGEX = /[\uD800-\uDFFF]/;
/**
 * Tells whether a UTF-16 code unit is a surrogate, high or low.
 *
 * @param {number} codeUnit A UTF-16 code unit, e.g. a String#charCodeAt() result
 * @return {boolean} True when the value lies in
 *                   [SURROGATE_HIGH_START, SURROGATE_LOW_END]
 */
function isCodeUnitInSurrogateRange(codeUnit) {
  // Phrased as a negated ">=" so that NaN (charCodeAt() past the end of a
  // string) is rejected here rather than slipping through.
  if (!(codeUnit >= SURROGATE_HIGH_START)) {
    return false;
  }
  return codeUnit <= SURROGATE_LOW_END;
}
/**
 * Checks whether the code units at `index` and `index + 1` are a high
 * surrogate followed by a low surrogate.
 * For example, given the string s = "\uD83D\uDE0A", (s, 0) returns true and
 * (s, 1) returns false.
 *
 * An `index` outside [0, str.length) is reported through `invariant`.
 *
 * @param {string} str
 * @param {number} index
 * @return {boolean}
 */
function isSurrogatePair(str, index) {
  // Kept as a conjunction that is then negated: a NaN index fails both
  // comparisons and therefore counts as out of bounds.
  var indexInBounds = 0 <= index && index < str.length;
  if (!indexInBounds) {
    if (process.env.NODE_ENV !== 'production') {
      invariant(false, 'isSurrogatePair: Invalid index %s for string length %s.', index, str.length);
    } else {
      invariant(false);
    }
  }

  // The final code unit has no successor to pair with.
  if (index + 1 === str.length) {
    return false;
  }

  var lead = str.charCodeAt(index);
  if (!(SURROGATE_HIGH_START <= lead && lead <= SURROGATE_HIGH_END)) {
    return false;
  }

  var trail = str.charCodeAt(index + 1);
  return SURROGATE_LOW_START <= trail && trail <= SURROGATE_LOW_END;
}
/**
 * Scans a string for surrogate code units.
 *
 * @param {string} str Non-empty string
 * @return {boolean} True if at least one code unit of `str` is a high or low
 *                   surrogate
 */
function hasSurrogateUnit(str) {
  var containsSurrogate = SURROGATE_UNITS_REGEX.test(str);
  return containsSurrogate;
}
/**
 * Reports how many UTF-16 code units the Unicode character touching `pos`
 * occupies, judged only from the single code unit found at `pos`:
 * 1 for a non-surrogate BMP code unit ([U+0000..U+D7FF] and [U+E000..U+FFFF]),
 * 2 for any surrogate code unit ([U+D800..U+DFFF]), high or low, because
 * surrogates stand for the non-BMP characters ([U+10000..U+10FFFF]).
 *
 * Examples:
 * - '\u0020' => 1
 * - '\u3020' => 1
 * - '\uD835' => 2
 * - '\uD835\uDDEF' => 2
 * - '\uDDEF' => 2
 *
 * @param {string} str Non-empty string
 * @param {number} pos Position in the string to look for one code unit
 * @return {number} Number 1 or 2
 */
function getUTF16Length(str, pos) {
  var codeUnit = str.charCodeAt(pos);
  if (isCodeUnitInSurrogateRange(codeUnit)) {
    return 2;
  }
  return 1;
}
/**
 * Unicode-aware replacement for String#length: counts Unicode characters
 * rather than UTF-16 code units.
 *
 * @param {string} str Valid Unicode string
 * @return {number} The number of Unicode characters in the string
 */
function strlen(str) {
  // Without surrogates every code unit is one character, so the native
  // length is already the answer.
  if (!hasSurrogateUnit(str)) {
    return str.length;
  }

  var count = 0;
  var offset = 0;
  while (offset < str.length) {
    count += 1;
    offset += getUTF16Length(str, offset);
  }
  return count;
}
/**
 * Unicode-aware replacement for String#substr(): `start` and `length` count
 * Unicode characters rather than UTF-16 code units.
 *
 * Fractional `start` / `length` values are truncated toward zero, as the
 * native String#substr() does, so a call behaves the same whether or not the
 * string contains surrogates.
 *
 * @param {string} str Valid Unicode string
 * @param {number} start Location in Unicode sequence to begin extracting;
 *                       a negative value counts back from the end
 * @param {?number} length The number of Unicode characters to extract
 *                         (default: to the end of the string)
 * @return {string} Extracted sub-string
 */
function substr(str, start, length) {
  // Missing/NaN values become 0 and fractions are dropped. Without the
  // truncation the countdown loops below would take one extra step for a
  // fractional value, disagreeing with the native fast path.
  start = Math.trunc(start) || 0;
  length = length === undefined ? Infinity : (Math.trunc(length) || 0);

  // Call the native functions if there's no surrogate char
  if (!hasSurrogateUnit(str)) {
    return str.substr(start, length);
  }

  // Obvious cases. `size` is in code units, an upper bound on the number of
  // characters, so `start > size` is certainly past the end.
  var size = str.length;
  if (size <= 0 || start > size || length <= 0) {
    return '';
  }

  // Find the actual starting position (in code units)
  var posA = 0;
  if (start > 0) {
    // Walk forward `start` characters from the beginning.
    for (; start > 0 && posA < size; start--) {
      posA += getUTF16Length(str, posA);
    }
    if (posA >= size) {
      return '';
    }
  } else if (start < 0) {
    // Walk backward `-start` characters from the end; the code unit just
    // before `posA` tells how wide the preceding character is.
    for (posA = size; start < 0 && 0 < posA; start++) {
      posA -= getUTF16Length(str, posA - 1);
    }
    if (posA < 0) {
      posA = 0;
    }
  }

  // Find the actual ending position (in code units)
  var posB = size;
  if (length < size) {
    for (posB = posA; length > 0 && posB < size; length--) {
      posB += getUTF16Length(str, posB);
    }
  }

  return str.substring(posA, posB);
}
/**
 * Unicode-aware replacement for String#substring(): `start` and `end` count
 * Unicode characters rather than UTF-16 code units.
 *
 * As with the native method, negative bounds are clamped to 0, the bounds
 * are swapped when `start` is greater than `end`, and fractional bounds are
 * truncated toward zero.
 *
 * @param {string} str Valid Unicode string
 * @param {number} start Location in Unicode sequence to begin extracting
 * @param {?number} end Location in Unicode sequence to end extracting
 *                      (default: end of the string)
 * @return {string} Extracted sub-string
 */
function substring(str, start, end) {
  // Truncate before deriving the length: computing `end - start` on raw
  // fractions (e.g. 0.5 and 2) would yield a length of 1.5 and lose a
  // character once it is rounded down later.
  start = Math.trunc(start) || 0;
  end = end === undefined ? Infinity : (Math.trunc(end) || 0);

  if (start < 0) {
    start = 0;
  }
  if (end < 0) {
    end = 0;
  }

  // Express the range as (lower bound, length) so substr() can do the work.
  var length = Math.abs(end - start);
  start = start < end ? start : end;
  return substr(str, start, length);
}
/**
 * Breaks a string into the numeric code points of its Unicode characters,
 * reading one code point per character (a surrogate pair yields one entry).
 *
 * @param {string} str Valid Unicode string
 * @return {array<number>} A list of code-points in [0..0x10FFFF]
 */
function getCodePoints(str) {
  var result = [];
  var offset = 0;
  while (offset < str.length) {
    result.push(str.codePointAt(offset));
    // Step over the whole character: 2 code units when at a surrogate.
    offset += getUTF16Length(str, offset);
  }
  return result;
}
// Public surface of the module: every helper defined above, attached in the
// same key order as before so enumeration order is unchanged.
var UnicodeUtils = {};
UnicodeUtils.getCodePoints = getCodePoints;
UnicodeUtils.getUTF16Length = getUTF16Length;
UnicodeUtils.hasSurrogateUnit = hasSurrogateUnit;
UnicodeUtils.isCodeUnitInSurrogateRange = isCodeUnitInSurrogateRange;
UnicodeUtils.isSurrogatePair = isSurrogatePair;
UnicodeUtils.strlen = strlen;
UnicodeUtils.substring = substring;
UnicodeUtils.substr = substr;
module.exports = UnicodeUtils;