diff options
Diffstat (limited to 'node_modules/fbjs/lib/UnicodeUtils.js')
-rw-r--r-- | node_modules/fbjs/lib/UnicodeUtils.js | 214 |
1 files changed, 214 insertions, 0 deletions
// Extracted from a diff adding node_modules/fbjs/lib/UnicodeUtils.js
// (new file mode 100644, index 000000000..f192b5219).
/**
 * Copyright (c) 2013-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree. An additional grant
 * of patent rights can be found in the PATENTS file in the same directory.
 *
 * @typechecks
 */

/**
 * Unicode-enabled replacements for basic String functions.
 *
 * All the functions in this module assume that the input string is a valid
 * UTF-16 encoding of a Unicode sequence. If it's not the case, the behavior
 * will be undefined.
 *
 * WARNING: Since this module is typechecks-enforced, you may find new bugs
 * when replacing normal String functions with ones provided here.
 */

'use strict';

var invariant = require('./invariant');

// These two ranges are consecutive so anything in [HIGH_START, LOW_END] is a
// surrogate code unit.
var SURROGATE_HIGH_START = 0xD800;
var SURROGATE_HIGH_END = 0xDBFF;
var SURROGATE_LOW_START = 0xDC00;
var SURROGATE_LOW_END = 0xDFFF;
// Deliberately not global/sticky, so `.test()` carries no `lastIndex` state.
var SURROGATE_UNITS_REGEX = /[\uD800-\uDFFF]/;

/**
 * Truncates a number toward zero, mirroring the integer conversion the native
 * String methods apply to their index arguments. Infinities pass through
 * unchanged. Module-private helper.
 *
 * @param {number} value
 * @return {number} `value` with any fractional part dropped
 */
function toInteger(value) {
  return value < 0 ? Math.ceil(value) : Math.floor(value);
}

/**
 * @param {number} codeUnit A UTF-16 code unit, in range [0, 0xFFFF]
 * @return {boolean} Whether code-unit is in a surrogate (hi/low) range
 */
function isCodeUnitInSurrogateRange(codeUnit) {
  return SURROGATE_HIGH_START <= codeUnit && codeUnit <= SURROGATE_LOW_END;
}

/**
 * Returns whether the two characters starting at `index` form a surrogate pair.
 * For example, given the string s = "\uD83D\uDE0A", (s, 0) returns true and
 * (s, 1) returns false.
 *
 * An `index` outside [0, str.length) trips the `invariant` check.
 *
 * @param {string} str
 * @param {number} index
 * @return {boolean}
 */
function isSurrogatePair(str, index) {
  if (!(0 <= index && index < str.length)) {
    // The descriptive message is only passed outside production builds.
    if (process.env.NODE_ENV !== 'production') {
      invariant(false, 'isSurrogatePair: Invalid index %s for string length %s.', index, str.length);
    } else {
      invariant(false);
    }
  }
  // The last code unit has no successor, so it cannot start a pair.
  if (index + 1 === str.length) {
    return false;
  }
  var first = str.charCodeAt(index);
  var second = str.charCodeAt(index + 1);
  return SURROGATE_HIGH_START <= first && first <= SURROGATE_HIGH_END &&
    SURROGATE_LOW_START <= second && second <= SURROGATE_LOW_END;
}

/**
 * @param {string} str Non-empty string
 * @return {boolean} True if the input includes any surrogate code units
 */
function hasSurrogateUnit(str) {
  return SURROGATE_UNITS_REGEX.test(str);
}

/**
 * Return the length of the original Unicode character at given position in the
 * String by looking into the UTF-16 code unit; that is equal to 1 for any
 * non-surrogate characters in BMP ([U+0000..U+D7FF] and [U+E000, U+FFFF]); and
 * returns 2 for the hi/low surrogates ([U+D800..U+DFFF]), which are in fact
 * representing non-BMP characters ([U+10000..U+10FFFF]).
 *
 * Only the single code unit at `pos` is inspected, so a lone surrogate also
 * yields 2.
 *
 * Examples:
 * - '\u0020' => 1
 * - '\u3020' => 1
 * - '\uD835' => 2
 * - '\uD835\uDDEF' => 2
 * - '\uDDEF' => 2
 *
 * @param {string} str Non-empty string
 * @param {number} pos Position in the string to look for one code unit
 * @return {number} Number 1 or 2
 */
function getUTF16Length(str, pos) {
  return isCodeUnitInSurrogateRange(str.charCodeAt(pos)) ? 2 : 1;
}

/**
 * Fully Unicode-enabled replacement for String#length
 *
 * @param {string} str Valid Unicode string
 * @return {number} The number of Unicode characters in the string
 */
function strlen(str) {
  // Call the native functions if there's no surrogate char
  if (!hasSurrogateUnit(str)) {
    return str.length;
  }

  var len = 0;
  for (var pos = 0; pos < str.length; pos += getUTF16Length(str, pos)) {
    len++;
  }
  return len;
}

/**
 * Fully Unicode-enabled replacement for String#substr()
 *
 * Fractional `start` / `length` values are truncated toward zero, so the
 * surrogate-aware path and the native fast path agree with each other.
 *
 * @param {string} str Valid Unicode string
 * @param {number} start Location in Unicode sequence to begin extracting;
 *   a negative value counts back from the end of the string
 * @param {?number} length The number of Unicode characters to extract
 *   (default: to the end of the string)
 * @return {string} Extracted sub-string
 */
function substr(str, start, length) {
  start = toInteger(start || 0);
  length = length === undefined ? Infinity : toInteger(length || 0);

  // Call the native functions if there's no surrogate char
  if (!hasSurrogateUnit(str)) {
    return str.substr(start, length);
  }

  // Obvious cases
  var size = str.length;
  if (size <= 0 || start > size || length <= 0) {
    return '';
  }

  // Find the actual starting position
  var posA = 0;
  if (start > 0) {
    // Walk forward `start` characters from the beginning.
    for (; start > 0 && posA < size; start--) {
      posA += getUTF16Length(str, posA);
    }
    if (posA >= size) {
      return '';
    }
  } else if (start < 0) {
    // Walk backward `-start` characters from the end.
    for (posA = size; start < 0 && 0 < posA; start++) {
      posA -= getUTF16Length(str, posA - 1);
    }
    if (posA < 0) {
      posA = 0;
    }
  }

  // Find the actual ending position
  var posB = size;
  // A character count >= the code-unit count always reaches the end.
  if (length < size) {
    for (posB = posA; length > 0 && posB < size; length--) {
      posB += getUTF16Length(str, posB);
    }
  }

  return str.substring(posA, posB);
}

/**
 * Fully Unicode-enabled replacement for String#substring()
 *
 * Negative bounds are clamped to 0, fractional bounds are truncated toward
 * zero, and the bounds are swapped when `start` is greater than `end`.
 *
 * @param {string} str Valid Unicode string
 * @param {number} start Location in Unicode sequence to begin extracting
 * @param {?number} end Location in Unicode sequence to end extracting
 *   (default: end of the string)
 * @return {string} Extracted sub-string
 */
function substring(str, start, end) {
  start = toInteger(start || 0);
  end = end === undefined ? Infinity : toInteger(end || 0);

  if (start < 0) {
    start = 0;
  }
  if (end < 0) {
    end = 0;
  }

  var length = Math.abs(end - start);
  start = start < end ? start : end;
  return substr(str, start, length);
}

/**
 * Get a list of Unicode code-points from a String
 *
 * @param {string} str Valid Unicode string
 * @return {array<number>} A list of code-points in [0..0x10FFFF]
 */
function getCodePoints(str) {
  var codePoints = [];
  for (var pos = 0; pos < str.length; pos += getUTF16Length(str, pos)) {
    codePoints.push(str.codePointAt(pos));
  }
  return codePoints;
}

var UnicodeUtils = {
  getCodePoints: getCodePoints,
  getUTF16Length: getUTF16Length,
  hasSurrogateUnit: hasSurrogateUnit,
  isCodeUnitInSurrogateRange: isCodeUnitInSurrogateRange,
  isSurrogatePair: isSurrogatePair,
  strlen: strlen,
  substring: substring,
  substr: substr
};

module.exports = UnicodeUtils;
\ No newline at end of file |