2017-05-03 15:35:00 +02:00
/ * *
* Copyright ( c ) 2013 - present , Facebook , Inc .
*
2017-10-14 18:40:54 +02:00
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree .
2017-05-03 15:35:00 +02:00
*
* @ typechecks
* /
/**
 * Unicode-enabled replacements for basic String functions.
 *
 * All the functions in this module assume that the input string is a valid
 * UTF-16 encoding of a Unicode sequence. If it's not the case, the behavior
 * will be undefined.
 *
 * WARNING: Since this module is typechecks-enforced, you may find new bugs
 * when replacing normal String functions with ones provided here.
 */
'use strict' ;
var invariant = require ( './invariant' ) ;
// Boundaries of the UTF-16 surrogate code-unit ranges. The high range
// [0xD800, 0xDBFF] is immediately followed by the low range [0xDC00, 0xDFFF],
// so any value in [SURROGATE_HIGH_START, SURROGATE_LOW_END] is a surrogate
// code unit.
var SURROGATE_HIGH_START = 0xD800; // first high (leading) surrogate
var SURROGATE_HIGH_END = 0xDBFF; // last high (leading) surrogate
var SURROGATE_LOW_START = 0xDC00; // first low (trailing) surrogate
var SURROGATE_LOW_END = 0xDFFF; // last low (trailing) surrogate

// Matches a single surrogate code unit, high or low.
var SURROGATE_UNITS_REGEX = /[\uD800-\uDFFF]/;
/**
 * Tests whether a numeric code-unit value lies anywhere in the combined
 * surrogate range, i.e. between SURROGATE_HIGH_START and SURROGATE_LOW_END
 * inclusive. High and low surrogates are not distinguished.
 *
 * @param {number} codeUnit A UTF-16 code-unit value
 * @return {boolean} Whether code-unit is in a surrogate (hi/low) range
 */
function isCodeUnitInSurrogateRange(codeUnit) {
  return codeUnit >= SURROGATE_HIGH_START && codeUnit <= SURROGATE_LOW_END;
}
/**
 * Reports whether the code unit at `index` is a high surrogate immediately
 * followed by a low surrogate. For example, with s = "\uD83D\uDE0A",
 * (s, 0) yields true and (s, 1) yields false.
 *
 * `index` must satisfy 0 <= index < str.length; otherwise `invariant(false)`
 * is invoked (with a descriptive message outside production builds).
 *
 * @param {string} str
 * @param {number} index
 * @return {boolean}
 */
function isSurrogatePair(str, index) {
  // The negated form is kept on purpose: it also rejects NaN indices.
  if (!(0 <= index && index < str.length)) {
    if (process.env.NODE_ENV !== 'production') {
      invariant(false, 'isSurrogatePair: Invalid index %s for string length %s.', index, str.length);
    } else {
      invariant(false);
    }
  }

  // The last code unit has no successor, so it cannot start a pair.
  if (index + 1 === str.length) {
    return false;
  }

  var lead = str.charCodeAt(index);
  var trail = str.charCodeAt(index + 1);

  if (!(SURROGATE_HIGH_START <= lead && lead <= SURROGATE_HIGH_END)) {
    return false;
  }
  return SURROGATE_LOW_START <= trail && trail <= SURROGATE_LOW_END;
}
/**
 * Checks the string against SURROGATE_UNITS_REGEX to see whether it holds at
 * least one surrogate code unit (high or low).
 *
 * @param {string} str Non-empty string
 * @return {boolean} True if the input includes any surrogate code units
 */
function hasSurrogateUnit(str) {
  var match = SURROGATE_UNITS_REGEX.exec(str);
  return match !== null;
}
/**
 * Gives the number of UTF-16 code units attributed to the character at the
 * given position, judged solely from the code unit found there: 1 when that
 * unit is outside the surrogate range (BMP characters, [U+0000..U+D7FF] and
 * [U+E000..U+FFFF]), and 2 when it is a high or low surrogate
 * ([U+D800..U+DFFF]), since surrogates stand for non-BMP characters
 * ([U+10000..U+10FFFF]).
 *
 * Examples:
 * - '\u0020' => 1
 * - '\u3020' => 1
 * - '\uD835' => 2
 * - '\uD835\uDDEF' => 2
 * - '\uDDEF' => 2
 *
 * @param {string} str Non-empty string
 * @param {number} pos Position in the string to look for one code unit
 * @return {number} Number 1 or 2
 */
function getUTF16Length(str, pos) {
  var codeUnit = str.charCodeAt(pos);
  return isCodeUnitInSurrogateRange(codeUnit) ? 2 : 1;
}
/**
 * Unicode-aware counterpart of String#length: counts characters rather than
 * UTF-16 code units, stepping through the string with getUTF16Length.
 *
 * @param {string} str Valid Unicode string
 * @return {number} The number of Unicode characters in the string
 */
function strlen(str) {
  if (hasSurrogateUnit(str)) {
    var count = 0;
    var pos = 0;
    while (pos < str.length) {
      pos += getUTF16Length(str, pos);
      count += 1;
    }
    return count;
  }

  // No surrogate units: every code unit is one character, so the native
  // length is already correct.
  return str.length;
}
/**
 * Fully Unicode-enabled replacement for String#substr()
 *
 * `start` and `length` are counted in Unicode characters rather than UTF-16
 * code units. A negative `start` counts back from the end of the string.
 * Fractional `start` / `length` values are truncated toward zero, matching
 * what the native String#substr() fast path does for strings without
 * surrogates.
 *
 * @param {string} str Valid Unicode string
 * @param {number} start Location in Unicode sequence to begin extracting
 * @param {?number} length The number of Unicode characters to extract
 * (default: to the end of the string)
 * @return {string} Extracted sub-string
 */
function substr(str, start, length) {
  start = start || 0;
  length = length === undefined ? Infinity : length || 0;

  // Call the native functions if there's no surrogate char
  if (!hasSurrogateUnit(str)) {
    return str.substr(start, length);
  }

  // Truncate toward zero. Without this, the counting loops below would treat
  // e.g. 1.5 as 2 (they iterate while the counter is > 0), disagreeing with
  // the native fast path above, which truncates.
  start = start < 0 ? Math.ceil(start) : Math.floor(start);
  length = length < 0 ? Math.ceil(length) : Math.floor(length);

  // Obvious cases
  var size = str.length;
  if (size <= 0 || start > size || length <= 0) {
    return '';
  }

  // Find the actual starting position (as a code-unit offset)
  var posA = 0;
  if (start > 0) {
    for (; start > 0 && posA < size; start--) {
      posA += getUTF16Length(str, posA);
    }
    if (posA >= size) {
      return '';
    }
  } else if (start < 0) {
    // Walk backwards from the end, one character per step.
    for (posA = size; start < 0 && 0 < posA; start++) {
      posA -= getUTF16Length(str, posA - 1);
    }
    if (posA < 0) {
      posA = 0;
    }
  }

  // Find the actual ending position (as a code-unit offset). A character
  // count of at least `size` necessarily reaches the end of the string.
  var posB = size;
  if (length < size) {
    for (posB = posA; length > 0 && posB < size; length--) {
      posB += getUTF16Length(str, posB);
    }
  }

  return str.substring(posA, posB);
}
/**
 * Unicode-aware counterpart of String#substring(). Negative bounds are
 * clamped to 0, and the smaller of the two bounds is used as the starting
 * point, so passing them in reverse order gives the same result. The
 * extraction itself is delegated to substr().
 *
 * @param {string} str Valid Unicode string
 * @param {number} start Location in Unicode sequence to begin extracting
 * @param {?number} end Location in Unicode sequence to end extracting
 * (default: end of the string)
 * @return {string} Extracted sub-string
 */
function substring(str, start, end) {
  var from = start || 0;
  var to = end === undefined ? Infinity : end || 0;

  from = from < 0 ? 0 : from;
  to = to < 0 ? 0 : to;

  var count = Math.abs(to - from);
  var begin = from < to ? from : to;
  return substr(str, begin, count);
}
/**
 * Collects the Unicode code points of a string, in order. The string is
 * walked with getUTF16Length so that a character occupying two code units
 * contributes a single entry.
 *
 * @param {string} str Valid Unicode string
 * @return {array<number>} A list of code-points in [0..0x10FFFF]
 */
function getCodePoints(str) {
  var result = [];
  var pos = 0;
  while (pos < str.length) {
    result.push(str.codePointAt(pos));
    pos += getUTF16Length(str, pos);
  }
  return result;
}
// Public surface of the module: every helper defined above, exposed under its
// own name.
var UnicodeUtils = {
  getCodePoints: getCodePoints,
  getUTF16Length: getUTF16Length,
  hasSurrogateUnit: hasSurrogateUnit,
  isCodeUnitInSurrogateRange: isCodeUnitInSurrogateRange,
  isSurrogatePair: isSurrogatePair,
  strlen: strlen,
  substring: substring,
  substr: substr
};

module.exports = UnicodeUtils;