aboutsummaryrefslogtreecommitdiff
path: root/node_modules/fbjs/lib/UnicodeHangulKorean.js.flow
blob: d184b580fdee7bec42e2c6f638cf5c6c691de5db (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/**
 * Copyright (c) 2013-present, Facebook, Inc.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 *
 * @providesModule UnicodeHangulKorean
 * @typechecks
 */

/**
 * Unicode algorithms for Hangul script, the Korean writing system
 *
 * Hangul script has three encoded models in Unicode:
 *
 * A) Conjoining Jamo (covers modern and historic elements)
 *    * U+1100..U+11FF ; Hangul Jamo
 *    * U+A960..U+A97F ; Hangul Jamo Extended-A
 *    * U+D7B0..U+D7FF ; Hangul Jamo Extended-B
 *
 * B) Conjoined Syllables (only covers modern Korean language)
 *    * U+AC00..U+D7AF ; Hangul Syllables
 *
 * C) Compatibility Jamo (one code-point for each "shape")
 *    * U+3130..U+318F ; Hangul Compatibility Jamo
 *
 * This module helps you convert characters from one model to another.
 * Primary functionalities are:
 *
 * 1) Convert from any encodings to Conjoining Jamo characters (A),
 *    e.g. for prefix matching
 *
 * 2) Convert from any encodings to Syllable characters, when possible (B),
 *    e.g. to reach the normal Unicode form (NFC)
 */

'use strict';

// Matches a single UTF-16 code unit in U+3130..U+318F or U+AC00..U+D7AF.
// The pattern carries no `g`/`y` flag, so matching keeps no state between calls.
const HANGUL_COMPATIBILITY_OR_SYLLABLE_REGEX = /[\u3130-\u318F\uAC00-\uD7AF]/;

/**
 * Tells whether the input contains at least one code unit from the Hangul
 * Compatibility Jamo block or the Hangul Syllables block.
 *
 * @param {string} str
 * @output {boolean}
 */
function hasCompatibilityOrSyllable(str) {
  const firstMatch = HANGUL_COMPATIBILITY_OR_SYLLABLE_REGEX.exec(str);
  return firstMatch !== null;
}

/* Compatibility Jamo -> Conjoining Jamo
 *
 * Maps a compatibility character to the Conjoining Jamo character,
 * positioned at (compatibilityCodePoint - 0x3131).
 *
 * Generated by:
 * $ grep '^31[3-8].;' UnicodeData.txt |\
 *     awk -F';' '{print $6}' | awk '{print "  0x"$2","}'
 */
// Entry i holds the Conjoining Jamo code point for compatibility code point
// (CBASE + i). Eight entries per row; the trailing comment gives the
// compatibility code point of the first entry in that row.
const CMAP = [
  0x1100, 0x1101, 0x11AA, 0x1102, 0x11AC, 0x11AD, 0x1103, 0x1104, // U+3131
  0x1105, 0x11B0, 0x11B1, 0x11B2, 0x11B3, 0x11B4, 0x11B5, 0x111A, // U+3139
  0x1106, 0x1107, 0x1108, 0x1121, 0x1109, 0x110A, 0x110B, 0x110C, // U+3141
  0x110D, 0x110E, 0x110F, 0x1110, 0x1111, 0x1112, 0x1161, 0x1162, // U+3149
  0x1163, 0x1164, 0x1165, 0x1166, 0x1167, 0x1168, 0x1169, 0x116A, // U+3151
  0x116B, 0x116C, 0x116D, 0x116E, 0x116F, 0x1170, 0x1171, 0x1172, // U+3159
  0x1173, 0x1174, 0x1175, 0x1160, 0x1114, 0x1115, 0x11C7, 0x11C8, // U+3161
  0x11CC, 0x11CE, 0x11D3, 0x11D7, 0x11D9, 0x111C, 0x11DD, 0x11DF, // U+3169
  0x111D, 0x111E, 0x1120, 0x1122, 0x1123, 0x1127, 0x1129, 0x112B, // U+3171
  0x112C, 0x112D, 0x112E, 0x112F, 0x1132, 0x1136, 0x1140, 0x1147, // U+3179
  0x114C, 0x11F1, 0x11F2, 0x1157, 0x1158, 0x1159, 0x1184, 0x1185, // U+3181
  0x1188, 0x1191, 0x1192, 0x1194, 0x119E, 0x11A1, // U+3189
];

// First mapped compatibility code point, number of mapped code points, and
// the exclusive upper bound of the mapped range.
const CBASE = 0x3131;
const CCOUNT = CMAP.length;
const CTOP = CBASE + CCOUNT;

/**
 * Looks up the Conjoining Jamo character that corresponds to a single Hangul
 * Compatibility Jamo code point, using the CMAP table.
 *
 * The caller is expected to pass a code point in [CBASE, CTOP); no range
 * check is performed here.
 *
 * @param {number} codePoint  One Unicode code-point
 * @output {string}
 */
function fromCompatibility(codePoint) {
  const tableIndex = codePoint - CBASE;
  const conjoiningCodePoint = CMAP[tableIndex];
  return String.fromCharCode(conjoiningCodePoint);
}

/**
 * Conjoined Syllable -> Conjoining Jamo
 *
 * Based on the "Hangul Syllable Decomposition" algorithm provided in
 * 3.12 Conjoining Jamo Behavior, The Unicode Standard, Version 6.3.0.
 * <http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf>
 */

// Base code points, named after the Hangul Syllable Decomposition algorithm
// (L = leading consonant, V = vowel, T = trailing consonant, S = syllable).
// TBASE sits one below the first trailing consonant so that a trailing index
// of 0 can stand for "no trailing consonant".
const LBASE = 0x1100;
const VBASE = 0x1161;
const TBASE = 0x11A7;
const SBASE = 0xAC00;

// Number of distinct values for each component.
const LCOUNT = 19;
const VCOUNT = 21;
const TCOUNT = 28;

// Syllables per leading consonant, total syllables, and the exclusive upper
// bound of the syllable range.
const NCOUNT = VCOUNT * TCOUNT;
const SCOUNT = LCOUNT * NCOUNT;
const STOP = SBASE + SCOUNT;

/**
 * Splits one Hangul Syllable code-point into its Conjoining Jamo sequence:
 * a leading consonant, a vowel and, when present, a trailing consonant.
 * The pieces are computed arithmetically from the syllable's offset to SBASE.
 *
 * The caller is expected to pass a code point in [SBASE, STOP); no range
 * check is performed here.
 *
 * @param {number} codePoint  One Unicode character
 * @output {string}
 */
function decomposeSyllable(codePoint) {
  const syllableIndex = codePoint - SBASE;
  const leadIndex = Math.floor(syllableIndex / NCOUNT);
  const vowelIndex = Math.floor((syllableIndex % NCOUNT) / TCOUNT);
  const trailIndex = syllableIndex % TCOUNT;

  let jamo = String.fromCharCode(LBASE + leadIndex, VBASE + vowelIndex);
  if (trailIndex > 0) {
    jamo += String.fromCharCode(TBASE + trailIndex);
  }
  return jamo;
}

/* To Conjoining Jamo */

/**
 * Rewrites every Hangul Compatibility Jamo and Hangul Syllable found in the
 * input as Conjoining Jamo. Every other UTF-16 code unit is copied through
 * unchanged.
 *
 * @param {string} string
 * @output {string}
 */
function toConjoiningJamo(string) {
  // Fast path: nothing to convert, so hand back the very same value.
  if (!hasCompatibilityOrSyllable(string)) {
    return string;
  }

  let converted = '';
  for (let index = 0; index < string.length; index++) {
    const unit = string.charAt(index);
    const code = unit.charCodeAt(0);

    if (code >= CBASE && code < CTOP) {
      // Compatibility Jamo covered by the lookup table.
      converted += fromCompatibility(code);
    } else if (code >= SBASE && code < STOP) {
      // Precomposed syllable: expand into its jamo sequence.
      converted += decomposeSyllable(code);
    } else {
      converted += unit;
    }
  }
  return converted;
}

// Public surface of this module: only the conversion to Conjoining Jamo is
// exported; the helpers above stay module-private.
const UnicodeHangulKorean = {
  toConjoiningJamo,
};

module.exports = UnicodeHangulKorean;