•
阅读 3 分钟
编码字符串为 UTF8
JavaScript 字符串是“零个或多个 16 位无符号整数值的有限有序序列”。通常,这些整数值是 UTF-16 编码单位。UTF-16 编码对 U+0000 至 U+FFFF 的 Unicode 字符使用一个 16 位单位,对 U+10000 至 U+10FFFF 的字符使用两个单位。不幸的是,所有常用的字符串函数 length
、charAt
、charCodeAt
都是针对这些编码单位定义的,因此诸如 𝄞 (U+1D11E MUSICAL SYMBOL G CLEF) 这样的字符会被表示为一对代理字符(surrogate pair)。这个小细节使得对字符串进行操作变得复杂。
这个 JavaScript 函数将字符串编码为 UTF-8 整数数组,同时将代理字符对考虑在内:
/**
 * Encode a JavaScript (UTF-16) string as an array of UTF-8 byte values.
 *
 * Valid surrogate pairs are combined into a single code point and emitted
 * as a 4-byte sequence. Unpaired surrogates (a high surrogate that is not
 * followed by a low surrogate, or a low surrogate on its own) cannot be
 * represented in UTF-8, so they are replaced with U+FFFD REPLACEMENT
 * CHARACTER (bytes EF BF BD) and the following code unit is NOT consumed.
 *
 * See: https://gist.github.com/joni/3760795
 * @param str {string} string to encode
 * @returns {Array} UTF8 array (numbers in the range 0-255)
 */
function toUTF8Array(str) {
  const utf8 = []
  for (let i = 0; i < str.length; i++) {
    let charcode = str.charCodeAt(i)
    if (charcode < 0x80) {
      // 1 byte: 0xxxxxxx
      utf8.push(charcode)
    }
    else if (charcode < 0x800) {
      // 2 bytes: 110xxxxx 10xxxxxx
      utf8.push(0xC0 | (charcode >> 6), 0x80 | (charcode & 0x3F))
    }
    else if (charcode < 0xD800 || charcode >= 0xE000) {
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      utf8.push(
        0xE0 | (charcode >> 12),
        0x80 | ((charcode >> 6) & 0x3F),
        0x80 | (charcode & 0x3F)
      )
    }
    else {
      // charcode is a surrogate (0xD800-0xDFFF). It only forms a pair when
      // it is a HIGH surrogate immediately followed by a LOW surrogate.
      const next = i + 1 < str.length ? str.charCodeAt(i + 1) : 0
      const isPair = charcode < 0xDC00 && next >= 0xDC00 && next < 0xE000
      if (isPair) {
        i++
        // UTF-16 encodes 0x10000-0x10FFFF by
        // subtracting 0x10000 and splitting the
        // 20 bits of 0x0-0xFFFFF into two halves
        charcode = 0x10000 + (((charcode & 0x3FF) << 10) | (next & 0x3FF))
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        utf8.push(
          0xF0 | (charcode >> 18),
          0x80 | ((charcode >> 12) & 0x3F),
          0x80 | ((charcode >> 6) & 0x3F),
          0x80 | (charcode & 0x3F)
        )
      }
      else {
        // Unpaired surrogate: emit U+FFFD and leave the next code unit
        // to be processed normally by the next loop iteration.
        utf8.push(0xEF, 0xBF, 0xBD)
      }
    }
  }
  return utf8
}
基于此,可以将字符串编码为 UTF-8 字符串(即结果中的每个字符的码值对应一个 UTF-8 字节):
/**
 * Encode a JavaScript (UTF-16) string as a UTF-8 "byte string": the result
 * contains one character per UTF-8 byte, each with a char code in 0-255.
 *
 * Valid surrogate pairs are combined into a single code point and emitted
 * as a 4-byte sequence. Unpaired surrogates (a high surrogate that is not
 * followed by a low surrogate, or a low surrogate on its own) cannot be
 * represented in UTF-8, so they are replaced with U+FFFD REPLACEMENT
 * CHARACTER (bytes EF BF BD) and the following code unit is NOT consumed.
 *
 * @param str {string} string to encode
 * @returns {string} UTF8 string (one character per encoded byte)
 */
function toUTF8String(str) {
  let utf8Str = ''
  for (let i = 0; i < str.length; i++) {
    let charCode = str.charCodeAt(i)
    if (charCode < 0x0080) {
      // 1 byte: 0xxxxxxx
      utf8Str += String.fromCharCode(charCode)
    } else if (charCode < 0x0800) {
      // 2 bytes: 110xxxxx 10xxxxxx
      utf8Str += String.fromCharCode(
        0xC0 | (charCode >> 6),
        0x80 | (charCode & 0x3F)
      )
    } else if (charCode < 0xD800 || charCode >= 0xE000) {
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      utf8Str += String.fromCharCode(
        0xE0 | (charCode >> 12),
        0x80 | ((charCode >> 6) & 0x3F),
        0x80 | (charCode & 0x3F)
      )
    } else {
      // charCode is a surrogate (0xD800-0xDFFF). It only forms a pair when
      // it is a HIGH surrogate immediately followed by a LOW surrogate.
      const next = i + 1 < str.length ? str.charCodeAt(i + 1) : 0
      const isPair = charCode < 0xDC00 && next >= 0xDC00 && next < 0xE000
      if (isPair) {
        i++
        // UTF-16 encodes 0x10000-0x10FFFF by
        // subtracting 0x10000 and splitting the
        // 20 bits of 0x0-0xFFFFF into two halves
        charCode = 0x10000 + (((charCode & 0x3FF) << 10) | (next & 0x3FF))
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        utf8Str += String.fromCharCode(
          0xF0 | (charCode >> 18),
          0x80 | ((charCode >> 12) & 0x3F),
          0x80 | ((charCode >> 6) & 0x3F),
          0x80 | (charCode & 0x3F)
        )
      } else {
        // Unpaired surrogate: emit U+FFFD and leave the next code unit
        // to be processed normally by the next loop iteration.
        utf8Str += String.fromCharCode(0xEF, 0xBF, 0xBD)
      }
    }
  }
  return utf8Str
}
> cd ..