2024 年 11 月 4 日 • 阅读 3 分钟

编码字符串为 UTF8

JavaScript 字符串是“零个或多个 16 位无符号整数值的有限有序序列”。通常，这些整数值是 UTF-16 码元（code unit）。UTF-16 编码对 U+0000 至 U+FFFF 的 Unicode 字符使用一个 16 位码元，对 U+10000 至 U+10FFFF 的字符使用两个码元。不幸的是，常用的字符串属性和方法（length、charAt、charCodeAt）都是针对这些码元定义的，因此诸如 𝄞 (U+1D11E MUSICAL SYMBOL G CLEF) 这样的字符会被表示为一对代理字符（即代理对，surrogate pair）。这个小细节使得对字符串进行操作变得复杂。

这个 JavaScript 函数将字符串编码为 UTF-8 字节值数组（每个元素是 0–255 的整数），同时将代理对考虑在内：

/**
 * Encode a JavaScript (UTF-16) string as an array of UTF-8 byte values.
 *
 * A well-formed surrogate pair is combined into one code point and emitted
 * as a 4-byte sequence. An unpaired surrogate (a high surrogate that is not
 * followed by a low surrogate, or a low surrogate on its own) has no UTF-8
 * representation, so it is encoded as U+FFFD REPLACEMENT CHARACTER
 * (EF BF BD) — the same bytes `TextEncoder` produces for such input.
 *
 * See: https://gist.github.com/joni/3760795
 * @param str {string} String to encode.
 * @returns {Array} UTF8 array: one integer in the range 0..255 per byte.
 */
function toUTF8Array(str) {
  const utf8 = []
  for (let i = 0; i < str.length; i++) {
    let charcode = str.charCodeAt(i)

    // Resolve surrogates first so the branches below only see code points.
    if (charcode >= 0xD800 && charcode <= 0xDFFF) {
      const next = i + 1 < str.length ? str.charCodeAt(i + 1) : 0
      const isHigh = charcode <= 0xDBFF
      const nextIsLow = next >= 0xDC00 && next <= 0xDFFF
      if (isHigh && nextIsLow) {
        // UTF-16 encodes 0x10000-0x10FFFF by
        // subtracting 0x10000 and splitting the
        // 20 bits of 0x0-0xFFFFF into two halves
        charcode = 0x10000 + (((charcode & 0x3FF) << 10) | (next & 0x3FF))
        // The low surrogate has been consumed as part of this pair.
        i++
      }
      else {
        // Unpaired surrogate: do not consume the following code unit.
        charcode = 0xFFFD
      }
    }

    if (charcode < 0x80) {
      utf8.push(charcode)
    }
    else if (charcode < 0x800) {
      utf8.push(0xC0 | (charcode >> 6), 0x80 | (charcode & 0x3F))
    }
    else if (charcode < 0x10000) {
      utf8.push(
        0xE0 | (charcode >> 12),
        0x80 | ((charcode >> 6) & 0x3F),
        0x80 | (charcode & 0x3F)
      )
    }
    else {
      utf8.push(
        0xF0 | (charcode >> 18),
        0x80 | ((charcode >> 12) & 0x3F),
        0x80 | ((charcode >> 6) & 0x3F),
        0x80 | (charcode & 0x3F)
      )
    }
  }
  return utf8
}

基于此，可以将字符串编码为 UTF-8 字符串，即结果中每个字符的码元值（0–255）对应一个 UTF-8 字节：

/**
 * Encode a JavaScript (UTF-16) string as a UTF-8 "byte string": a string in
 * which every character's code unit value (0..255) is one UTF-8 byte.
 *
 * A well-formed surrogate pair is combined into one code point and emitted
 * as four bytes. An unpaired surrogate (a high surrogate that is not
 * followed by a low surrogate, or a low surrogate on its own) has no UTF-8
 * representation, so it is encoded as U+FFFD REPLACEMENT CHARACTER
 * (bytes EF BF BD) — the same bytes `TextEncoder` produces for such input.
 *
 * @param str {string} String to encode.
 * @returns {string} UTF8 string, one character per encoded byte.
 */
function toUTF8String(str) {
  let utf8Str = ''
  for (let i = 0; i < str.length; i++) {
    let charCode = str.charCodeAt(i)

    // Resolve surrogates first so the branches below only see code points.
    if (charCode >= 0xD800 && charCode <= 0xDFFF) {
      const next = i + 1 < str.length ? str.charCodeAt(i + 1) : 0
      const isHigh = charCode <= 0xDBFF
      const nextIsLow = next >= 0xDC00 && next <= 0xDFFF
      if (isHigh && nextIsLow) {
        // UTF-16 encodes 0x10000-0x10FFFF by
        // subtracting 0x10000 and splitting the
        // 20 bits of 0x0-0xFFFFF into two halves
        charCode = 0x10000 + (((charCode & 0x3FF) << 10) | (next & 0x3FF))
        // The low surrogate has been consumed as part of this pair.
        i++
      } else {
        // Unpaired surrogate: do not consume the following code unit.
        charCode = 0xFFFD
      }
    }

    if (charCode < 0x0080) {
      utf8Str += String.fromCharCode(charCode)
    } else if (charCode < 0x0800) {
      utf8Str += String.fromCharCode(
        0xC0 | (charCode >> 6),
        0x80 | (charCode & 0x3F)
      )
    } else if (charCode < 0x10000) {
      utf8Str += String.fromCharCode(
        0xE0 | (charCode >> 12),
        0x80 | ((charCode >> 6) & 0x3F),
        0x80 | (charCode & 0x3F)
      )
    } else {
      utf8Str += String.fromCharCode(
        0xF0 | (charCode >> 18),
        0x80 | ((charCode >> 12) & 0x3F),
        0x80 | ((charCode >> 6) & 0x3F),
        0x80 | (charCode & 0x3F)
      )
    }
  }
  return utf8Str
}

> cd ..