如何将UTF8字符串转换为字节数组？

don kaka 发表于 Dev

唐卡卡

`.charCodeAt` 函数返回的是字符的 Unicode 编码值（即 UTF-16 码元）。但我想得到的是字节数组。我知道，如果字符编码大于 127，该字符在 UTF-8 中会占用两个或更多字节。

// Collect the UTF-16 code unit found at every index of `str` into `arr`.
// Note: these are code units, not UTF-8 bytes.
var arr = [];
var i = 0;
while (i < str.length) {
    arr.push(str.charCodeAt(i));
    i += 1;
}

乔尼

以UTF-8编码Unicode的逻辑基本上是：

每个字符最多使用 4 个字节，并且总是使用尽可能少的字节数。
U+007F 及以下的字符用单个字节编码。
对于多字节序列，第一个字节中前导 1 位的个数表示该字符占用的字节数；第一个字节的其余位用于存放字符的编码位。
后续字节（continuation byte）以 10 开头，其余 6 位用于存放字符的编码位。

这是我不久前写的一个函数，用于以UTF-8编码JavaScript UTF-16字符串：

/**
 * Encodes a JavaScript (UTF-16) string as an array of UTF-8 byte values.
 *
 * Valid surrogate pairs are combined into a single code point and encoded
 * as four bytes. Unpaired surrogates (a high surrogate not followed by a
 * low surrogate, or a low surrogate on its own) cannot be represented in
 * UTF-8, so each one is encoded as U+FFFD REPLACEMENT CHARACTER
 * (bytes EF BF BD).
 *
 * @param {string} str - The string to encode.
 * @returns {number[]} The UTF-8 bytes, each in the range 0-255, in order.
 */
function toUTF8Array(str) {
    var utf8 = [];
    for (var i = 0; i < str.length; i++) {
        var charcode = str.charCodeAt(i);

        // High surrogate: only combine when a low surrogate really follows.
        if (charcode >= 0xd800 && charcode < 0xdc00 && i + 1 < str.length) {
            var low = str.charCodeAt(i + 1);
            if (low >= 0xdc00 && low < 0xe000) {
                // UTF-16 encodes 0x10000-0x10FFFF by subtracting 0x10000
                // and splitting the remaining 20 bits into two 10-bit halves.
                charcode = 0x10000 + (((charcode & 0x3ff) << 10) | (low & 0x3ff));
                i++; // the low surrogate has been consumed
            }
        }

        // Anything still in the surrogate range is unpaired; substitute
        // U+FFFD instead of emitting an invalid byte sequence.
        if (charcode >= 0xd800 && charcode < 0xe000) {
            charcode = 0xfffd;
        }

        if (charcode < 0x80) {
            // 1 byte: 0xxxxxxx
            utf8.push(charcode);
        } else if (charcode < 0x800) {
            // 2 bytes: 110xxxxx 10xxxxxx
            utf8.push(0xc0 | (charcode >> 6),
                      0x80 | (charcode & 0x3f));
        } else if (charcode < 0x10000) {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            utf8.push(0xe0 | (charcode >> 12),
                      0x80 | ((charcode >> 6) & 0x3f),
                      0x80 | (charcode & 0x3f));
        } else {
            // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            utf8.push(0xf0 | (charcode >> 18),
                      0x80 | ((charcode >> 12) & 0x3f),
                      0x80 | ((charcode >> 6) & 0x3f),
                      0x80 | (charcode & 0x3f));
        }
    }
    return utf8;
}