java uint8array_uint8array和string的互轉

互轉的方法相信網上一搜有一大堆，都是比較簡單的互轉沒有考慮到中文或者是偏僻的中文。

理論上來說，互轉的話，轉過去再轉回來應該是同一個東西，列印的內容應該一致，我們來嘗試一下網上給出的方法：

/**
 * Naive byte-array-to-string conversion: every element of `fileData` is
 * treated as one UTF-16 code unit and the resulting characters are joined.
 * No UTF-8 decoding is performed.
 */
function Uint8ArrayToString(fileData){
    var pieces = [];
    var idx = 0;
    while (idx < fileData.length) {
        pieces.push(String.fromCharCode(fileData[idx]));
        idx += 1;
    }
    return pieces.join("");
}

/**
 * Naive string-to-byte-array conversion: stores the UTF-16 code unit of each
 * position of `str` into a Uint8Array of the same length. Code units above
 * 0xFF are truncated to their low 8 bits by the typed array.
 */
function stringToUint8Array(str){
    var total = str.length;
    var bytes = new Uint8Array(total);
    for (var pos = 0; pos < total; pos += 1) {
        bytes[pos] = str.charCodeAt(pos);
    }
    return bytes;
}

我們的實驗代碼也很簡單：

// Round-trip experiment: convert a string to bytes with the naive helper,
// convert it back, and print both values plus whether they are identical.
var before= "𠮷中a";

// The input must be `before`; the previously referenced name `origin` is
// never defined and would raise a ReferenceError.
var after = Uint8ArrayToString(stringToUint8Array(before))

console.log(before,"===",after,before===after);

列印結果：

java uint8array_uint8array和string的互轉

結果

什麼鬼！！完全不一緻。

不過要是我們的字元串中隻有簡單字元，這種轉換也夠我們使用了。

在解開謎題之前，我們先複習一下功課。

總結成圖表就是下面這個：

二進制表示

UTF-8：

1位元組 0xxxxxxx 提供7個有效位

2位元組 110xxxxx 10xxxxxx 提供11個有效位

3位元組 1110xxxx 10xxxxxx 10xxxxxx 提供16個有效位

4位元組 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 提供21個有效位；下面兩種（5、6位元組）已被RFC 3629從UTF-8標準中移除，實際上不大用到

5位元組 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 提供26個有效位

6位元組 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 提供31個有效位

UTF-16：

小于等于0xFFFF的字元

直接用一個16位的UTF-16編碼單元（範圍0x0000～0xFFFF）存放碼點

大于0xFFFF的字元需要把碼點減去0x10000,然後再填充到下面兩個UTF-16中，即32位

1101 10xx xxxxxxxx 1101 11xx xxxxxxxx

即第一個開頭必須是1101 10;第二個開頭必須是1101 11 提供20個有效位

上面的x代表了碼點填充的有效位置，可1可0。

我們的JavaScript/Java語言都是使用UTF-16作為字元編碼格式。

1 對中文字單獨分析

// Print the UTF-16 length and the code unit(s) of a character that fits in
// one UTF-16 unit ('中') and of one that needs two units ('𠮷').
let ch1 = "中";
let ch2 = "𠮷";

console.log(`中.length=${ch1.length},unicode=${ch1.charCodeAt(0)}`);
console.log(`𠮷.length=${ch2.length},unicode=${ch2.charCodeAt(0)},unicode2=${ch2.charCodeAt(1)}`);

java uint8array_uint8array和string的互轉

分析漢字

同樣是中文，由于'中'的unicode碼點小于0x10000，是以用一個UTF-16就可以表示，而'𠮷'的unicode大于0xFFFF，是以用了兩個UTF-16,我們看到他的length是2.

而我們的Uint8Array單個元素只能存最大0xFF，而中文字都大于0xFF，用上面網上搜的方法不足以存下完整資料，所以這就導致一次來回轉換資料丟失了。

一般情況下，我們想用Uint8Array儲存的是字元串的UTF-8編碼時候的序列。所以對任意字元串，我們首先要取到每個字的unicode碼點，然後再對照UTF-8的編碼方式填入Uint8Array。

取出來的時候當然也是先把UTF-8的編碼轉成unicode碼點，然後按照UTF-16的形式轉成字元串。

用圖表示就是：

string 轉 Uint8Array

char -> UTF-16 -> Unicode -> UTF-8 -> Uint8Array

Uint8Array 轉string

就是上面的序列反過來

由于我是在cocos creator環境下使用TypeScript，我編寫了如下的類：

/**
 * Result of combining two UTF-16 code units into one code point.
 * `ok` is true (and `unicode` is set) only when the two units form a
 * high/low surrogate pair.
 */
interface UnicodeOk {
    unicode?: number
    ok: boolean
}

/**
 * Result of decoding one character from the buffer.
 * `unicode` is the decoded value and `len` the number of bytes consumed.
 */
interface UnicodeLen {
    unicode?: number
    len: number
}

/**
 * Growable byte buffer with big-endian integer accessors and UTF-8 / UTF-16
 * string helpers.
 *
 * `push*` methods append to the end of `buffer`. `get*` methods read from an
 * internal read offset and advance it (except `getUint8List`, which does not
 * move the offset). Numeric getters return `null` when fewer bytes remain
 * than they need.
 */
export class BufferBigEndian {
    /** Backing storage; each element is one byte value. */
    buffer: number[];//uint8array

    private readOffset: number;

    constructor() {
        this.buffer = [];
        this.readOffset = 0;
    }

    /**
     * Replaces the buffer content with a copy of `array` and resets the read
     * offset to 0.
     *
     * @param array source bytes
     * @param len   number of leading elements to copy; when omitted or 0 the
     *              whole array is copied
     */
    initWithUint8Array(array: ArrayLike<number>, len?: number) {
        len = len || array.length;
        this.buffer = [];
        for (let i = 0; i < len && i < array.length; i++)
            this.buffer[i] = array[i];
        this.readOffset = 0;
    }

    /** Reads one byte, or returns `null` when the buffer is exhausted. */
    getUint8(): number | null {
        if (this.readOffset + 1 > this.buffer.length)
            return null;
        return this.buffer[this.readOffset++];
    }

    /**
     * Appends one byte.
     *
     * @throws Error when `value` is greater than 255
     */
    pushUint8(value: number): void {
        if (value > 255)
            throw Error("BufferBigEndian pushUint8 value need <= 255");
        this.buffer.push(value);
    }

    /** Reads a big-endian unsigned 16-bit integer, or `null` if < 2 bytes remain. */
    getUint16(): number | null {
        if (this.readOffset + 2 > this.buffer.length)
            return null;
        const high = this.getUint8();
        const low = this.getUint8();
        if (high == null || low == null)
            return null;
        return (high << 8) | low;
    }

    /** Appends the low 16 bits of `value` in big-endian order. */
    pushUint16(value: number): void {
        this.pushUint8((value >> 8) & 0xFF);
        this.pushUint8(value & 0xFF);
    }

    /** Reads a big-endian unsigned 32-bit integer, or `null` if < 4 bytes remain. */
    getUint32(): number | null {
        if (this.readOffset + 4 > this.buffer.length)
            return null;
        const high = this.getUint16();
        const low = this.getUint16();
        if (high == null || low == null)
            return null;
        // Multiply instead of shifting so the result stays non-negative.
        return high * 65536 + low;
    }

    /** Appends the low 32 bits of `value` in big-endian order. */
    pushUint32(value: number): void {
        this.pushUint16((value >> 16) & 0xFFFF);
        this.pushUint16(value & 0xFFFF);
    }

    /**
     * Reads a big-endian two's-complement signed 64-bit integer.
     *
     * The result is a JavaScript number, so magnitudes above 2^53 lose
     * precision.
     *
     * @returns the value, or `null` if fewer than 8 bytes remain
     */
    getInt64(): number | null {
        if (this.readOffset + 8 > this.buffer.length)
            return null;
        const hi = this.getUint32();
        const lo = this.getUint32();
        if (hi == null || lo == null)
            return null;
        if (hi >= 0x80000000) {
            // Sign bit set: magnitude is the bitwise complement plus one.
            // `>>> 0` turns the signed 32-bit result of `~` back into unsigned.
            const invHi = (~hi) >>> 0;
            const invLo = (~lo) >>> 0;
            return -(invHi * 4294967296 + invLo + 1);
        }
        return hi * 4294967296 + lo;
    }

    /**
     * Appends the UTF-8 encoding of one code point.
     *
     * Values up to 0x1FFFFF use the 1-4 byte forms; larger values fall back
     * to the historical 5- and 6-byte forms.
     */
    pushUnicodeWithUtf8(value: number): void {
        if (value <= 0x7F) {
            this.pushUint8(value);
        } else if (value <= 0x7FF) {
            // Two bytes carry 11 payload bits, i.e. everything up to 0x7FF.
            this.pushUint8((value >> 6) | 0xC0);
            this.pushUint8((value & 0x3F) | 0x80);
        } else if (value <= 0xFFFF) {
            this.pushUint8((value >> 12) | 0xE0);
            this.pushUint8(((value >> 6) & 0x3F) | 0x80);
            this.pushUint8((value & 0x3F) | 0x80);
        } else if (value <= 0x1FFFFF) {
            this.pushUint8((value >> 18) | 0xF0);
            this.pushUint8(((value >> 12) & 0x3F) | 0x80);
            this.pushUint8(((value >> 6) & 0x3F) | 0x80);
            this.pushUint8((value & 0x3F) | 0x80);
        } else if (value <= 0x3FFFFFF) {
            // The remaining two cases are rarely encountered; protobuf.js's
            // utf8 helper does not implement them.
            this.pushUint8((value >> 24) | 0xF8);
            this.pushUint8(((value >> 18) & 0x3F) | 0x80);
            this.pushUint8(((value >> 12) & 0x3F) | 0x80);
            this.pushUint8(((value >> 6) & 0x3F) | 0x80);
            this.pushUint8((value & 0x3F) | 0x80);
        } else {
            // Six-byte form: 31 payload bits.
            this.pushUint8((value >> 30) & 0x1 | 0xFC);
            this.pushUint8(((value >> 24) & 0x3F) | 0x80);
            this.pushUint8(((value >> 18) & 0x3F) | 0x80);
            this.pushUint8(((value >> 12) & 0x3F) | 0x80);
            this.pushUint8(((value >> 6) & 0x3F) | 0x80);
            this.pushUint8((value & 0x3F) | 0x80);
        }
    }

    /**
     * Decodes one UTF-8 sequence at the read offset.
     *
     * If the sequence is malformed (invalid lead byte, missing or invalid
     * continuation byte) only the first byte is consumed and returned as-is
     * with `len` 1.
     *
     * @returns the decoded value and byte count, or `null` at end of buffer
     */
    getUnicodeWithUtf8(): UnicodeLen | null {
        const start = this.getUint8();
        if (start == null)
            return null;

        // The number of leading 1 bits in the first byte is the sequence length.
        let n = 0;
        while (n < 8 && ((start >> (7 - n)) & 1) === 1)
            n++;

        if (n === 0)
            return {unicode: start, len: 1};

        // One leading 1 bit is a stray continuation byte; more than six is
        // 0xFE/0xFF. Neither can start a sequence.
        if (n === 1 || n > 6)
            return {unicode: start, len: 1};

        const afterStart = this.readOffset;
        let result = start & ((1 << (7 - n)) - 1);
        for (let i = 1; i < n; i++) {
            const follow = this.getUint8();
            if (follow != null && (follow & 0xC0) === 0x80) {
                result = (result << 6) | (follow & 0x3F);
            } else {
                // Not well-formed UTF-8: keep only the first byte and resume
                // reading immediately after it.
                this.readOffset = afterStart;
                return {unicode: start, len: 1};
            }
        }
        return {unicode: result, len: n};
    }

    /**
     * Combines a high surrogate `ch1` and a low surrogate `ch2` into one
     * code point. Returns `{ok: false}` when the units are not such a pair.
     */
    parseUnicodeFromUtf16(ch1: number, ch2: number): UnicodeOk {
        if ((ch1 & 0xFC00) === 0xD800 && (ch2 & 0xFC00) === 0xDC00) {
            return {unicode: (((ch1 & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000, ok: true}
        }
        return {ok: false}
    }

    /**
     * Appends `value` encoded as UTF-8.
     *
     * Surrogate pairs are merged into one code point first; an unpaired
     * surrogate is encoded as its own code unit value.
     *
     * @returns the number of bytes appended
     */
    pushStringWithUtf8(value: string): number {
        const oldlen = this.buffer.length;
        for (let i = 0; i < value.length; i++) {
            const ch1 = value.charCodeAt(i);
            // Past the end charCodeAt yields NaN, which is rejected as a low surrogate.
            const pair = this.parseUnicodeFromUtf16(ch1, value.charCodeAt(i + 1));
            if (pair.ok && pair.unicode !== undefined) {
                this.pushUnicodeWithUtf8(pair.unicode);
                i++; // the low surrogate has been consumed too
            } else {
                this.pushUnicodeWithUtf8(ch1);
            }
        }
        return this.buffer.length - oldlen;
    }

    /**
     * Reads `len` bytes of UTF-8 and returns them as a string.
     *
     * @param len byte length of the encoded string
     * @returns the decoded string, or "" when `len` < 1 or fewer than `len`
     *          bytes remain
     */
    getStringWithUtf8(len: number): string {
        if (len < 1)
            return "";
        if (this.readOffset + len > this.buffer.length)
            return "";

        let str = "";
        let read = 0;
        while (read < len) {
            const decoded = this.getUnicodeWithUtf8();
            if (!decoded) {
                break;
            }
            read += decoded.len;
            const codePoint = decoded.unicode;
            if (codePoint === undefined) {
                continue;
            }
            if (codePoint < 0x10000) {
                str += String.fromCharCode(codePoint);
            } else {
                // Split into a UTF-16 surrogate pair.
                const minus = codePoint - 0x10000;
                const ch1 = (minus >> 10) | 0xD800;
                const ch2 = (minus & 0x3FF) | 0xDC00;
                str += String.fromCharCode(ch1, ch2)
            }
        }
        return str;
    }

    /**
     * Appends every UTF-16 code unit of `value` as a big-endian 16-bit integer.
     *
     * @returns the number of bytes appended
     */
    pushStringWithUtf16(value: string): number {
        const oldlen = this.buffer.length;
        for (let i = 0; i < value.length; i++) {
            this.pushUint16(value.charCodeAt(i));
        }
        return this.buffer.length - oldlen;
    }

    /**
     * Reads `len` bytes as big-endian UTF-16 code units.
     *
     * @param len byte length; must be even
     * @returns the decoded string, or "" when `len` < 1, `len` is odd, or
     *          fewer than `len` bytes remain
     */
    getStringWithUtf16(len: number): string {
        if (len < 1)
            return "";
        if (this.readOffset + len > this.buffer.length || len % 2 != 0)
            return "";

        let str = "";
        // Each code unit occupies exactly two bytes, so read one unit per step.
        for (let i = 0; i < len; i += 2) {
            const unit = this.getUint16();
            if (unit == null)
                break;
            str += String.fromCharCode(unit);
        }
        return str;
    }

    /** Appends every element of `val` as one byte (see `pushUint8`). */
    pushUint8List(val: ArrayLike<number>) {
        for (let i = 0; i < val.length; i++)
            this.pushUint8(val[i]);
    }

    /**
     * Returns a copy of up to `len` bytes starting at the read offset.
     * The read offset is not advanced.
     *
     * @param len maximum byte count; when omitted or 0 the buffer length is used
     */
    getUint8List(len?: number): Uint8Array {
        len = len || this.buffer.length;
        return new Uint8Array(this.buffer.slice(this.readOffset, this.readOffset + len));
    }

    /** Returns the whole buffer as upper-case hex, two digits per byte. */
    tostring(): string {
        let result = "";
        for (let i = 0; i < this.buffer.length; i++) {
            const ch = this.buffer[i].toString(16);
            result += ch.length == 1 ? "0" + ch.toUpperCase() : ch.toUpperCase();
        }
        return result;
    }

    /** Returns a copy of the whole buffer as a Uint8Array. */
    toUint8Array(): Uint8Array {
        return new Uint8Array(this.buffer);
    }

    /** Moves the read offset by `len` bytes, clamped to [0, buffer.length]. */
    changeReadOffset(len: number) {
        this.readOffset = Math.max(0, Math.min(this.buffer.length, this.readOffset + len))
    }
}

測試代碼

// Round-trip check: encode a mixed string as UTF-8 into a BufferBigEndian,
// print the bytes as hex next to encodeURI's output for comparison, then
// decode the same number of bytes and print the strings before and after.
let str0 = "𠮷";
let str1 = "中"; // other characters to try: 𝌆 𠮷
let str2 = "a";
let strAll = `${str0}${str1}${str2}`;

let buf = new BufferBigEndian();
let len = buf.pushStringWithUtf8(strAll);

console.log(`buffer HEX=${buf.tostring()}`, `encodeURI=${encodeURI(strAll)}`);
console.log(`轉換前:${strAll},轉換後:${buf.getStringWithUtf8(len)}`);

java uint8array_uint8array和string的互轉

列印結果

這就是我們要的結果了。