天天看點

中文字元串的編碼轉換(c實作)

    中文字元串在c/c++中表示為位元組序列,在分詞的時候需要根據不同的編碼方式進行分詞。一般分詞器需要先轉換成統一的編碼方式再進行分詞;有些分詞器如ICTCLAS在分詞的時候可以不顯式定義編碼方式,而是先檢測字元串的編碼方式再進行轉換。本文就項目中用到的幾種編碼轉換方式進行總結,主要利用了iconv進行編碼轉換。

/* One-element array holding a single zero-valued bchar_t, i.e. an empty
 * bchar_t string. bchar_t is declared outside this excerpt — presumably a
 * 16-bit code unit used for UTF-16 text; TODO confirm. */
const bchar_t zero[1] = {L'\0'};

/**
 * Convert a GBK byte string to UTF-16LE using iconv.
 *
 * @param inbuf   GBK input bytes.
 * @param inlen   Number of bytes to read from inbuf.
 * @param outbuf  Destination buffer. The caller must provide at least
 *                inlen * 4 bytes; the whole region is zeroed first.
 *
 * The converter is opened with the "//IGNORE" suffix on the target
 * encoding. The function has no error channel: if either buffer is null
 * nothing is touched, and if the converter cannot be opened outbuf is left
 * zero-filled (an empty string). A conversion that stops early keeps the
 * output produced so far.
 */
void gbk_utf16le(char* inbuf, size_t inlen, char* outbuf)
{
    if (!inbuf || !outbuf) {
        return;
    }

    size_t outlen = inlen * 4;
    memset(outbuf, 0, outlen);

    iconv_t cd = iconv_open("UTF-16LE//IGNORE", "GBK");
    if (cd == (iconv_t)-1) {
        /* Encoding pair unavailable: never hand an invalid descriptor to
         * iconv()/iconv_close(). */
        return;
    }

    char *in = inbuf;
    char *out = outbuf;

    /* iconv advances in/out and decrements inlen/outlen as it goes.
     * Its return value is deliberately not propagated (void interface). */
    (void)iconv(cd, &in, &inlen, &out, &outlen);

    /* Terminate with one zero bchar_t when room remains. Written byte-wise
     * so no store goes through a cast char pointer. */
    if (outlen >= sizeof(bchar_t)) {
        memset(out, 0, sizeof(bchar_t));
    }

    iconv_close(cd);
}

/**
 * Convert a UTF-16LE byte string to GBK using iconv.
 *
 * @param inbuf   UTF-16LE input bytes.
 * @param inlen   Number of bytes (not code units) to read from inbuf.
 * @param outbuf  Destination buffer. The caller must provide at least
 *                inlen * 4 bytes; the whole region is zeroed first.
 *
 * The converter is opened with the "//IGNORE" suffix on the target
 * encoding. The function has no error channel: if either buffer is null
 * nothing is touched, and if the converter cannot be opened outbuf is left
 * zero-filled (an empty string). A conversion that stops early keeps the
 * output produced so far.
 */
void utf16le_gbk(char* inbuf, size_t inlen, char* outbuf)
{
    if (!inbuf || !outbuf) {
        return;
    }

    size_t outlen = inlen * 4;
    memset(outbuf, 0, outlen);

    iconv_t cd = iconv_open("GBK//IGNORE", "UTF-16LE");
    if (cd == (iconv_t)-1) {
        /* Encoding pair unavailable: never hand an invalid descriptor to
         * iconv()/iconv_close(). */
        return;
    }

    char *in = inbuf;
    char *out = outbuf;

    /* iconv advances in/out and decrements inlen/outlen as it goes.
     * Its return value is deliberately not propagated (void interface). */
    (void)iconv(cd, &in, &inlen, &out, &outlen);

    /* NUL-terminate when at least one byte of the buffer is left. */
    if (outlen >= sizeof(char)) {
        *out = '\0';
    }

    iconv_close(cd);
}

/**
 * Convert a UTF-16LE byte string to UTF-8 using iconv.
 *
 * @param inbuf   UTF-16LE input bytes.
 * @param inlen   Number of bytes (not code units) to read from inbuf.
 * @param outbuf  Destination buffer. The caller must provide at least
 *                inlen * 4 bytes; the whole region is zeroed first.
 *
 * The converter is opened with the "//IGNORE" suffix on the target
 * encoding. The function has no error channel: if either buffer is null
 * nothing is touched, and if the converter cannot be opened outbuf is left
 * zero-filled (an empty string). A conversion that stops early keeps the
 * output produced so far.
 */
void utf16le_utf8(char* inbuf, size_t inlen, char* outbuf)
{
    if (!inbuf || !outbuf) {
        return;
    }

    size_t outlen = inlen * 4;
    memset(outbuf, 0, outlen);

    iconv_t cd = iconv_open("UTF-8//IGNORE", "UTF-16LE");
    if (cd == (iconv_t)-1) {
        /* Encoding pair unavailable: never hand an invalid descriptor to
         * iconv()/iconv_close(). */
        return;
    }

    char *in = inbuf;
    char *out = outbuf;

    /* iconv advances in/out and decrements inlen/outlen as it goes.
     * Its return value is deliberately not propagated (void interface). */
    (void)iconv(cd, &in, &inlen, &out, &outlen);

    /* NUL-terminate when at least one byte of the buffer is left. */
    if (outlen >= sizeof(char)) {
        *out = '\0';
    }

    iconv_close(cd);
}

/**
 * Convert a GBK byte string to UTF-16LE using iconv and report how many
 * output bytes were produced.
 *
 * @param inbuf      GBK input bytes.
 * @param inlen      Number of bytes to read from inbuf.
 * @param outbuf     Destination buffer. The caller must provide at least
 *                   inlen * 4 bytes; the whole region is zeroed first.
 * @param outbuflen  Set to the number of bytes written to outbuf, not
 *                   counting the terminator. Set to 0 when a buffer is
 *                   null or the converter cannot be opened.
 *
 * The converter is opened with the "//IGNORE" suffix on the target
 * encoding. A conversion that stops early keeps, and counts, the output
 * produced so far.
 */
void gbk_utf16le(char* inbuf, size_t inlen, char* outbuf,uint32_t& outbuflen)
{
    outbuflen = 0;
    if (!inbuf || !outbuf) {
        return;
    }

    const size_t capacity = inlen * 4;
    size_t outlen = capacity;
    memset(outbuf, 0, capacity);

    iconv_t cd = iconv_open("UTF-16LE//IGNORE", "GBK");
    if (cd == (iconv_t)-1) {
        /* Encoding pair unavailable: never hand an invalid descriptor to
         * iconv()/iconv_close(); outbuflen stays 0. */
        return;
    }

    char *in = inbuf;
    char *out = outbuf;

    /* iconv advances in/out and decrements inlen/outlen as it goes.
     * Its return value is deliberately not propagated (void interface). */
    (void)iconv(cd, &in, &inlen, &out, &outlen);

    /* outlen now holds the unused bytes of outbuf, so the difference is
     * the number of bytes actually written. */
    outbuflen = (uint32_t)(capacity - outlen);

    /* Terminate with one zero bchar_t when room remains. Written byte-wise
     * so no store goes through a cast char pointer. */
    if (outlen >= sizeof(bchar_t)) {
        memset(out, 0, sizeof(bchar_t));
    }

    iconv_close(cd);
}

/**
 * Convert a UTF-8 byte string to GBK using iconv.
 *
 * @param inbuf   UTF-8 input bytes.
 * @param inlen   Number of bytes to read from inbuf.
 * @param outbuf  Destination buffer. The caller must provide at least
 *                inlen * 4 bytes; the whole region is zeroed first.
 *
 * The converter is opened with the "//IGNORE" suffix on the target
 * encoding. The function has no error channel: if either buffer is null
 * nothing is touched, and if the converter cannot be opened outbuf is left
 * zero-filled (an empty string). A conversion that stops early keeps the
 * output produced so far.
 */
void utf8_gbk(char* inbuf, size_t inlen, char* outbuf)
{
    if (!inbuf || !outbuf) {
        return;
    }

    size_t outlen = inlen * 4;
    memset(outbuf, 0, outlen);

    iconv_t cd = iconv_open("GBK//IGNORE", "UTF-8");
    if (cd == (iconv_t)-1) {
        /* Encoding pair unavailable: never hand an invalid descriptor to
         * iconv()/iconv_close(). */
        return;
    }

    char *in = inbuf;
    char *out = outbuf;

    /* iconv advances in/out and decrements inlen/outlen as it goes.
     * Its return value is deliberately not propagated (void interface). */
    (void)iconv(cd, &in, &inlen, &out, &outlen);

    /* NUL-terminate when at least one byte of the buffer is left. */
    if (outlen >= sizeof(char)) {
        *out = '\0';
    }

    iconv_close(cd);
}

/**
 * Pack a byte buffer into 16-bit values, two bytes per value.
 *
 * Each pair (inbuf[k], inbuf[k+1]) becomes one uint16_t whose low byte is
 * inbuf[k] and whose high byte is inbuf[k+1], i.e. the bytes are read as
 * little-endian 16-bit units. A trailing odd byte is ignored.
 *
 * @param inbuf   Input bytes; a null pointer is treated as empty input.
 * @param inlen   Number of bytes available in inbuf (read only).
 * @param outbuf  Destination; must hold at least inlen / 2 + 1 elements
 *                because a zero terminator is appended. If null, only
 *                outlen is reset.
 * @param outlen  Set to the number of 16-bit values written, excluding the
 *                terminator.
 */
void ch_uint16(char* inbuf, int& inlen, uint16_t* outbuf, int& outlen)
{
    outlen = 0;
    if (!outbuf) {
        return;
    }

    if (inbuf) {
        /* "inlen - k >= 2" is the overflow-safe form of "k + 2 <= inlen". */
        for (int k = 0; inlen - k >= 2; k += 2) {
            /* Go through unsigned char so bytes >= 0x80 are not
             * sign-extended before being combined. */
            const unsigned lo = (unsigned char)inbuf[k];
            const unsigned hi = (unsigned char)inbuf[k + 1];
            outbuf[outlen] = (uint16_t)((hi << 8) | lo);
            outlen++;
        }
    }

    outbuf[outlen] = 0;
}

繼續閱讀