中文字符串在 C/C++ 中表示为字节序列,分词时需要根据不同的编码方式进行处理。一般分词器需要先转换成统一的编码方式再进行分词;有些分词器(如 ICTCLAS)在分词时可以不显式指定编码方式,而是先检测字符串的编码方式再进行转换。本文对项目中用到的几种编码转换方式进行总结,主要利用 iconv 进行编码转换。
/* One-element array holding a single null bchar_t, i.e. an empty,
 * terminator-only string. bchar_t is declared outside this section. */
const bchar_t zero[1] = {L'\0'};
/**
 * Convert a GBK byte string to UTF-16LE using iconv.
 *
 * @param inbuf   GBK-encoded input bytes.
 * @param inlen   Number of bytes to read from inbuf.
 * @param outbuf  Destination buffer. It is zero-filled for inlen * 4 bytes
 *                and the conversion may write up to that many bytes, so the
 *                caller must provide at least inlen * 4 bytes.
 *
 * The "//IGNORE" suffix asks iconv to discard characters it cannot convert.
 * If the converter cannot be opened, outbuf is left zero-filled and the
 * function returns without converting anything.
 */
void gbk_utf16le(char* inbuf, size_t inlen, char* outbuf)
{
    size_t outlen = inlen * 4;
    bzero(outbuf, outlen);

    iconv_t cd = iconv_open("UTF-16LE//IGNORE", "GBK");
    if (cd == (iconv_t)-1) {
        /* Never hand an invalid descriptor to iconv()/iconv_close(). */
        return;
    }

    char *in = inbuf;
    char *out = outbuf;
    /* iconv advances in/out and decrements inlen/outlen as it converts. */
    iconv(cd, &in, &inlen, &out, &outlen);

    /* Terminate only if a whole bchar_t still fits in the remaining space. */
    if (outlen >= sizeof(bchar_t)) {
        *((bchar_t *)out) = L'\0';
    }
    iconv_close(cd);
}
/**
 * Convert a UTF-16LE byte string to GBK using iconv.
 *
 * @param inbuf   UTF-16LE input bytes.
 * @param inlen   Number of bytes (not code units) to read from inbuf.
 * @param outbuf  Destination buffer. It is zero-filled for inlen * 4 bytes
 *                and the conversion may write up to that many bytes, so the
 *                caller must provide at least inlen * 4 bytes.
 *
 * The "//IGNORE" suffix asks iconv to discard characters that cannot be
 * represented in GBK. If the converter cannot be opened, outbuf is left
 * zero-filled (an empty string).
 */
void utf16le_gbk(char* inbuf, size_t inlen, char* outbuf)
{
    size_t outlen = inlen * 4;
    bzero(outbuf, outlen);

    iconv_t cd = iconv_open("GBK//IGNORE", "UTF-16LE");
    if (cd == (iconv_t)-1) {
        /* Never hand an invalid descriptor to iconv()/iconv_close(). */
        return;
    }

    char *in = inbuf;
    char *out = outbuf;
    /* iconv advances in/out and decrements inlen/outlen as it converts. */
    iconv(cd, &in, &inlen, &out, &outlen);

    /* NUL-terminate when at least one byte of output space remains. */
    if (outlen >= sizeof(char)) {
        *out = '\0';
    }
    iconv_close(cd);
}
/**
 * Convert a UTF-16LE byte string to UTF-8 using iconv.
 *
 * @param inbuf   UTF-16LE input bytes.
 * @param inlen   Number of bytes (not code units) to read from inbuf.
 * @param outbuf  Destination buffer. It is zero-filled for inlen * 4 bytes
 *                and the conversion may write up to that many bytes, so the
 *                caller must provide at least inlen * 4 bytes.
 *
 * The "//IGNORE" suffix asks iconv to discard characters it cannot convert.
 * If the converter cannot be opened, outbuf is left zero-filled (an empty
 * string).
 */
void utf16le_utf8(char* inbuf, size_t inlen, char* outbuf)
{
    size_t outlen = inlen * 4;
    bzero(outbuf, outlen);

    iconv_t cd = iconv_open("UTF-8//IGNORE", "UTF-16LE");
    if (cd == (iconv_t)-1) {
        /* Never hand an invalid descriptor to iconv()/iconv_close(). */
        return;
    }

    char *in = inbuf;
    char *out = outbuf;
    /* iconv advances in/out and decrements inlen/outlen as it converts. */
    iconv(cd, &in, &inlen, &out, &outlen);

    /* NUL-terminate when at least one byte of output space remains. */
    if (outlen >= sizeof(char)) {
        *out = '\0';
    }
    iconv_close(cd);
}
/**
 * Convert a GBK byte string to UTF-16LE using iconv and report how many
 * output bytes were produced.
 *
 * @param inbuf      GBK-encoded input bytes.
 * @param inlen      Number of bytes to read from inbuf.
 * @param outbuf     Destination buffer. It is zero-filled for inlen * 4
 *                   bytes and the conversion may write up to that many
 *                   bytes, so the caller must provide at least inlen * 4.
 * @param outbuflen  Set to the number of bytes written to outbuf (the
 *                   terminator is not counted); 0 if the converter could
 *                   not be opened.
 *
 * The "//IGNORE" suffix asks iconv to discard characters it cannot convert.
 */
void gbk_utf16le(char* inbuf, size_t inlen, char* outbuf,uint32_t& outbuflen)
{
    const size_t capacity = inlen * 4;
    size_t outlen = capacity;
    bzero(outbuf, capacity);
    outbuflen = 0;

    iconv_t cd = iconv_open("UTF-16LE//IGNORE", "GBK");
    if (cd == (iconv_t)-1) {
        /* Never hand an invalid descriptor to iconv()/iconv_close(). */
        return;
    }

    char *in = inbuf;
    char *out = outbuf;
    /* iconv advances in/out and decrements inlen/outlen as it converts. */
    iconv(cd, &in, &inlen, &out, &outlen);

    /* outlen now holds the unused space, so the difference is bytes written. */
    outbuflen = (uint32_t)(capacity - outlen);

    /* Terminate only if a whole bchar_t still fits in the remaining space. */
    if (outlen >= sizeof(bchar_t)) {
        *((bchar_t *)out) = L'\0';
    }
    iconv_close(cd);
}
/**
 * Convert a UTF-8 byte string to GBK using iconv.
 *
 * @param inbuf   UTF-8 input bytes.
 * @param inlen   Number of bytes to read from inbuf.
 * @param outbuf  Destination buffer. It is zero-filled for inlen * 4 bytes
 *                and the conversion may write up to that many bytes, so the
 *                caller must provide at least inlen * 4 bytes.
 *
 * The "//IGNORE" suffix asks iconv to discard characters that cannot be
 * represented in GBK. If the converter cannot be opened, outbuf is left
 * zero-filled (an empty string).
 */
void utf8_gbk(char* inbuf, size_t inlen, char* outbuf)
{
    size_t outlen = inlen * 4;
    bzero(outbuf, outlen);

    iconv_t cd = iconv_open("GBK//IGNORE", "UTF-8");
    if (cd == (iconv_t)-1) {
        /* Never hand an invalid descriptor to iconv()/iconv_close(). */
        return;
    }

    char *in = inbuf;
    char *out = outbuf;
    /* iconv advances in/out and decrements inlen/outlen as it converts. */
    iconv(cd, &in, &inlen, &out, &outlen);

    /* NUL-terminate when at least one byte of output space remains. */
    if (outlen >= sizeof(char)) {
        *out = '\0';
    }
    iconv_close(cd);
}
/**
 * Pack consecutive byte pairs into 16-bit values, low byte first: each
 * output element is (inbuf[k+1] << 8) | inbuf[k], with both bytes treated
 * as unsigned.
 *
 * @param inbuf   Input bytes.
 * @param inlen   Number of bytes available in inbuf; only read here. A
 *                trailing odd byte is ignored.
 * @param outbuf  Destination array; must have room for inlen / 2 values plus
 *                one terminating 0 element.
 * @param outlen  Set to the number of 16-bit values written (the terminator
 *                is not counted).
 */
void ch_uint16(char* inbuf, int& inlen, uint16_t* outbuf, int& outlen)
{
    outlen = 0;
    for (int k = 0; k + 2 <= inlen; k += 2) {
        /* Go through unsigned char so bytes >= 0x80 are not sign-extended. */
        const unsigned lo = (unsigned char)inbuf[k];
        const unsigned hi = (unsigned char)inbuf[k + 1];
        outbuf[outlen] = (uint16_t)((hi << 8) | lo);
        outlen++;
    }
    outbuf[outlen] = 0;
}