天天看点

使用Java将中文字符转换成Unicode编码

这两天操作XML时用到了JDOM,在创建XML文件并输出到硬盘的时候遇到一个中文编码的问题:JDOM默认输出的XML编码是UTF-8,但是文档中如果出现中文字符,该中文字符就会变成乱码,造成XML文件无法被正确解析。

UTF-8应该是可以用来表示中文的吧?我不知道这是不是JDOM的一个bug(JDOM 1.0,beta了10次的产物哦!)。我Google了一下,大家解决这个问题的办法无非是把JDOM的输出字符集改为GBK或者GB2312,但是这样就会有一些副作用:在没有特定字符集(GBK或者GB2312)的操作系统上不是依然不能正确解析吗?一个比较好的解决办法是先把中文转换成Unicode编码再直接输出,程序解析XML之后再把Unicode编码转回中文就没有问题了。

于是我查看了JDK的文档,截至Java 5好像都没有可以直接使用的、做类似转换的类,但是我发现一个类 java.util.Properties,它的源代码里有两个私有(private)方法 loadConvert(char[] in, int off, int len, char[] convtBuf) 和 saveConvert(String theString, boolean escapeSpace),其实就是做特殊字符和Unicode编码字符间转换的,我把它们提取出来,单独包装到一个类里就可以使用了。

下面是我包装的类 charactersettoolkit

/*

* charactersettoolkit.java

*

* created on 2006年10月27日, 下午2:06

* to change this template, choose tools | template manager

* and open the template in the editor.

*/

package mobi.chenwei.lang;

/**
 * Utility class for character operations: converts text to an ASCII-only
 * escaped form and back again.
 *
 * <p>Characters outside the printable ASCII range are written as a backslash,
 * the letter 'u' and four hexadecimal digits. Tab, line feed, carriage return
 * and form feed are written as a backslash followed by 't', 'n', 'r' or 'f'.
 * The characters '=', ':', '#', '!' and the backslash itself are written with
 * a leading backslash.
 *
 * @author chen wei
 * @email [email protected]
 */
public class charactersettoolkit {

    /** Hexadecimal digits used when writing a Unicode escape. */
    private static final char[] hexdigit = {
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
    };

    /** Creates a new instance of charactersettoolkit. */
    public charactersettoolkit() {
    }

    /**
     * Maps the low four bits of {@code nibble} to its hexadecimal digit.
     *
     * @param nibble value whose low four bits are converted
     * @return the matching character from {@code hexdigit}
     */
    private static char tohex(int nibble) {
        return hexdigit[(nibble & 0xf)];
    }

    /**
     * Returns the numeric value of a hexadecimal digit, accepting both
     * lowercase and uppercase letters.
     *
     * @param c the character to interpret
     * @return a value from 0 to 15, or -1 when {@code c} is not a hex digit
     */
    private static int hexvalue(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return 10 + c - 'a';
        }
        if (c >= 'A' && c <= 'F') {
            return 10 + c - 'A';
        }
        return -1;
    }

    /**
     * Encodes a string into its ASCII-only escaped form.
     *
     * @param thestring the string to encode
     * @param escapespace when {@code true} every space is preceded by a
     *        backslash; when {@code false} only a space at index 0 is
     * @return the escaped string
     * @throws NullPointerException if {@code thestring} is {@code null}
     */
    public static String tounicode(String thestring, boolean escapespace) {
        int len = thestring.length();
        // Doubling is only a sizing hint; skip it when it would overflow int.
        int buflen = (len <= Integer.MAX_VALUE / 2) ? len * 2 : len;
        StringBuilder outbuffer = new StringBuilder(buflen);

        for (int x = 0; x < len; x++) {
            char achar = thestring.charAt(x);
            // Handle the common case first: the largest block of characters
            // that contains none of the specials below except the backslash.
            if ((achar > 61) && (achar < 127)) {
                if (achar == '\\') {
                    outbuffer.append('\\');
                }
                outbuffer.append(achar);
                continue;
            }
            switch (achar) {
                case ' ':
                    if (x == 0 || escapespace) {
                        outbuffer.append('\\');
                    }
                    outbuffer.append(' ');
                    break;
                case '\t':
                    outbuffer.append('\\').append('t');
                    break;
                case '\n':
                    outbuffer.append('\\').append('n');
                    break;
                case '\r':
                    outbuffer.append('\\').append('r');
                    break;
                case '\f':
                    outbuffer.append('\\').append('f');
                    break;
                case '=': // fall through
                case ':': // fall through
                case '#': // fall through
                case '!':
                    outbuffer.append('\\').append(achar);
                    break;
                default:
                    if ((achar < 0x0020) || (achar > 0x007e)) {
                        // The backslash is required so that fromunicode can
                        // recognise the sequence as an escape.
                        outbuffer.append('\\');
                        outbuffer.append('u');
                        outbuffer.append(tohex((achar >> 12) & 0xf));
                        outbuffer.append(tohex((achar >> 8) & 0xf));
                        outbuffer.append(tohex((achar >> 4) & 0xf));
                        outbuffer.append(tohex(achar & 0xf));
                    } else {
                        outbuffer.append(achar);
                    }
                    break;
            }
        }
        return outbuffer.toString();
    }

    /**
     * Decodes escaped text back into the original characters.
     *
     * @param in the character array holding the escaped text
     * @param off index of the first character to decode
     * @param len number of characters to decode
     * @param convtbuf scratch buffer that receives the decoded characters; a
     *        new buffer is allocated when this one is {@code null} or shorter
     *        than {@code len}
     * @return the decoded string
     * @throws IllegalArgumentException if a Unicode escape is incomplete or
     *         contains a non-hexadecimal digit, or if the range ends with a
     *         lone backslash
     */
    public String fromunicode(char[] in, int off, int len, char[] convtbuf) {
        // Every escape shrinks or keeps its length when decoded, so a buffer
        // of len characters is always large enough.
        if (convtbuf == null || convtbuf.length < len) {
            convtbuf = new char[Math.max(len, 0)];
        }
        char[] out = convtbuf;
        int outlen = 0;
        int end = off + len;

        while (off < end) {
            char achar = in[off++];
            if (achar != '\\') {
                out[outlen++] = achar;
                continue;
            }
            if (off >= end) {
                throw new IllegalArgumentException(
                        "dangling backslash at end of input.");
            }
            achar = in[off++];
            if (achar == 'u') {
                // Read the four hexadecimal digits, staying inside the range.
                if (end - off < 4) {
                    throw new IllegalArgumentException(
                            "malformed \\uxxxx encoding.");
                }
                int value = 0;
                for (int i = 0; i < 4; i++) {
                    int digit = hexvalue(in[off++]);
                    if (digit < 0) {
                        throw new IllegalArgumentException(
                                "malformed \\uxxxx encoding.");
                    }
                    value = (value << 4) + digit;
                }
                out[outlen++] = (char) value;
            } else {
                if (achar == 't') {
                    achar = '\t';
                } else if (achar == 'r') {
                    achar = '\r';
                } else if (achar == 'n') {
                    achar = '\n';
                } else if (achar == 'f') {
                    achar = '\f';
                }
                // Any other escaped character stands for itself.
                out[outlen++] = achar;
            }
        }
        return new String(out, 0, outlen);
    }
}