天天看點

Android--判斷文本檔案編碼

方法1:利用windows文本文件编码特点。

windows下,Unicode、Unicode big endian和UTF-8编码的txt文件的开头会多出几个字节,分别是FF、FE(Unicode),FE、FF(Unicode big endian),EF、BB、BF(UTF-8)。

public static String getCharset(File file) {
        String charset = "GBK";
        byte[] first3Bytes = new byte[3];
        try {
            boolean checked = false;
            BufferedInputStream bis = new BufferedInputStream(
                  new FileInputStream(file));
            bis.mark(0);
            int read = bis.read(first3Bytes, 0, 3);
            if (read == -1)
                return charset;
            if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
                charset = "UTF-16LE";
                checked = true;
            } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1]
                == (byte) 0xFF) {
                charset = "UTF-16BE";
                checked = true;
            } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1]
                    == (byte) 0xBB
                    && first3Bytes[2] == (byte) 0xBF) {
                charset = "UTF-8";
                checked = true;
            }
            bis.reset();
            if (!checked) {
                int loc = 0;
                while ((read = bis.read()) != -1) {
                    loc++;
                    if (read >= 0xF0)
                        break;
                    //鍗曠嫭鍑虹幇BF浠ヤ笅鐨勶紝涔熺畻鏄疓BK
                    if (0x80 <= read && read <= 0xBF)
                        break;
                    if (0xC0 <= read && read <= 0xDF) {
                        read = bis.read();
                        if (0x80 <= read && read <= 0xBF)// 鍙屽瓧鑺?(0xC0 - 0xDF)
                            // (0x80 -
                            // 0xBF),涔熷彲鑳藉湪GB缂栫爜鍐?                            continue;
                        else
                            break;
                     // 涔熸湁鍙兘鍑洪敊锛屼絾鏄嚑鐜囪緝灏?                    } else if (0xE0 <= read && read <= 0xEF) {
                        read = bis.read();
                        if (0x80 <= read && read <= 0xBF) {
                            read = bis.read();
                            if (0x80 <= read && read <= 0xBF) {
                                charset = "UTF-8";
                                break;
                            } else
                                break;
                        } else
                            break;
                    }
                }
                System.out.println(loc + " " + Integer.toHexString(read));
            }
            bis.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return charset;
    }      

缺点:不能这样去探测linux下的文件。

方法2:开源工程JCharDet

http://www.iteye.com/topic/266501

package org.mozilla.intl.chardet;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

/**
 * Detects the character set of a file with the help of JCharDet.
 *
 * <p>JCharDet is the Java port of Mozilla's automatic charset detection algorithm; its
 * official home page is http://jchardet.sourceforge.net/
 *
 * @author icer
 * @date  2008/11/13
 */
public class FileCharsetDetector {

  /** Set to true when the detector's observer reports a charset, or the data is pure ASCII. */
  private boolean found = false;

  /**
   * Name of the charset reported for the file most recently examined. Stays {@code null}
   * (the default) when no detection algorithm matched, e.g. for a binary file.
   */
  private String encoding = null;

  /**
   * Command-line entry point: prints the detected encoding of the given file.
   *
   * @param argv {@code <path> [<languageHint>]}
   * @throws Exception if the file cannot be read or the language hint is not a number
   */
  public static void main(String[] argv) throws Exception {
    if (argv.length != 1 && argv.length != 2) {
      System.out
          .println("Usage: FileCharsetDetector <path> [<languageHint>]");

      System.out.println("");
      System.out.println("Where <path> is d:/demo.txt");
      System.out.println("For optional <languageHint>. Use following...");
      System.out.println("    1 => Japanese");
      System.out.println("    2 => Chinese");
      System.out.println("    3 => Simplified Chinese");
      System.out.println("    4 => Traditional Chinese");
      System.out.println("    5 => Korean");
      System.out.println("    6 => Dont know (default)");
      return;
    }

    FileCharsetDetector detector = new FileCharsetDetector();
    String encoding;
    if (argv.length == 2) {
      encoding = detector.guestFileEncoding(argv[0], Integer.parseInt(argv[1]));
    } else {
      encoding = detector.guestFileEncoding(argv[0]);
    }
    // Label meaning "file encoding:", written as Unicode escapes so the text survives
    // a source file being saved or read with the wrong charset.
    System.out.println("\u6587\u4EF6\u7F16\u7801:" + encoding);
  }

  /**
   * Detects the encoding of the given file without a language hint.
   *
   * @param file
   *            the file to examine
   * @return the file's encoding, or {@code null} if none could be determined
   * @throws FileNotFoundException
   * @throws IOException
   */
  public String guestFileEncoding(File file) throws FileNotFoundException,
      IOException {
    return detectEncoding(file, new nsDetector());
  }

  /**
   * Detects the encoding of the given file.
   *
   * @param file
   *            the file to examine
   * @param languageHint
   *            language hint code, e.g. 1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;
   *            4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)
   * @return the file's encoding, e.g. UTF-8, GBK, GB2312, or {@code null} if none could be
   *         determined
   * @throws FileNotFoundException
   * @throws IOException
   */
  public String guestFileEncoding(File file, int languageHint)
      throws FileNotFoundException, IOException {
    return detectEncoding(file, new nsDetector(languageHint));
  }

  /**
   * Detects the encoding of the file at the given path without a language hint.
   *
   * @param path
   *            path of the file
   * @return the file's encoding, e.g. UTF-8, GBK, GB2312, or {@code null} if none could be
   *         determined
   * @throws FileNotFoundException
   * @throws IOException
   */
  public String guestFileEncoding(String path) throws FileNotFoundException,
      IOException {
    return guestFileEncoding(new File(path));
  }

  /**
   * Detects the encoding of the file at the given path.
   *
   * @param path
   *            path of the file
   * @param languageHint
   *            language hint code, e.g. 1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;
   *            4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)
   * @return the file's encoding, or {@code null} if none could be determined
   * @throws FileNotFoundException
   * @throws IOException
   */
  public String guestFileEncoding(String path, int languageHint)
      throws FileNotFoundException, IOException {
    return guestFileEncoding(new File(path), languageHint);
  }

  /**
   * Feeds the file's content to the detector and returns the resulting charset name.
   *
   * @param file the file to examine
   * @param det the detector to use
   * @return the reported charset, "ASCII" when every chunk passed the detector's ASCII check,
   *         the first probable charset when nothing was reported, or {@code null}
   * @throws FileNotFoundException
   * @throws IOException
   */
  private String detectEncoding(File file, nsDetector det)
      throws FileNotFoundException, IOException {
    // Clear the outcome of any previous call so a reused instance cannot return stale data.
    found = false;
    encoding = null;

    // Set an observer...
    // The Notify() will be called when a matching charset is found.
    det.Init(new nsICharsetDetectionObserver() {
      public void Notify(String charset) {
        found = true;
        encoding = charset;
      }
    });

    boolean isAscii = true;
    BufferedInputStream in = new BufferedInputStream(new FileInputStream(file));
    try {
      byte[] buf = new byte[1024];
      int len;
      boolean done = false;

      while ((len = in.read(buf, 0, buf.length)) != -1) {
        // Check if the stream is only ascii.
        if (isAscii) {
          isAscii = det.isAscii(buf, len);
        }

        // DoIt if non-ascii and not done yet.
        if (!isAscii && !done) {
          done = det.DoIt(buf, len, false);
        }
      }
      det.DataEnd();
    } finally {
      // The original never closed the stream; release it on every path.
      in.close();
    }

    if (isAscii) {
      encoding = "ASCII";
      found = true;
    }

    if (!found) {
      String[] probable = det.getProbableCharsets();
      if (probable.length > 0) {
        // Nothing was reported, so fall back to the first probable charset.
        encoding = probable[0];
      } else {
        return null;
      }
    }
    return encoding;
  }
}

方法3:开源工程juniversalchardet

http://code.google.com/p/juniversalchardet/

/**
 * Detects the encoding of a file with juniversalchardet's {@code UniversalDetector}.
 *
 * <p>The outcome is also printed to standard output. Failures are reported with a stack
 * trace and result in {@code null} rather than an exception.
 *
 * @param file the file to examine
 * @return the detected charset name, or {@code null} if the file does not exist, no charset
 *         was detected, or an error occurred
 */
public static String getFileIncode(File file) {

    if (!file.exists()) {
      System.err.println("getFileIncode: file not exists!");
      return null;
    }

    byte[] buf = new byte[4096];
    FileInputStream fis = null;
    try {
      fis = new FileInputStream(file);
      // (1) Create the detector.
      UniversalDetector detector = new UniversalDetector(null);

      // (2) Feed it data until the file ends or the detector reports it is done.
      int nread;
      while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
        detector.handleData(buf, 0, nread);
      }
      // (3) Signal the end of the data.
      detector.dataEnd();

      // (4) Read the result.
      String encoding = detector.getDetectedCharset();
      if (encoding != null) {
        System.out.println("Detected encoding = " + encoding);
      } else {
        System.out.println("No encoding detected.");
      }

      // (5) Reset the detector.
      detector.reset();
      return encoding;
    } catch (Exception e) {
      // Deliberately best-effort: report the failure and fall through to return null.
      e.printStackTrace();
    } finally {
      // Close on every path; the original leaked the stream when an exception was thrown.
      if (fis != null) {
        try {
          fis.close();
        } catch (IOException ignored) {
          // Nothing useful can be done if closing fails.
        }
      }
    }

    return null;
  }

引入包的方法:

将包放入libs文件夹,

选中包,右键 --> build path--> add to build path。