方法1：利用windows文本文件编码特点。
windows下，Unicode、Unicode big endian和UTF-8编码的txt文件的开头会多出几个字节，分别是FF、FE（Unicode）,FE、FF（Unicode big endian）,EF、BB、BF（UTF-8）。
public static String getCharset(File file) {
String charset = "GBK";
byte[] first3Bytes = new byte[3];
try {
boolean checked = false;
BufferedInputStream bis = new BufferedInputStream(
new FileInputStream(file));
bis.mark(0);
int read = bis.read(first3Bytes, 0, 3);
if (read == -1)
return charset;
if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
charset = "UTF-16LE";
checked = true;
} else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1]
== (byte) 0xFF) {
charset = "UTF-16BE";
checked = true;
} else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1]
== (byte) 0xBB
&& first3Bytes[2] == (byte) 0xBF) {
charset = "UTF-8";
checked = true;
}
bis.reset();
if (!checked) {
int loc = 0;
while ((read = bis.read()) != -1) {
loc++;
if (read >= 0xF0)
break;
//鍗曠嫭鍑虹幇BF浠ヤ笅鐨勶紝涔熺畻鏄疓BK
if (0x80 <= read && read <= 0xBF)
break;
if (0xC0 <= read && read <= 0xDF) {
read = bis.read();
if (0x80 <= read && read <= 0xBF)// 鍙屽瓧鑺?(0xC0 - 0xDF)
// (0x80 -
// 0xBF),涔熷彲鑳藉湪GB缂栫爜鍐? continue;
else
break;
// 涔熸湁鍙兘鍑洪敊锛屼絾鏄嚑鐜囪緝灏? } else if (0xE0 <= read && read <= 0xEF) {
read = bis.read();
if (0x80 <= read && read <= 0xBF) {
read = bis.read();
if (0x80 <= read && read <= 0xBF) {
charset = "UTF-8";
break;
} else
break;
} else
break;
}
}
System.out.println(loc + " " + Integer.toHexString(read));
}
bis.close();
} catch (Exception e) {
e.printStackTrace();
}
return charset;
}
缺点：不能这样去探测linux下的文件。
方法2：开源工程JCharDet
http://www.iteye.com/topic/266501
package org.mozilla.intl.chardet;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
/**
 * Detects the character set of a file with the help of JCharDet.
 *
 * <p>JCharDet is the Java port of Mozilla's automatic charset detection
 * algorithm; its home page is http://jchardet.sourceforge.net/
 *
 * <p>The detection result is kept in unsynchronised instance fields, so an
 * instance must not be used from several threads at the same time.
 *
 * @author icer
 * @date 2008/11/13
 */
public class FileCharsetDetector {
    /** Set by the detection observer once the detector reports a match. */
    private boolean found = false;

    /**
     * Name of the charset when a detection algorithm matched exactly;
     * otherwise (e.g. for a binary file) it keeps its default value
     * {@code null}.
     */
    private String encoding = null;

    /**
     * Command-line entry point: prints the detected encoding of a file.
     *
     * @param argv {@code <path> [<languageHint>]}
     * @throws Exception if the file cannot be read or the hint is not a number
     */
    public static void main(String[] argv) throws Exception {
        if (argv.length != 1 && argv.length != 2) {
            System.out
                    .println("Usage: FileCharsetDetector <path> [<languageHint>]");
            System.out.println("");
            System.out.println("Where <path> is d:/demo.txt");
            System.out.println("For optional <languageHint>. Use following...");
            System.out.println(" 1 => Japanese");
            System.out.println(" 2 => Chinese");
            System.out.println(" 3 => Simplified Chinese");
            System.out.println(" 4 => Traditional Chinese");
            System.out.println(" 5 => Korean");
            System.out.println(" 6 => Dont know (default)");
            return;
        }
        String encoding;
        if (argv.length == 2) {
            encoding = new FileCharsetDetector().guestFileEncoding(argv[0],
                    Integer.parseInt(argv[1]));
        } else {
            encoding = new FileCharsetDetector().guestFileEncoding(argv[0]);
        }
        // Label restored from a mis-decoded literal; written with Unicode
        // escapes so it no longer depends on the source file encoding.
        System.out.println("\u6587\u4ef6\u7f16\u7801:" + encoding);
    }

    /**
     * Checks the encoding of the given file.
     *
     * @param file
     *            the File instance to inspect
     * @return the file encoding, or null if none could be determined
     * @throws FileNotFoundException
     * @throws IOException
     */
    public String guestFileEncoding(File file) throws FileNotFoundException,
            IOException {
        return geestFileEncoding(file, new nsDetector());
    }

    /**
     * Gets the encoding of a file.
     *
     * @param file
     *            the File instance to inspect
     * @param languageHint
     *            language hint code, e.g. 1 : Japanese; 2 : Chinese;
     *            3 : Simplified Chinese; 4 : Traditional Chinese; 5 : Korean;
     *            6 : Dont know (default)
     * @return the file encoding, e.g. UTF-8, GBK, GB2312, or null if none
     *         could be determined
     * @throws FileNotFoundException
     * @throws IOException
     */
    public String guestFileEncoding(File file, int languageHint)
            throws FileNotFoundException, IOException {
        return geestFileEncoding(file, new nsDetector(languageHint));
    }

    /**
     * Gets the encoding of a file.
     *
     * @param path
     *            path of the file
     * @return the file encoding, e.g. UTF-8, GBK, GB2312, or null if none
     *         could be determined
     * @throws FileNotFoundException
     * @throws IOException
     */
    public String guestFileEncoding(String path) throws FileNotFoundException,
            IOException {
        return guestFileEncoding(new File(path));
    }

    /**
     * Gets the encoding of a file.
     *
     * @param path
     *            path of the file
     * @param languageHint
     *            language hint code, e.g. 1 : Japanese; 2 : Chinese;
     *            3 : Simplified Chinese; 4 : Traditional Chinese; 5 : Korean;
     *            6 : Dont know (default)
     * @return the file encoding, or null if none could be determined
     * @throws FileNotFoundException
     * @throws IOException
     */
    public String guestFileEncoding(String path, int languageHint)
            throws FileNotFoundException, IOException {
        return guestFileEncoding(new File(path), languageHint);
    }

    /**
     * Feeds the file to the detector and returns the resulting encoding.
     *
     * @param file
     *            the file to read
     * @param det
     *            the detector to drive
     * @return "ASCII" when every byte read was ASCII (including an empty
     *         file), the charset reported to the observer, the first probable
     *         charset when nothing was reported, or null when there is none
     * @throws FileNotFoundException
     * @throws IOException
     */
    private String geestFileEncoding(File file, nsDetector det)
            throws FileNotFoundException, IOException {
        // Start from a clean slate so a reused instance cannot hand back the
        // result of an earlier detection.
        found = false;
        encoding = null;

        // Set an observer...
        // The Notify() will be called when a matching charset is found.
        det.Init(new nsICharsetDetectionObserver() {
            public void Notify(String charset) {
                found = true;
                encoding = charset;
            }
        });

        boolean isAscii = true;
        BufferedInputStream imp = new BufferedInputStream(new FileInputStream(
                file));
        try {
            byte[] buf = new byte[1024];
            int len;
            boolean done = false;
            while (!done && (len = imp.read(buf, 0, buf.length)) != -1) {
                // Check if the stream is only ascii.
                if (isAscii) {
                    isAscii = det.isAscii(buf, len);
                }
                // DoIt if non-ascii and not done yet. Once it reports done the
                // remaining bytes would be ignored anyway, so reading stops.
                if (!isAscii) {
                    done = det.DoIt(buf, len, false);
                }
            }
        } finally {
            // Release the file handle even when reading fails.
            imp.close();
        }
        det.DataEnd();

        if (isAscii) {
            encoding = "ASCII";
            found = true;
        }
        if (!found) {
            String[] prob = det.getProbableCharsets();
            if (prob.length > 0) {
                // Nothing was reported, so take the first probable encoding.
                encoding = prob[0];
            } else {
                return null;
            }
        }
        return encoding;
    }
}
方法3：开源工程juniversalchardet
http://code.google.com/p/juniversalchardet/
/**
 * Detects the encoding of a file with juniversalchardet's
 * {@code UniversalDetector}.
 *
 * @param file the file to inspect
 * @return the detected charset name, or {@code null} when the file is
 *         missing, no charset was detected, or reading failed
 */
public static String getFileIncode(File file) {
    // A null reference is reported the same way as a missing file instead of
    // failing with a NullPointerException.
    if (file == null || !file.exists()) {
        System.err.println("getFileIncode: file not exists!");
        return null;
    }
    byte[] buf = new byte[4096];
    FileInputStream fis = null;
    try {
        fis = new FileInputStream(file);
        // (1) Create the detector.
        UniversalDetector detector = new UniversalDetector(null);
        // (2) Feed data until the detector is done or the file ends. The
        // done check comes first so no chunk is read only to be discarded.
        int nread;
        while (!detector.isDone() && (nread = fis.read(buf)) > 0) {
            detector.handleData(buf, 0, nread);
        }
        // (3) Tell the detector that the input is complete.
        detector.dataEnd();
        // (4) Fetch the result.
        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            System.out.println("Detected encoding = " + encoding);
        } else {
            System.out.println("No encoding detected.");
        }
        // (5) Reset the detector.
        detector.reset();
        return encoding;
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close on every path; previously the stream leaked whenever an
        // exception was thrown before the explicit close() call.
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }
    return null;
}
引入包的方法：
将包放入libs文件夹，
选中包，右键 --> build path--> add to build path。