Java處理UTF-8檔案的BOM頭部
BOM——Byte Order Mark,就是位元組序标記。
基本概念
- 在UCS編碼中有一個叫做“ZERO WIDTH NO-BREAK SPACE”的字元,它的編碼是FEFF。而FFFE在UCS中是不存在的字元,是以不應該出現在實際傳輸中。
- UCS規範建議我們在傳輸位元組流前,先傳輸字元“ZERO WIDTH NO-BREAK SPACE”。
- 如果接收者收到FEFF,就表明這個位元組流是大位元組序的;如果收到FFFE,就表明這個位元組流是小位元組序的。是以字元“ZERO WIDTH NO-BREAK SPACE”又被稱作BOM。
UTF-8不需要BOM來表明位元組順序,但可以用BOM來表明編碼方式。字元“ZERO WIDTH NO-BREAK SPACE”的UTF-8編碼是EF BB BF。是以如果接收者收到以EF BB BF開頭的位元組流,就知道這是UTF-8編碼了。
這個BOM頭部對于UTF-8來說不是必須的,并且我建議最好不用有這個頭部,以避免可能的相容性問題。
下面就來看看怎麼用java來處理UTF-8的BOM頭部
增加BOM到UTF-8檔案
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
/**
 * Demo: creates a UTF-8 text file whose first bytes are the byte order mark (BOM).
 */
public class AddBomToUtf8File {

    /** U+FEFF; a UTF-8 encoder turns this single char into the bytes EF BB BF. */
    private static final String BOM_CHAR = "\ufeff";

    /** Writes a two-line sample file, BOM first, to a fixed location. */
    public static void main(String[] args) throws IOException {
        writeBomFile(Paths.get("/home/file.txt"), "billy");
    }

    /**
     * Writes the BOM, then {@code content}, a platform line break, and {@code content} again.
     * An {@link IOException} is printed to stderr and not rethrown.
     *
     * @param path    file to create or overwrite
     * @param content text placed on each of the two lines
     */
    private static void writeBomFile(Path path, String content) {
        // The single-argument newBufferedWriter encodes text as UTF-8.
        try (BufferedWriter writer = Files.newBufferedWriter(path)) {
            writer.write(BOM_CHAR);
            writer.write(content);
            writer.newLine();
            writer.write(content);
        } catch (IOException ioFailure) {
            ioFailure.printStackTrace();
        }
    }
}
在Java 8 之前可以用下面的方法:
new OutputStreamWriter(
new FileOutputStream(path.toFile())
, StandardCharsets.UTF_8))) {
bw.write("\ufeff");
bw.write(content);
bw.newLine();
bw.write(content);
} catch (IOException e) {
e.printStackTrace();
}
}
或者可以用 PrintWriter 和OutputStreamWriter
try (PrintWriter pw = new PrintWriter(
new OutputStreamWriter(
new FileOutputStream(path.toFile()), StandardCharsets.UTF_8))) {
//pw.write("\ufeff");
pw.write(0xfeff); // alternative, codepoint
pw.write(content);
pw.write(System.lineSeparator());
pw.write(content);
} catch (IOException e) {
e.printStackTrace();
}
}
又或者,這樣:
/**
 * Byte-level variant: writes the three raw BOM bytes (EF BB BF) to the file and then the
 * UTF-8 bytes of {@code content}, a platform line break, and {@code content} again.
 * An {@link IOException} is printed to stderr and not rethrown.
 *
 * @param path    file to create or overwrite
 * @param content text placed on each of the two lines
 */
private static void writeBomFile4(Path path, String content) {
    try (FileOutputStream out = new FileOutputStream(path.toFile())) {
        // the BOM goes out first, before the content is encoded
        out.write(new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF});

        final byte[] payload = content.getBytes(StandardCharsets.UTF_8);
        final byte[] lineBreak = System.lineSeparator().getBytes(StandardCharsets.UTF_8);

        out.write(payload);
        out.write(lineBreak);
        out.write(payload);
    } catch (IOException ioFailure) {
        ioFailure.printStackTrace();
    }
}
檢查檔案是否包含BOM頭部
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
/**
 * Demo: reports whether a file begins with the UTF-8 byte order mark (EF BB BF).
 *
 * <p>Uses only the JDK: the stream comes from {@link Files#newInputStream} and the three
 * leading bytes are compared directly, so no hex-encoding helper is required.
 */
public class CheckBom {

    /** The byte sequence that U+FEFF is encoded to in UTF-8. */
    private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};

    /** Checks a fixed sample path and prints the outcome. */
    public static void main(String[] args) throws IOException {
        Path path = Paths.get("/home/file.txt");
        if (isContainBOM(path)) {
            System.out.println("Found BOM!");
        } else {
            System.out.println("No BOM.");
        }
    }

    /**
     * Tells whether the file's first three bytes are the UTF-8 BOM.
     *
     * @param path file to inspect
     * @return {@code true} only if the file starts with EF BB BF; files shorter than
     *         three bytes yield {@code false}
     * @throws IllegalArgumentException if {@code path} does not exist
     * @throws IOException              if the file cannot be opened or read
     */
    private static boolean isContainBOM(Path path) throws IOException {
        if (Files.notExists(path)) {
            throw new IllegalArgumentException("Path: " + path + " does not exists!");
        }
        try (InputStream is = Files.newInputStream(path)) {
            for (byte expected : UTF8_BOM) {
                // read() returns 0..255, or -1 at end of stream; -1 can never equal a
                // BOM byte, so short files fall out here without special handling
                if (is.read() != (expected & 0xFF)) {
                    return false;
                }
            }
            return true;
        }
    }
}
凡是用到 org.apache.commons.codec.binary.Hex 的代碼,都需要下面這個依賴(commons-codec):
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.14</version>
</dependency>
移除UTF-8檔案的BOM頭部
通常,我建議不要用這個BOM,不然處理不好産生什麼亂碼就麻煩了。
import org.apache.commons.codec.binary.Hex;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
/**
 * Demo: writes a sample UTF-8 file that starts with a BOM, then strips the BOM again by
 * rewriting the file without its first three bytes.
 *
 * <p>BOM detection compares the leading bytes directly with JDK classes only.
 */
public class RemoveBomFromUtf8File {

    /** The byte sequence that U+FEFF is encoded to in UTF-8. */
    private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};

    /** Creates the sample file at a fixed path and removes its BOM. */
    public static void main(String[] args) throws IOException {
        Path path = Paths.get("/home/file.txt");
        writeBomFile(path, "billy");
        removeBom(path);
    }

    /**
     * Writes the BOM, then {@code content}, a platform line break, and {@code content} again.
     * An {@link IOException} is printed to stderr and not rethrown.
     *
     * @param path    file to create or overwrite
     * @param content text placed on each of the two lines
     */
    private static void writeBomFile(Path path, String content) {
        // The single-argument newBufferedWriter encodes text as UTF-8.
        try (BufferedWriter bw = Files.newBufferedWriter(path)) {
            bw.write("\ufeff");
            bw.write(content);
            bw.newLine();
            bw.write(content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Tells whether the file's first three bytes are the UTF-8 BOM.
     *
     * @param path file to inspect
     * @return {@code true} only if the file starts with EF BB BF; files shorter than
     *         three bytes yield {@code false}
     * @throws IllegalArgumentException if {@code path} does not exist
     * @throws IOException              if the file cannot be opened or read
     */
    private static boolean isContainBOM(Path path) throws IOException {
        if (Files.notExists(path)) {
            throw new IllegalArgumentException("Path: " + path + " does not exists!");
        }
        try (InputStream is = Files.newInputStream(path)) {
            for (byte expected : UTF8_BOM) {
                // read() returns 0..255, or -1 at end of stream; -1 can never equal a
                // BOM byte, so short files fall out here without special handling
                if (is.read() != (expected & 0xFF)) {
                    return false;
                }
            }
            return true;
        }
    }

    /**
     * Rewrites the file without its leading BOM; a file that has no BOM is left untouched.
     * Progress messages are printed to stdout.
     *
     * @param path file to strip in place
     * @throws IllegalArgumentException if {@code path} does not exist
     * @throws IOException              if reading or rewriting the file fails
     */
    private static void removeBom(Path path) throws IOException {
        if (!isContainBOM(path)) {
            System.out.println("This file doesn't contains UTF-8 BOM!");
            return;
        }
        byte[] bytes = Files.readAllBytes(path);
        System.out.println("Found BOM!");
        // everything after the three BOM bytes
        byte[] contentAfterBom = new byte[bytes.length - UTF8_BOM.length];
        System.arraycopy(bytes, UTF8_BOM.length, contentAfterBom, 0, contentAfterBom.length);
        System.out.println("Remove the first 3 bytes, and overwrite the file!");
        // overwrite the same path
        Files.write(path, contentAfterBom);
    }
}
複制UTF-8檔案并追加BOM
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
/**
 * Demo: copies a file to a new location, placing the UTF-8 BOM in front of its bytes.
 *
 * <p>The destination is opened with {@link Files#newOutputStream}, which by default
 * creates the file or truncates an existing one.
 */
public class CopyAndAddBomToXmlFile {

    /** The byte sequence that U+FEFF is encoded to in UTF-8. */
    private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};

    /** Copies the sample XML resource to a sibling file that starts with a BOM. */
    public static void main(String[] args) {
        Path src = Paths.get("src/main/resources/staff.xml");
        Path dest = Paths.get("src/main/resources/staff-bom.xml");
        writeBomFile(src, dest);
    }

    /**
     * Writes the BOM to {@code dest} and then streams every byte of {@code src} after it.
     * The source bytes are copied verbatim; they are not inspected for an existing BOM.
     * An {@link IOException} is printed to stderr and not rethrown.
     *
     * @param src  file whose bytes are copied
     * @param dest file to create or overwrite
     */
    private static void writeBomFile(Path src, Path dest) {
        try (OutputStream out = Files.newOutputStream(dest)) {
            // BOM first ...
            out.write(UTF8_BOM);
            // ... then the source bytes, appended to the same stream
            Files.copy(src, out);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
{
byte[] BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
// add BOM
fos.write(BOM);
// BOM + src to fos
Files.copy(src, fos);
} catch (IOException e) {
e.printStackTrace();
}
}
}
####