java 文件编码判断

2022-12-19 07:10:22

Java 自带的 Unicode 处理缺少对无 BOM（无头）UTF-8 格式文件编码的判断。

工具类实现java对文件编码的判断

编码格式判断工具类方法

/**
 * Guesses the text encoding of a stream from its leading byte-order mark (BOM).
 *
 * <p>The stream is wrapped in a {@code UnicodeInputStream} whose fallback encoding is the
 * platform default charset. The encoding reported by the wrapper is mapped to one of the
 * three names this method has always returned:
 * <ul>
 *   <li>{@code "UTF-8"} - a UTF-8 BOM was found, or no BOM was found and the platform
 *       default charset is UTF-8;</li>
 *   <li>{@code "UTF-16"} - the stream starts with {@code FF FE} or {@code FE FF};</li>
 *   <li>{@code "GBK"} - anything else.</li>
 * </ul>
 *
 * <p>The previous version closed the wrapper (and therefore {@code in}) and then tried to
 * read the header from {@code in} and to {@code reset()} it without a prior {@code mark()}.
 * The BOM is now inspected once, through the wrapper, before anything is closed.
 *
 * @param in stream positioned at the start of the text; it is consumed and closed by this
 *           method, so open a fresh stream to read the content afterwards
 * @return {@code "UTF-8"}, {@code "UTF-16"} or {@code "GBK"}
 * @throws IOException if closing the stream fails; a read failure while sniffing the BOM is
 *                     reported by {@code UnicodeInputStream.getEncoding()} as an unchecked
 *                     exception
 */
public static String getTxtEncode(InputStream in) throws IOException {
        String platformDefault = Charset.defaultCharset().name();
        // try-with-resources closes the wrapper (and with it `in`) exactly once on every path.
        try (UnicodeInputStream uin = new UnicodeInputStream(in, platformDefault)) {
            String detected = uin.getEncoding();
            if ("UTF-8".equals(detected)) {
                return "UTF-8";
            }
            // NOTE(review): UTF-32LE also begins with FF FE, so the original two-byte check
            // reported it as "UTF-16"; that mapping is kept to preserve the return values.
            boolean startsWithUtf16Bom = "UTF-16LE".equals(detected)
                    || "UTF-16BE".equals(detected)
                    || "UTF-32LE".equals(detected);
            return startsWithUtf16Bom ? "UTF-16" : "GBK";
        }
    }

此处代码实现对无头（无 BOM）UTF-8 编码的判断：未检测到 BOM 时，返回调用方传入的默认编码。

import java.io.*;    

/**  
 * This inputstream will recognize unicode BOM marks and will skip bytes if  
 * getEncoding() method is called before any of the read(...) methods.  
 *   
 * Usage pattern: String enc = "ISO-8859-1"; // or NULL to use systemdefault  
 * FileInputStream fis = new FileInputStream(file); UnicodeInputStream uin = new  
 * UnicodeInputStream(fis, enc); enc = uin.getEncoding(); // check and skip  
 * possible BOM bytes InputStreamReader in; if (enc == null) in = new  
 * InputStreamReader(uin); else in = new InputStreamReader(uin, enc);  
 *
 * 添加对无Bom Utf-8文件编码的识别
 */    
public class UnicodeInputStream extends InputStream {    
    PushbackInputStream internalIn;    
    boolean isInited = false;    
    String defaultEnc;    
    String encoding;    

    private static final int BOM_SIZE = 4;    

    public UnicodeInputStream(InputStream in, String defaultEnc) {    
        internalIn = new PushbackInputStream(in, BOM_SIZE);    
        this.defaultEnc = defaultEnc;    
    }    

    public String getDefaultEncoding() {    
        return defaultEnc;    
    }    

    public String getEncoding() {    
        if (!isInited) {    
            try {    
                init();    
            } catch (IOException ex) {    
                IllegalStateException ise = new IllegalStateException(    
                        "Init method failed.");    
                ise.initCause(ise);    
                throw ise;    
            }    
        }    
        return encoding;    
    }    

    /**  
     * Read-ahead four bytes and check for BOM marks. Extra bytes are unread  
     * back to the stream, only BOM bytes are skipped.  
     */    
    protected void init() throws IOException {    
        if (isInited)    
            return;    

        byte bom[] = new byte[BOM_SIZE];    
        int n, unread;    
        n = internalIn.read(bom, 0, bom.length);    

        if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00)    
                && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {    
            encoding = "UTF-32BE";    
            unread = n - 4;    
        } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)    
                && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {    
            encoding = "UTF-32LE";    
            unread = n - 4;    
        } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)    
                && (bom[2] == (byte) 0xBF)) {    
            encoding = "UTF-8";    
            unread = n - 3;    
        } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {    
            encoding = "UTF-16BE";    
            unread = n - 2;    
        } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {    
            encoding = "UTF-16LE";    
            unread = n - 2;    
        } else {    
            // Unicode BOM mark not found, unread all bytes    
            encoding = defaultEnc;    
            unread = n;    
        }    
        // System.out.println("read=" + n + ", unread=" + unread);    

        if (unread > 0)    
            internalIn.unread(bom, (n - unread), unread);    

        isInited = true;    
    }    

    public void close() throws IOException {    
        // init();    
        isInited = true;    
        internalIn.close();    
    }    

    public int read() throws IOException {    
        // init();    
        isInited = true;    
        return internalIn.read();    
    }    
}

参考此文： http://jybzjf.iteye.com/blog/2262392

java 文件编码判断

继续阅读

BOM——window对象下的属性、定时器、匀速动画函数1.系统对话框 2.页面加载的事件3.window对象下的属性4.定时器（计时器）——有两种5.计时器案例

BOM_window对象

bom-window-setInterval()和setTimeout()的区别

BOM之window 对象

BOM:window对象的常见事件之调整窗口大小事件

BOM中window对象的方法之弹窗

Java正确判别出文件的字符集（尤其是UTF-8(无BOM)和GBK的判断）

Bom中的方法

BOM清除工具

iOS开发的几种加密方式

测试的基本理论与方法（1）

记一次因MySQL编码问题导致的慢查询排查

java操作access数据库乱码问题

《程序员的职业素养》四——编码

V4L2视频采集与H264编码1—V4L2采集JPEG数据

Netty——自定义协议解决TCP粘包拆包问题什么是TCP粘包拆包自定义协议解决拆包粘包问题