根據霍夫曼編碼，壓縮和解壓檔案位元組數組的過程

哈夫曼編碼(Huffman Coding)，又稱霍夫曼編碼，是一種編碼方式，可變字長編碼(VLC)的一種。Huffman于1952年提出一種編碼方法，該方法完全依據字元出現機率來構造異字頭的平均長度最短的碼字，有時稱之為最佳編碼，一般就叫做Huffman編碼（有時也稱為霍夫曼編碼）。

哈夫曼編碼，主要目的是根據使用頻率來最大化節省字元（編碼）的存儲空間。

本文主要利用霍夫曼編碼來壓縮位元組數組（可以是任何檔案），同時修複了尚矽谷韓順平老師的bug，對于位元組數組最後一位的補位問題

流程主要是

1.源位元組數組—>根據各個位元組出現的頻次，建立有權重（即該位元組出現的次數）的二叉樹結點，存儲到清單中

2.根據上一步的清單list，建立霍夫曼樹（建立過程原理可參照百度，原理較為簡單），獲得根節點

3.獲得霍夫曼樹後，重新編碼，根據路徑左0右1的原則，對每個字元的霍夫曼編碼存儲到map中，便于後續壓縮

4.對照編碼map，對源位元組數組重新編碼，獲得一串二進制數組

5.編碼後的二進制數組，每8位存儲為一個位元組，得到的結果就是編碼壓縮後的新位元組數組。

6.解碼：先将壓縮後的位元組數組依次讀取轉成二進制字元串

7.反轉map，依次讀取二進制字元串，還原源位元組數組

附上源碼

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HuffmanCode {
	public static void main(String[] args) {
		
		zipFile("G:/src.bmp", "G:/dest.zip");
		System.out.println("壓縮成功");
		
		unZipFile("G:/dest.zip", "G:/src2.bmp");
		System.out.println("解壓成功");
		
		/*
		String content = "i like like like java do you like a javae";
//		String content = "你好";
		
		byte[] contentBytes = content.getBytes();
		System.out.println("原檔案位元組數組："+Arrays.toString(contentBytes));
//		System.out.println("原檔案位元組數組長度="+contentBytes.length);//40
		
		byte[] huffmanCodeBytes = huffmanZip(contentBytes);
		System.out.println("壓縮後的數組為："+Arrays.toString(huffmanCodeBytes));
		
		byte[] decodeBytes = decode(huffmanCodes, huffmanCodeBytes);
		System.out.println("解碼後的結果："+new String(decodeBytes));
		*/
		
		/*
		List<Node> nodes = getNodes(contentBytes);
		System.out.println(nodes);
		
		Node huffmanTreeRoot = createHuffmanTree(nodes);
		huffmanTreeRoot.preList();
		
		Map<Byte, String> huffmanCodes = createHuffmanCode(huffmanTreeRoot);
		System.out.println(huffmanCodes);
		
		byte[] huffmanCodeBytes = zip(contentBytes, huffmanCodes);
		System.out.println(Arrays.toString(huffmanCodeBytes)); //隻有17長度*/
	}
	
	//存放哈夫曼編碼表
	static Map<Byte, String> huffmanCodes = new HashMap<>();
	//二進制字元串8位存放一個位元組，存放編碼後的最後一個數字位數
	static int lastNumCap;
	
	//解壓檔案
	public static void unZipFile(String src,String dest) {
		InputStream is = null;
		ObjectInputStream ois = null;
		OutputStream os = null;
		
		try {
			is = new FileInputStream(src);
			ois = new ObjectInputStream(is);
			byte[] srcBytes = (byte[]) ois.readObject();//讀取的編碼後的位元組數組
			Map<Byte, String> huffmanCodes = (Map<Byte, String>) ois.readObject();//讀取編碼表
			byte[] destBytes = decode(huffmanCodes, srcBytes);
			
			os = new FileOutputStream(dest);
			os.write(destBytes);
			os.flush();
		} catch (Exception e) {
			System.out.println(e.getMessage());
		}finally {
			try {
				os.close();
				ois.close();
				is.close();
			} catch (IOException e) {
				// TODO 自動生成的 catch 塊
				e.printStackTrace();
			}
		}
	}
	
	//利用霍夫曼編碼壓縮普通檔案
	public static void zipFile(String src,String dest) {
		InputStream is = null;
		OutputStream os = null;
		ObjectOutputStream oos = null;
		try {
			is = new FileInputStream(src);
			byte[] srcBytes = new byte[is.available()];
			is.read(srcBytes);
			
			byte[] zipBytes = huffmanZip(srcBytes);
			
			os = new FileOutputStream(dest);
			oos = new ObjectOutputStream(os);//為了便于依次寫入檔案和編碼表，便于後續解壓
			oos.writeObject(zipBytes);
			oos.writeObject(huffmanCodes);
			
		} catch (Exception e) {
			System.out.println(e.getMessage());
		}finally {
			try {
				oos.close();
				os.close();
				is.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
	
	//完成資料解壓
	//1.壓縮後的數組[-88, -65, -56, -65, -56, -65, -55, 77, -57, 6, -24, -14, -117, -4, -60, -90, 28]
	//byte轉成二進制字元串，并且拼接
	//2.逐個讀取二進制字元串，根據哈夫曼編碼表解碼，得到解碼後的位元組數組
	
	/**
	 * 解壓方法
	 * @param huffmanCodes 霍夫曼編碼表(壓縮時建立)
	 * @param huffmanBytes 需要解壓位元組數組
	 * @return 壓縮前的源檔案,即解壓後的位元組數組
	 */
	private static byte[] decode(Map<Byte,String> huffmanCodes,byte[] huffmanBytes) {
		StringBuilder stringBuilder = new StringBuilder();

		//1.位元組數組轉成二進制字元串[-89, -34, -121, -34, -121, -->
		//101010001011111111001000101111111100100。。。
		for(int i = 0;i<huffmanBytes.length;i++) {
			byte b = huffmanBytes[i];
			boolean flag = (i==huffmanBytes.length-1);//判斷是否是最後一個，根據實際需要是否要補位
			stringBuilder.append(byteToBitString(!flag, b));
		}
//		System.out.println(stringBuilder);
		
		//2.讀取二進制字元串，根據編碼表獲得原本的位元組數組
		//反轉map，便于查詢101010001011111111001000
		Map<String, Byte> map = new HashMap<>();
		for(Map.Entry<Byte, String> entry:huffmanCodes.entrySet()) {
			map.put(entry.getValue(), entry.getKey()); 
		}
		
		//列印解碼用的map,測試用
//		System.out.println(map);
		List<Byte> list = new ArrayList<>();
		int count;//每次讀取的長度，讀取不到遞增變化
		boolean flag = true;
		String str;
		
		for(int i = 0;i<stringBuilder.length();) {
//			if(i==stringBuilder.length()-4) {
//				System.out.println("???");
//			}
			count = 1;
			flag = true;
			while(flag) {
				str = stringBuilder.substring(i,i+count);
				if(map.containsKey(str)) {//包含key
					list.add(map.get(str));
					//退出循環
					flag = false;
				}else {
					//需要繼續往下讀取
					count++;
				}
			}
			
			//讀取完一個位元組後，i+count這裡還沒讀取
			i +=count;
		}
//		System.out.println(list.size());
		//将list中的byte放入數組
		byte[] decodeBytes = new byte[list.size()];
		
		for(int i = 0;i<decodeBytes.length;i++) {
			decodeBytes[i] = list.get(i);
		}
		
		return decodeBytes;
	}
	
	/**
	 * 位元組轉二進制字元串
	 * @param flag 表示是否是位元組數組最後一個，true表示不是。則需要補位
	 * @param b
	 * @return
	 */
	private static String byteToBitString(boolean flag,byte b) {
		int temp = b;//轉成int類型
		if(flag) {
			temp |=256;//1 0000 0000按位取或  處理正數	
		}else {
			int a;
			//測試使用
//			System.out.println("temp="+temp);
			if(lastNumCap!=0) {//例如最後剩下001,lastNumCap=3, a = 1000,是以補充3個0
				a = (int) (Math.pow(2, lastNumCap));		
			}else {//說明二進制恰好被整除，最後剩一個8位，仍然按照取8位處理
				a = 256;
			}	
			temp |= (int) a;
			//測試用代碼
//			System.out.println("a="+a);
//			System.out.println("temp="+temp);
		}
		
		String str = Integer.toBinaryString(temp);
		if(flag) {			
			return str.substring(str.length()-8);
		}else {
			lastNumCap = lastNumCap==0?8:lastNumCap;//整除正常取8位，否則取lastNumCap位1001,取001
			return str.substring(str.length()-lastNumCap);
		}
	}
	
	/**
	 * 封裝壓縮方法
	 * @param contentBytes 需要壓縮的原位元組數組
	 * @return 壓縮後的數組
	 */
	private static byte[] huffmanZip(byte[] contentBytes) {
		//根據傳入的字元串數組，獲得各個字元出現的權重
		List<Node> nodes = getNodes(contentBytes);
		//建立赫夫曼樹
		Node huffmanTreeRoot = createHuffmanTree(nodes);
		//根據赫夫曼樹建立赫夫曼編碼表
		Map<Byte, String> huffmanCodes = createHuffmanCode(huffmanTreeRoot);
		//根據赫夫曼編碼表，擷取編碼後的壓縮數組
		byte[] huffmanCodeBytes = zip(contentBytes, huffmanCodes);
		return huffmanCodeBytes;
	}
	
	//赫夫曼編碼位元組數組方法
	/**
	 * 
	 * @param bytes 原字元串的位元組數組
	 * @param huffmanCodes 霍夫曼編碼表
	 * @return 編碼後的壓縮數組
	 */
	private static byte[] zip(byte[] bytes,Map<Byte, String> huffmanCodes) {
		StringBuilder stringBuilder = new StringBuilder();
		for(byte key:bytes) {
			stringBuilder.append(huffmanCodes.get(key));
		}
		//這是編碼後的二進制位元組串
//		System.out.println(stringBuilder.length());
//		System.out.println(stringBuilder);
		
		int len;//壓縮數組的長度
		lastNumCap = stringBuilder.length() % 8; //記錄最後剩下的一位的長度
		//确定傳回壓縮數組的長度
		//也可以一句搞定 len = (stringBuilder.length()+7)/8
		if(stringBuilder.length()%8==0) {
			len = stringBuilder.length() / 8;
		}else {
			len = stringBuilder.length() / 8 + 1;
		}
		
		byte[] huffmanCodeBytes = new byte[len];
		int index = 0;
		
		for(int i = 0;i<stringBuilder.length();i +=8) {
			String strByte;
			if(i+8>stringBuilder.length()) {
				strByte = stringBuilder.substring(i);
			}else {
				strByte = stringBuilder.substring(i,i+8);
			}
			huffmanCodeBytes[index] = (byte) Integer.parseInt(strByte, 2);
			index++;
		}
		
		return huffmanCodeBytes;
	}
	
	
	
	//提供方法createHuffmanCode的重載
	private static Map<Byte, String> createHuffmanCode(Node root){
		if(root==null) {
			return null;
		}
		createHuffmanCode(root.left, "0", new StringBuilder());
		createHuffmanCode(root.right,"1",new StringBuilder());
		
		return huffmanCodes;
	}
	
	
	/**
	 * 生成相應的赫夫曼編碼
	 * @param node 赫夫曼樹根節點
	 * @param code 區分左右路徑，左為0，右為1
	 * @param stringBuilder 到達目前結點的路徑，不包括上一個0（1），需要附加code
	 */
	public static void createHuffmanCode(Node node,String code,StringBuilder stringBuilder){
		//這裡建立一個stringBuilder，避免後面的修改覆寫
		StringBuilder stringBuilder2 = new StringBuilder(stringBuilder);
		//這裡采用額外參數添加的方式，與上面的理由相同
		stringBuilder2.append(code);
		if(node!=null) {
			if(node.data!=null) {
				huffmanCodes.put(node.data, stringBuilder2.toString());
			}else {
				createHuffmanCode(node.left,"0", stringBuilder2);
				createHuffmanCode(node.right, "1",stringBuilder2);
			}
		}
	}
	
	/**
	 * 根據原位元組數組，獲得每個位元組權重，建立結點的清單
	 * @param bytes 需要壓縮的位元組數組
	 * @return
	 */
	public static List<Node> getNodes(byte[] bytes){
		List<Node> nodes = new ArrayList<Node>();
		
		Map<Byte,Integer> counts = new HashMap<Byte, Integer>();
		for(byte data:bytes) {
			Integer count = counts.get(data);
			if(count==null) {//說明map中還未出現此data
				counts.put(data, 1);
			}else {
				counts.put(data, count+1);
			}
		}
		
		for(Map.Entry<Byte, Integer> entry:counts.entrySet()) {
			nodes.add(new Node(entry.getKey(), entry.getValue()));
		}
		
		return nodes;
	}
	
	/**
	 * 建立霍夫曼樹
	 * @param nodes  例如 [Node [data=-70, weight=1], Node [data=-60, weight=1]
	 * @return 建立好的霍夫曼樹根節點
	 */
	public static Node createHuffmanTree(List<Node> nodes) {
		while(nodes.size()>1) {
			Collections.sort(nodes);
			Node leftNode = nodes.remove(0);
			Node rightNode = nodes.remove(0);
			Node parent = new Node(null, leftNode.weight+rightNode.weight);
			parent.left = leftNode;
			parent.right = rightNode;
			nodes.add(parent);
		}
		
		return nodes.get(0);
	}
}

class Node implements Comparable<Node>{
	Byte data;//字元
	int weight;//權重
	Node left;
	Node right;
	
	public Node(Byte data, int weight) {
		this.data = data;
		this.weight = weight;
	}
	
	@Override
	public String toString() {
		return "Node [data=" + data + ", weight=" + weight + "]";
	}
	
	//前序周遊
	public void preList() {
		System.out.println(this);
		if(this.left!=null) {
			this.left.preList();
		}
		
		if(this.right!=null) {
			this.right.preList();
		}
	}

	@Override
	public int compareTo(Node o) {
		// TODO 自動生成的方法存根
		return this.weight - o.weight;
	}
}

根據霍夫曼編碼，壓縮和解壓檔案位元組數組的過程

繼續閱讀

nginx location中斜線的位置的重要性

資料結構與算法（27）——排序（二）

27 Best Free Eclipse Plug-ins for Java Developer to be ProductiveCode Quality PluginsText Editor PluginsDependency ManagementVersion Control Integration PluginsFramework Development Continuous Integration Related PluginsOther Utility Plugins

Java String.format方法的簡單使用

neo4j之cypher使用文檔

GitHub連夜封殺！這份阿裡 10W 字内部 Java 字面試手冊到底有多強？

spark/scala關于【資源檔案】加載方法概述外部檔案加載方案測試資源檔案打包入jar包中小結

mybatis_入門程式Mybatis入門

AOP程式設計_Android優雅權限架構(1)概念基礎，2021金三銀四前言正文大綱正文

Effective Java 8:通用程式設計

OOM三種類型

工廠模式-三種類型

【遞歸】高效率求2的n次幂

win10本地scala和spark安裝安裝scala安裝spark

Linux裝置模型（中）之上層容器

scala (3) Function 和 Method