LZW編解碼算法
文章目錄
- LZW編解碼算法
- 編碼原理
- 解碼原理
- 代碼實作
LZW編碼(LZW Encoding)又稱“串表壓縮算法”,由J.Ziv和A.Lempel在1978年首次介紹,并由Terry A.Welch在1984年予以改進,最終該編碼方法由三人的名字命名。
該編碼方法屬于詞典壓縮編碼方法。詞典編碼是一種通用編碼方法,适用于無法觀察新源統計特性,或雖然可觀察但統計特性不固定的情形。
LZW編碼可應用于通用檔案壓縮(如WinZip)、動畫圖像壓縮(如GIF、TIFF)等領域。
編碼原理
LZW編碼的核心思想是從輸入資料中建立一個“短語詞典(Dictionary of Phrases)”,這種短語可以是任意字元的組合。編碼資料過程中當遇到已經在詞典中出現的“短語”時,編碼器就輸出該詞條的“索引号”,而不是短語本身,以實作壓縮。
編碼流程如下:
例如:對“abbababac”進行LZW編碼。
初始化詞典:a = 97,b = 98,c = 99(ASCII碼)
步驟 | P | C | PC在詞典中? | 若不在,輸出P | 新增詞條 | 說明 |
1 | NULL | a | 初始化,不處理 | |||
2 | a | b | 否 | 97(a) | ab:256 | ab不在詞典中,擴充,P = C |
3 | b | b | 否 | 98(b) | bb:257 | bb不在詞典中,擴充,P = C |
4 | b | a | 否 | 98(b) | ba:258 | ba不在詞典中,擴充,P = C |
5 | a | b | 是 | ab在詞典中,P = PC | ||
6 | ab | a | 否 | 256(ab) | aba:259 | aba不在詞典中,擴充,P = C |
7 | a | b | 是 | ab在詞典中,P = PC | ||
8 | ab | a | 是 | aba在詞典中,P = PC | ||
9 | aba | c | 否 | 259(aba) | abac:260 | abac不在詞典中,擴充,P = C |
10 | c | NULL | 99(c) | 結束,輸出未編碼的字元 |
即編碼結果為:
“97 98 98 256 259 99”
解碼原理
首先我們要明确一點,編碼過程中建立的詞典,實際上并不與碼流一起傳送。這主要考慮到以下兩點:一是由于碼表可能占用的空間很大,不傳送碼表可以将壓縮比最大化;二是如果需要等收到編碼端的詞典後再進行解碼,便不可能實作編解碼兩端的同步操作。
那麼,我們就要在解碼端按照與編碼端相同的規則同步地建立詞典(Trie樹),這便是LZW解碼的核心思想。
解碼流程如下圖所示:
例如将上述輸出的編碼碼流“97 98 98 256 259 99”進行解碼:
步驟 | pW | cW | cW在詞典中? | 輸出 | 新增詞條 | 說明 |
1 | 97 | 是 | a | 在詞典中,輸出cW;新增pW + cW首字元 | ||
2 | 97 | 98 | 是 | b | ab:256 | 在詞典中,輸出cW;新增pW + cW首字元 |
3 | 98 | 98 | 是 | b | bb:257 | 在詞典中,輸出cW;新增pW + cW首字元 |
4 | 98 | 256 | 是 | ab | ba:258 | 在詞典中,輸出cW;新增pW + cW首字元 |
5 | 256 | 259 | 否 | aba | aba:259 | 不在詞典中,輸出pW + pW首字元,并添加至詞典 |
6 | 259 | 99 | 是 | c | abac:260 | 在詞典中,輸出cW;新增pW + cW首字元 |
按上述流程解碼得“abbababac”,即成功複原原始資料。
代碼實作
declarations.h
#pragma once
#include "BitIO.h"
#define DICT_CAPACITY 65535// Capacity of the dictionary
/* Global variables */
struct {
int suffix;
int parent;
int firstChild;
int nextSibling;
} dictionary[DICT_CAPACITY + 1];
extern int nextNodeIdx; // Index of next node (i.e. next dictionary entry)
extern int decStack[DICT_CAPACITY]; // Stack for decoding a phrase
/* Functions */
void LzwEncoding(FILE* inFilePtr, BITFILE* outBitFilePtr);
void LzwDecoding(BITFILE* inBitFilePtr, FILE* outFilePtr);
BitIO.h
#pragma once
#include <iostream>
typedef struct {
FILE* fp;
unsigned char mask;
int rack;
}BITFILE;
BITFILE* OpenBitFileInput(char* fileName);
BITFILE* OpenBitFileOutput(char* fileName);
void CloseBitFileInput(BITFILE* bf);
void CloseBitFileOutput(BITFILE* bf);
int BitInput(BITFILE* bf);
unsigned long BitsInput(BITFILE* bf, int count);
void BitOutput(BITFILE* bf, int bit);
void BitsOutput(BITFILE* bf, unsigned long code, int count);
BitIO.cpp
/* Definitions for bitwise IO */
#include <iostream>
#include <stdlib.h>
#include "BitIO.h"
BITFILE* OpenBitFileInput(char* fileName) {
//BITFILE* bf = (BITFILE*)malloc(sizeof(BITFILE));
BITFILE* bf = new BITFILE;
if (bf == NULL) {
return NULL;
}
if (fileName == NULL) {
bf->fp = stdin;
} else {
fopen_s(&bf->fp, fileName, "rb");
}
if (bf->fp == NULL) {
return NULL;
}
bf->mask = 0x80;
bf->rack = 0;
return bf;
}
BITFILE* OpenBitFileOutput(char* fileName) {
//BITFILE* bf = (BITFILE*)malloc(sizeof(BITFILE));
BITFILE* bf = new BITFILE;
if (bf == NULL) {
return NULL;
}
if (fileName == NULL) {
bf->fp = stdout;
} else {
fopen_s(&bf->fp, fileName, "wb");
}
if (bf->fp == NULL) {
return NULL;
}
bf->mask = 0x80;
bf->rack = 0;
return bf;
}
void CloseBitFileInput(BITFILE* bf) {
fclose(bf->fp);
//free(bf);
delete bf;
}
void CloseBitFileOutput(BITFILE* bf) {
/* Output the remaining bits */
if (bf->mask != 0x80) {
fputc(bf->rack, bf->fp);
}
fclose(bf->fp);
//free(bf);
delete bf;
}
int BitInput(BITFILE* bf) {
int value;
if (bf->mask == 0x80) {
bf->rack = fgetc(bf->fp);
if (bf->rack == EOF) {
fprintf(stderr, "Reached the end of file.\n");
exit(-1);
}
}
value = bf->mask & bf->rack;
bf->mask >>= 1;
if (0 == bf->mask) {
bf->mask = 0x80;
}
return((value == 0) ? 0 : 1);
}
unsigned long BitsInput(BITFILE* bf, int count) {
unsigned long mask;
unsigned long value;
mask = 1L << (count - 1);
value = 0L;
while (mask != 0) {
if (BitInput(bf) == 1)
value |= mask;
mask >>= 1;
}
return value;
}
void BitOutput(BITFILE* bf, int bit) {
if (bit != 0) {
bf->rack |= bf->mask;
}
bf->mask >>= 1;
if (bf->mask == 0) { // 8 bits in rack
fputc(bf->rack, bf->fp);
bf->rack = 0;
bf->mask = 0x80;
}
}
void BitsOutput(BITFILE* bf, unsigned long code, int count) {
unsigned long mask;
mask = 1L << (count - 1);
while (mask != 0) {
BitOutput(bf, (int)((code & mask) == 0 ? 0 : 1));
mask >>= 1;
}
}
LzwED.cpp
#include <iostream>
#include "declarations.h"
#include "BitIO.h"
using namespace std;
/* Global variables */
int nextNodeIdx;
int decStack[DICT_CAPACITY];
/* Macros */
#define Input(f) ((int)BitsInput(f, 16))
#define Output(f, x) BitsOutput(f, (unsigned long)(x), 16)
void InitialiseDict() { // Dictionary initialisation (initialise root node 0-255)
for (int i = 0; i < 256; i++) {
dictionary[i].suffix = i; // The suffix of each node is the corresponding ASCII code
dictionary[i].parent = -1; // Temporarily doesn't have a parent node (i.e. prefix)
dictionary[i].firstChild = -1; // Temporarily doesn't have any child nodes
dictionary[i].nextSibling = i + 1; // The index of the next sibling root node is the next ASCII code
}
dictionary[255].nextSibling = -1; // No next sibling for the last root node
nextNodeIdx = 256; // The index of next dictionary entry
}
int InDict(int P, int C) {
if (P == -1) {
/* In this case, the current character is the start of the file,
and it's evidently in the dictionary,
thus return the corresponding ASCII code (let P = this character). */
return C;
}
/* Traverse all child node(s) of node P from left to right (i.e. all sibling nodes of the first child node) */
int sibling = dictionary[P].firstChild; // Start from the first child of P
while (sibling > -1) { // sibling == -1 indicates the end of sibling traversal
/* If a C-suffixed sibling is found, then return the code of PC (i.e. the index of this sibling) */
if (C == dictionary[sibling].suffix) {
return sibling;
}
/* If the suffixes don't match, then look for the next */
sibling = dictionary[sibling].nextSibling;
}
/* The suffix of all siblings don't match PC, thus PC isn't in the dictionary */
return -1;
}
void NewDictEntry(int P, int C) {
if (P < 0) {
return;
}
dictionary[nextNodeIdx].suffix = C;
dictionary[nextNodeIdx].parent = P;
dictionary[nextNodeIdx].nextSibling = -1; // Indicates that the node is the last sibling
dictionary[nextNodeIdx].firstChild = -1; // Temporarily this node doesn't have a child
int pFirstChild = dictionary[P].firstChild; // The first child of P
int pChild;
/* Set up the new sibling-relation */
if (pFirstChild > -1) { /* Parent of the new node originally have a child node */
pChild = pFirstChild; // Start from the first child of P
/* Look for the youngest child of P (i.e. the last sibling) */
while (dictionary[pChild].nextSibling > -1) {
pChild = dictionary[pChild].nextSibling;
}
dictionary[pChild].nextSibling = nextNodeIdx; // Set the new node as the next sibling of the current last sibling
} else { /* Parent of the new node originally doesn't have a child */
dictionary[P].firstChild = nextNodeIdx; // Set the new node as PC (i.e. the first child of P)
}
nextNodeIdx++; //Index of the next entry + 1
}
void LzwEncoding(FILE* inFilePtr, BITFILE* outBitFilePtr) {
int previousStr; // P
int currentChar; // C
int PC; // P & C combined as 1 character
/* Compute the size of file */
fseek(inFilePtr, 0, SEEK_END);
int inFileSize = ftell(inFilePtr);
fseek(inFilePtr, 0, SEEK_SET);
BitsOutput(outBitFilePtr, inFileSize, 4 * 8);
/* Initialise the dictionary and P */
InitialiseDict();
previousStr = -1; // Initialise P
//fprintf(outFilePtr, "LZW-encoded string: ");
while ( (currentChar = fgetc(inFilePtr)) != EOF ) {
/* Check if PC is in the dictionary */
PC = InDict(previousStr, currentChar);
if (PC >= 0) { /* PC is in the dictionary */
previousStr = PC; // Set P = PC
} else { /* PC isn't in the dictionary */
Output(outBitFilePtr, previousStr); // Output P
if (nextNodeIdx < DICT_CAPACITY) { /* Enough space to add PC into the dictionary */
NewDictEntry(previousStr, currentChar);
}
previousStr = currentChar; // Set P = C
}
}
Output(outBitFilePtr, previousStr); // Output the last unencoded character(s)
}
int DecodeString(int start, int code) {
int count = start;
while (code >= 0) {
/* Look for the root node */
decStack[count] = dictionary[code].suffix; // Store the original string in inverted order
code = dictionary[code].parent; // Set the parent of the current node as the next node
count++; // Points to the next node
}
return count; // The distance between the current node and the root
}
void LzwDecoding(BITFILE* inBitFilePtr, FILE* outFilePtr) {
int character;
int previousCode; // pW
int currentCode; // cW
int phraseLen; // Length of phrase
unsigned long inFileSize = BitsInput(inBitFilePtr, 4 * 8);
if (inFileSize == -1) {
inFileSize = 0;
}
/* Initialise dictionary and pW*/
InitialiseDict();
previousCode = -1;
while (inFileSize > 0) {
currentCode = Input(inBitFilePtr);
if (currentCode < nextNodeIdx) { /* cW is in dictionary */
phraseLen = DecodeString(0, currentCode); // The length of cW
} else { /* When cW ¡Ý next node index, which means cW > current node index, cW isn't in dictionary */
decStack[0] = character; // The last character in stack of the last loop, i.e. 1st character of pW
phraseLen = DecodeString(1, previousCode); // The length of pW + 1
}
character = decStack[phraseLen - 1]; // The last character in the stack, i.e. the 1st character of pW or cW
while (phraseLen > 0) {
phraseLen--;
fputc(decStack[phraseLen], outFilePtr); // Output the decoded string (in inverted order of decStack)
inFileSize--;
}
if (nextNodeIdx < DICT_CAPACITY) { /* Add the new phrase into dictionary */
NewDictEntry(previousCode, character); // Add "pW + 1st character of cW" or "pW + 1st character of pW" into dictionary
}
previousCode = currentCode; // Set pW = cW
}
}
main.cpp
#include <iostream>
#include "declarations.h"
#include "BitIO.h"
using namespace std;
int main(int argc, char* argv[]) {
FILE* fp;
BITFILE* bf;
if (argc < 4) {
fprintf(stdout, "Usage: \n%s <options> <inFile> <outFile>\n", argv[0]);
fprintf(stdout, "\t<options>: E for LZW encoding or D for LZW decoding.\n");
fprintf(stdout, "\t<inFile>: name of input file.\n");
fprintf(stdout, "\t<outFile>: name of output file.\n");
return -1;
}
if ('E' == argv[1][0]) { /* Do LZW encoding */
/* Open the files */
if (fopen_s(&fp, argv[2], "rb") != 0) {
cout << "Failed to open \"" << argv[2] << "\"." << endl;
exit(-1);
}
bf = OpenBitFileOutput(argv[3]);
if ((fp != NULL) && (bf != NULL)) {
LzwEncoding(fp, bf);
//LZWEncode(fp, bf);
fclose(fp);
CloseBitFileOutput(bf);
fprintf(stdout, "LZW encoding done.\n");
}
} else if ('D' == argv[1][0]) { /* Do LZW decoding */
/* Open the files */
bf = OpenBitFileInput(argv[2]);
if (fopen_s(&fp, argv[3], "wb") != 0) {
cout << "Failed to open \"" << argv[2] << "\"." << endl;
exit(-1);
}
if ((fp != NULL) && (bf != NULL)) {
LzwDecoding(bf, fp);
//LZWDecode(bf, fp);
fclose(fp);
CloseBitFileInput(bf);
fprintf(stdout, "LZW decoding done.\n");
}
} else {
fprintf(stderr, "Unsupported operation.\n");
}
}
原始檔案為string.txt,編碼後的檔案寫為了二進制檔案string.dat,解碼後的檔案為string_D.txt:
打開編解碼後的檔案: