天天看點

實驗三 LZW

LZW編解碼算法

文章目錄

  • ​​LZW編解碼算法​​
  • ​​編碼原理​​
  • ​​解碼原理​​
  • ​​代碼實作​​

LZW編碼(LZW Encoding)又稱“串表壓縮算法”,由J.Ziv和A.Lempel在1978年首次介紹,并由Terry A.Welch在1984年予以改進,最終該編碼方法由三人的名字命名。

該編碼方法屬于詞典壓縮編碼方法。詞典編碼是一種通用編碼方法,适用于無法觀察新源統計特性,或雖然可觀察但統計特性不固定的情形。

LZW編碼可應用于通用檔案壓縮(如WinZip)、動畫圖像壓縮(如GIF、TIFF)等領域。

編碼原理

LZW編碼的核心思想是從輸入資料中建立一個“短語詞典(Dictionary of Phrases)”,這種短語可以是任意字元的組合。編碼資料過程中當遇到已經在詞典中出現的“短語”時,編碼器就輸出該詞條的“索引号”,而不是短語本身,以實作壓縮。

編碼流程如下:

實驗三 LZW

例如:對“abbababac”進行LZW編碼。

初始化詞典:a = 97,b = 98,c = 99(ASCII碼)

步驟 P C PC在詞典中? 若不在,輸出P 新增詞條 說明
1 NULL a 初始化,不處理
2 a b 97(a) ab:256 ab不在詞典中,擴充,P = C
3 b b 98(b) bb:257 bb不在詞典中,擴充,P = C
4 b a 98(b) ba:258 ba不在詞典中,擴充,P = C
5 a b ab在詞典中,P = PC
6 ab a 256(ab) aba:259 aba不在詞典中,擴充,P = C
7 a b ab在詞典中,P = PC
8 ab a aba在詞典中,P = PC
9 aba c 259(aba) abac:260 abac不在詞典中,擴充,P = C
10 c NULL 99(c) 結束,輸出未編碼的字元

即編碼結果為:

“97 98 98 256 259 99”

解碼原理

首先我們要明确一點,編碼過程中建立的詞典,實際上并不與碼流一起傳送。這主要考慮到以下兩點:一是由于碼表可能占用的空間很大,不傳送碼表可以将壓縮比最大化;二是如果需要等收到編碼端的詞典後再進行解碼,便不可能實作編解碼兩端的同步操作。

那麼,我們就要在解碼端按照與編碼端相同的規則同步地建立詞典(Trie樹),這便是LZW解碼的核心思想。

解碼流程如下圖所示:

實驗三 LZW

例如将上述輸出的編碼碼流“97 98 98 256 259 99”進行解碼:

步驟 pW cW cW在詞典中? 輸出 新增詞條 說明
1 97 a 在詞典中,輸出cW;新增pW + cW首字元
2 97 98 b ab:256 在詞典中,輸出cW;新增pW + cW首字元
3 98 98 b bb:257 在詞典中,輸出cW;新增pW + cW首字元
4 98 256 ab ba:258 在詞典中,輸出cW;新增pW + cW首字元
5 256 259 aba aba:259 不在詞典中,輸出pW + pW首字元,并添加至詞典
6 259 99 c abac:260 在詞典中,輸出cW;新增pW + cW首字元

按上述流程解碼得“abbababac”,即成功複原原始資料。

代碼實作

​declarations.h​

#pragma once
#include "BitIO.h"
#define DICT_CAPACITY 65535// Capacity of the dictionary
/* Global variables */
struct {
    int suffix;
    int parent;
    int firstChild;
    int nextSibling;
} dictionary[DICT_CAPACITY + 1];
extern int nextNodeIdx;    // Index of next node (i.e. next dictionary entry)
extern int decStack[DICT_CAPACITY]; // Stack for decoding a phrase

/* Functions */
void LzwEncoding(FILE* inFilePtr, BITFILE* outBitFilePtr);
void LzwDecoding(BITFILE* inBitFilePtr, FILE* outFilePtr);      

​BitIO.h​

#pragma once
#include <iostream>

typedef struct {
  FILE* fp;
  unsigned char mask;
  int rack;
}BITFILE;

BITFILE* OpenBitFileInput(char* fileName);
BITFILE* OpenBitFileOutput(char* fileName);
void CloseBitFileInput(BITFILE* bf);
void CloseBitFileOutput(BITFILE* bf);
int BitInput(BITFILE* bf);
unsigned long BitsInput(BITFILE* bf, int count);
void BitOutput(BITFILE* bf, int bit);
void BitsOutput(BITFILE* bf, unsigned long code, int count);      

​BitIO.cpp​

/* Definitions for bitwise IO */

#include <iostream>
#include <stdlib.h>
#include "BitIO.h"

BITFILE* OpenBitFileInput(char* fileName) {
  //BITFILE* bf = (BITFILE*)malloc(sizeof(BITFILE));
  BITFILE* bf = new BITFILE;
  if (bf == NULL) {
    return NULL;
  }
  if (fileName == NULL) {
    bf->fp = stdin;
  } else {
    fopen_s(&bf->fp, fileName, "rb");
  }
  if (bf->fp == NULL) {
    return NULL;
  }
  bf->mask = 0x80;
  bf->rack = 0;
  return bf;
}

BITFILE* OpenBitFileOutput(char* fileName) {
  //BITFILE* bf = (BITFILE*)malloc(sizeof(BITFILE));
  BITFILE* bf = new BITFILE;
  if (bf == NULL) {
    return NULL;
  }
  if (fileName == NULL) {
    bf->fp = stdout;
  } else {
    fopen_s(&bf->fp, fileName, "wb");
  }
  if (bf->fp == NULL) {
    return NULL;
  }
  bf->mask = 0x80;
  bf->rack = 0;
  return bf;
}

void CloseBitFileInput(BITFILE* bf) {
  fclose(bf->fp);
  //free(bf);
  delete bf;
}

void CloseBitFileOutput(BITFILE* bf) {
  /* Output the remaining bits */
  if (bf->mask != 0x80) {
    fputc(bf->rack, bf->fp);
  }
  fclose(bf->fp);
  //free(bf);
  delete bf;
}

int BitInput(BITFILE* bf) {
  int value;

  if (bf->mask == 0x80) {
    bf->rack = fgetc(bf->fp);
    if (bf->rack == EOF) {
      fprintf(stderr, "Reached the end of file.\n");
      exit(-1);
    }
  }
  value = bf->mask & bf->rack;
  bf->mask >>= 1;
  if (0 == bf->mask) {
    bf->mask = 0x80;
  }
  return((value == 0) ? 0 : 1);
}

unsigned long BitsInput(BITFILE* bf, int count) {
  unsigned long mask;
  unsigned long value;
  mask = 1L << (count - 1);
  value = 0L;
  while (mask != 0) {
    if (BitInput(bf) == 1)
      value |= mask;
    mask >>= 1;
  }
  return value;
}

void BitOutput(BITFILE* bf, int bit) {
  if (bit != 0) {
    bf->rack |= bf->mask;
  }
  bf->mask >>= 1;
  if (bf->mask == 0) {  // 8 bits in rack
    fputc(bf->rack, bf->fp);
    bf->rack = 0;
    bf->mask = 0x80;
  }
}

void BitsOutput(BITFILE* bf, unsigned long code, int count) {
  unsigned long mask;

  mask = 1L << (count - 1);
  while (mask != 0) {
    BitOutput(bf, (int)((code & mask) == 0 ? 0 : 1));
    mask >>= 1;
  }
}      

​LzwED.cpp​

#include <iostream>
#include "declarations.h"
#include "BitIO.h"
using namespace std;

/* Global variables */
int nextNodeIdx;
int decStack[DICT_CAPACITY];

/* Macros */
#define Input(f) ((int)BitsInput(f, 16))
#define Output(f, x) BitsOutput(f, (unsigned long)(x), 16)

void InitialiseDict() {  // Dictionary initialisation (initialise root node 0-255)
  for (int i = 0; i < 256; i++) {
    dictionary[i].suffix = i;  // The suffix of each node is the corresponding ASCII code
    dictionary[i].parent = -1;  // Temporarily doesn't have a parent node (i.e. prefix)
    dictionary[i].firstChild = -1;  // Temporarily doesn't have any child nodes
    dictionary[i].nextSibling = i + 1;  // The index of the next sibling root node is the next ASCII code
  }

  dictionary[255].nextSibling = -1;  // No next sibling for the last root node
  nextNodeIdx = 256;  // The index of next dictionary entry
}

int InDict(int P, int C) {
  if (P == -1) {
    /* In this case, the current character is the start of the file,
    and it's evidently in the dictionary,
    thus return the corresponding ASCII code (let P = this character). */
    return C;
  }

  /* Traverse all child node(s) of node P from left to right (i.e. all sibling nodes of the first child node) */
  int sibling = dictionary[P].firstChild;  // Start from the first child of P
  while (sibling > -1) {  // sibling == -1 indicates the end of sibling traversal
    /* If a C-suffixed sibling is found, then return the code of PC (i.e. the index of this sibling) */
    if (C == dictionary[sibling].suffix) {
      return sibling;
    }
    /* If the suffixes don't match, then look for the next */
    sibling = dictionary[sibling].nextSibling;
  }

  /* The suffix of all siblings don't match PC, thus PC isn't in the dictionary */
  return -1;
}

void NewDictEntry(int P, int C) {
  if (P < 0) {
    return;
  }

  dictionary[nextNodeIdx].suffix = C;
  dictionary[nextNodeIdx].parent = P;
  dictionary[nextNodeIdx].nextSibling = -1;  // Indicates that the node is the last sibling
  dictionary[nextNodeIdx].firstChild = -1;  // Temporarily this node doesn't have a child
  int pFirstChild = dictionary[P].firstChild;  // The first child of P
  int pChild;

  /* Set up the new sibling-relation */
  if (pFirstChild > -1) {  /* Parent of the new node originally have a child node */
    pChild = pFirstChild;  // Start from the first child of P

    /* Look for the youngest child of P (i.e. the last sibling) */
    while (dictionary[pChild].nextSibling > -1) {
      pChild = dictionary[pChild].nextSibling;
    }

    dictionary[pChild].nextSibling = nextNodeIdx;  // Set the new node as the next sibling of the current last sibling
  } else {  /* Parent of the new node originally doesn't have a child */
    dictionary[P].firstChild = nextNodeIdx;  // Set the new node as PC (i.e. the first child of P)
  }

  nextNodeIdx++;  //Index of the next entry + 1
}

void LzwEncoding(FILE* inFilePtr, BITFILE* outBitFilePtr) {
  int previousStr;  // P
  int currentChar;  // C
  int PC;  // P & C combined as 1 character

  /* Compute the size of file */
  fseek(inFilePtr, 0, SEEK_END);
  int inFileSize = ftell(inFilePtr);
  fseek(inFilePtr, 0, SEEK_SET);
  BitsOutput(outBitFilePtr, inFileSize, 4 * 8);

  /* Initialise the dictionary and P */
  InitialiseDict();
  previousStr = -1;  // Initialise P

  //fprintf(outFilePtr, "LZW-encoded string: ");
  while ( (currentChar = fgetc(inFilePtr)) != EOF ) {
    /* Check if PC is in the dictionary */
    PC = InDict(previousStr, currentChar);
    if (PC >= 0) {  /* PC is in the dictionary */
      previousStr = PC;  // Set P = PC
    } else {  /* PC isn't in the dictionary */
      Output(outBitFilePtr, previousStr);  // Output P
      if (nextNodeIdx < DICT_CAPACITY) {  /* Enough space to add PC into the dictionary */
        NewDictEntry(previousStr, currentChar);
      }
      previousStr = currentChar;  // Set P = C
    }
  }

  Output(outBitFilePtr, previousStr);  // Output the last unencoded character(s)
}

int DecodeString(int start, int code) {
  int count = start;
  while (code >= 0) {
    /* Look for the root node */
    decStack[count] = dictionary[code].suffix;  // Store the original string in inverted order
    code = dictionary[code].parent;  // Set the parent of the current node as the next node
    count++;  // Points to the next node
  }
  return count;  // The distance between the current node and the root
}

void LzwDecoding(BITFILE* inBitFilePtr, FILE* outFilePtr) {
  int character;
  int previousCode;  // pW
  int currentCode;  // cW
  int phraseLen;  // Length of phrase

  unsigned long inFileSize = BitsInput(inBitFilePtr, 4 * 8);
  if (inFileSize == -1) {
    inFileSize = 0;
  }

  /* Initialise dictionary and pW*/
  InitialiseDict();
  previousCode = -1;

  while (inFileSize > 0) {
    currentCode = Input(inBitFilePtr);
    if (currentCode < nextNodeIdx) {  /* cW is in dictionary */
      phraseLen = DecodeString(0, currentCode);  // The length of cW
    } else {  /* When cW ¡Ý next node index, which means cW > current node index, cW isn't in dictionary */
      decStack[0] = character;  // The last character in stack of the last loop, i.e. 1st character of pW
      phraseLen = DecodeString(1, previousCode);  // The length of pW + 1
    }
    character = decStack[phraseLen - 1];  // The last character in the stack, i.e. the 1st character of pW or cW

    while (phraseLen > 0) {
      phraseLen--;
      fputc(decStack[phraseLen], outFilePtr);  // Output the decoded string (in inverted order of decStack)
      inFileSize--;
    }
    if (nextNodeIdx < DICT_CAPACITY) {  /* Add the new phrase into dictionary */
      NewDictEntry(previousCode, character);  // Add "pW + 1st character of cW" or "pW + 1st character of pW" into dictionary
    }
    previousCode = currentCode;  // Set pW = cW
  }
}      

​main.cpp​

#include <iostream>
#include "declarations.h"
#include "BitIO.h"
using namespace std;

int main(int argc, char* argv[]) {
  FILE* fp;
  BITFILE* bf;

  if (argc < 4) {
    fprintf(stdout, "Usage: \n%s <options> <inFile> <outFile>\n", argv[0]);
    fprintf(stdout, "\t<options>: E for LZW encoding or D for LZW decoding.\n");
    fprintf(stdout, "\t<inFile>: name of input file.\n");
    fprintf(stdout, "\t<outFile>: name of output file.\n");
    return -1;
  }

  if ('E' == argv[1][0]) {  /* Do LZW encoding */
    /* Open the files */
    if (fopen_s(&fp, argv[2], "rb") != 0) {
      cout << "Failed to open \"" << argv[2] << "\"." << endl;
      exit(-1);
    }
    bf = OpenBitFileOutput(argv[3]);

    if ((fp != NULL) && (bf != NULL)) {
      LzwEncoding(fp, bf);
      //LZWEncode(fp, bf);
      fclose(fp);
      CloseBitFileOutput(bf);
      fprintf(stdout, "LZW encoding done.\n");
    }
  } else if ('D' == argv[1][0]) {  /* Do LZW decoding */
    /* Open the files */
    bf = OpenBitFileInput(argv[2]);
    if (fopen_s(&fp, argv[3], "wb") != 0) {
      cout << "Failed to open \"" << argv[2] << "\"." << endl;
      exit(-1);
    }

    if ((fp != NULL) && (bf != NULL)) {
      LzwDecoding(bf, fp);
      //LZWDecode(bf, fp);
      fclose(fp);
      CloseBitFileInput(bf);
      fprintf(stdout, "LZW decoding done.\n");
    }
  } else {
    fprintf(stderr, "Unsupported operation.\n");
  }
}      

原始檔案為string.txt,編碼後的檔案寫為了二進制檔案string.dat,解碼後的檔案為string_D.txt:

實驗三 LZW

打開編解碼後的檔案:

實驗三 LZW
實驗三 LZW