天天看點

變種LZ77資料無損壓縮算法

  這是我在學習字典壓縮算法時自己設計的,采用64K固定視窗對資料進行分塊壓縮,編碼時使用了變長偏移長度,變長比對長度,哈希字典等對算法進行了優化,壓縮率和壓縮速度均比較理想,很适合用于網絡傳輸中的資料實時壓縮。

算法提供的兩個外部調用函數:

// ---------------------------------------------------------------------------

// 使用LZ77壓縮算法對資料進行壓縮

// dst:壓縮輸出緩沖區(輸出緩沖區長度至少應保留原始資料長度+128K)

// src:原始資料

// len:原始資料長度

// level:壓縮等級(可選0 - 4,一般使用2)

// return 傳回壓縮後資料長度

// ---------------------------------------------------------------------------

int Lz77Compress(void *dst, void *src, int len, int level);

// ---------------------------------------------------------------------------

// 解壓LZ7算法壓縮過的資料

// dst:解壓輸出緩沖區

// src:壓縮資料

// len:壓縮資料長度

// return 傳回解壓後資料長度

// ---------------------------------------------------------------------------

int Lz77Decompress(void *dst, void *src, int len);

lz77.c

#ifdef WIN32

#include <windows.h>

#define malloc(s) HeapAlloc(GetProcessHeap(), 0, s)

#define free(p) HeapFree(GetProcessHeap(), 0, p)

#else

#include <stdlib.h>

#include <string.h>

#endif

#define MAXBITS   15

#define MINOFFSET  0x01

#define MINMATCH  0x03

#define MAXMATCH  ((1 << 24) + MINMATCH)

#define MAXWND   (1 << MAXBITS)

#define NIL    0xffff

#define M    3

#define MAX(a, b)  ((a) > (b) ? (a) : (b))

#define MIN(a, b)  ((a) < (b) ? (a) : (b))

typedef unsigned char UCHAR;

typedef unsigned short USHORT;

typedef unsigned long ULONG;

typedef struct _LZ77_MATCHINFO

{

 ULONG len;

 ULONG off;

} LZ77_MATCHINFO;

typedef struct _LZ77_RUNSTATE

{

 ULONG wsize;

 UCHAR *pwnd;

 ULONG confine;

 USHORT *head;

 USHORT *prev;

 ULONG nice;

} LZ77_RUNSTATE;

typedef struct _LZ77_IOSTATE

{

 UCHAR *pbuf;

 ULONG bytenum;

 UCHAR bitnum;

 UCHAR codelen;

} LZ77_INPUTS, LZ77_OUTPUTS;

// ---------------------------------------------------------------------------

// 計算用二進制表示指定數值至少需要多少位

// ---------------------------------------------------------------------------

static UCHAR log2(ULONG n)

{

 UCHAR c, i;

 if (n > 0xffff)

 {

  for (i = 16; n > ((ULONG)-1 >> (sizeof(ULONG) * 8 - i)); i++);

  return i;

 }

 if (n & 0xff00)

 {

  if (n & 0xf000)

  {

   if (n & 0xc000)

   {

    if (n & 0x8000)

    {

     c = 16;

    }

    else

    {

     c = 15;

    }

   }

   else

   {

    if (n & 0x2000)

    {

     c = 14;

    }

    else

    {

     c = 13;

    }

   }

  }

  else

  {

   if (n & 0x0c00)

   {

    if (n & 0x0800)

    {

     c = 12;

    }

    else

    {

     c = 11;

    }

   }

   else

   {

    if (n & 0x0200)

    {

     c = 10;

    }

    else

    {

     c = 9;

    }

   }

  }

 }

 else

 {

  if (n & 0x00f0)

  {

   if (n & 0x00c0)

   {

    if (n & 0x0080)

    {

     c = 8;

    }

    else

    {

     c = 7;

    }

   }

   else

   {

    if (n & 0x0020)

    {

     c = 6;

    }

    else

    {

     c = 5;

    }

   }

  }

  else

  {

   if (n & 0x000c)

   {

    if (n & 0x0008)

    {

     c = 4;

    }

    else

    {

     c = 3;

    }

   }

   else

   {

    if (n & 0x0002)

    {

     c = 2;

    }

    else

    {

     c = 1;

    }

   }

  }

 }

 return c;

}

// ---------------------------------------------------------------------------

// 輸出指定長度的二進制位,最大長度為sizeof(ULONG)

// ---------------------------------------------------------------------------

static void PutBits(LZ77_OUTPUTS *out, ULONG v, int num)

{

 UCHAR *s = out->pbuf + out->bytenum;

 ULONG i = 0;

 ULONG temp = v & ~(-1 << num);

 do

 {

  s[i] &= ~(-1 << out->bitnum);

  s[i] |= (UCHAR)(temp << out->bitnum);

  if (8 - out->bitnum >= num)

   break;

  s[i + 1] = (UCHAR)(temp >> (8 - out->bitnum));

  temp >>= 8;

 } while ((++i << 3) < (ULONG)num);

 out->bitnum += (UCHAR)num;

 out->bytenum += out->bitnum >> 3;

 out->bitnum &= 7;

}

// ---------------------------------------------------------------------------

// 擷取指定長度的二進制位,最大長度為sizeof(ULONG)

// ---------------------------------------------------------------------------

static ULONG GetBits(LZ77_INPUTS *in, int num)

{

 UCHAR *s = in->pbuf + in->bytenum;

 ULONG i = 0, v = 0;

 do

 {

  v |= (s[i] >> in->bitnum) << (i << 3);

  if (8 - in->bitnum >= num)

   break;

  v |= (s[i + 1] << (8 - in->bitnum)) << (i << 3);

 } while ((++i << 3) < (ULONG)num);

 in->bitnum += (UCHAR)num;

 in->bytenum += in->bitnum >> 3;

 in->bitnum &= 7;

 return v & ~(-1 << num);

}

// ---------------------------------------------------------------------------

// 将指定位置開始的位元組串添加到字典中

// ---------------------------------------------------------------------------

static void insert(LZ77_RUNSTATE *rs, ULONG at, ULONG len)

{

 ULONG ins_h, ins_t;

 if (len == 1)

 {

  ins_h = *(USHORT *)(rs->pwnd + at);

  rs->prev[at] = rs->head[ins_h];

  rs->head[ins_h] = (USHORT)at;

  return;

 }

 if ((at + len) < MAXWND)

 {

  ins_t = -1;

  len += at--;

  while (++at != len)

  {

   ins_h = *(USHORT *)(rs->pwnd + at);

   if ((ins_t - rs->head[ins_h]) <= 2)

    continue;

   ins_t = at;

   rs->prev[at] = rs->head[ins_h];

   rs->head[ins_h] = (USHORT)at;

  }

 }

}

// ---------------------------------------------------------------------------

// 标志位定義:

// 長度:1,值:0,表示後面有一位元組未壓縮資料

// 長度:2,值:10,表示後面有一個比對(變長偏移+變長長度)

// 長度:3,值:110,表示後面有一個比對(7位偏移+1位長度,偏移為128時表示壓縮流結束)

// 長度:3,值:111,表示後面有多個位元組未壓縮資料

// ---------------------------------------------------------------------------

#define CHARBITS1 4

#define CHARBITS2 7

#define CHARBITS3 16

#define CHARNUMS0 7

#define CHARNUMS1 ((1 << CHARBITS1) - 1 + (CHARNUMS0 + 1) - 2)

#define CHARNUMS2 ((1 << CHARBITS2) - 1 + (CHARNUMS1 + 1))

#define CHARNUMS3 ((1 << CHARBITS3) - 1 + (CHARNUMS2 + 1))

// ---------------------------------------------------------------------------

// 以壓縮格式輸出指定長度的位元組串并針對格式長度進行優化

// ---------------------------------------------------------------------------

static void outcodec(LZ77_OUTPUTS *out, UCHAR *buffer, ULONG length)

{

 ULONG i, temp;

 if (length <= CHARNUMS0)

 {

  for (i = 0; i < length; i++)

  {

   // 逐位元組輸出,額外輸出位(length)

   temp = 0x00 | (buffer[i] << 1);

   PutBits(out, temp, 1 + 8);   // 标志位(1),資料位(8)

  }

 }

 else

 {

  if (length <= CHARNUMS1)

  {

   // 輸出(0-13)表示有(0-13)+8個連續位元組未壓縮,額外輸出位(7)

   temp = 0x07 | ((length - CHARNUMS0 - 1) << 3);

   PutBits(out, temp, 3 + CHARBITS1); // 标志位(3),資料位(4)

  }

  else if (length <= CHARNUMS1 * 2)

  {

   // 優化輸出,最大額外輸出位(14)

   outcodec(out, buffer, CHARNUMS1);

   outcodec(out, buffer + CHARNUMS1, length - CHARNUMS1);

   return;

  }

  else if (length <= CHARNUMS2)

  {

   // 輸出(14)表示未壓縮位元組數由後面7位決定,額外輸出位(14)

   temp = 0x07 | (14 << 3);

   PutBits(out, temp, 3 + CHARBITS1); // 标志位(3),資料位(4)

   temp = length - CHARNUMS1 - 1;

   PutBits(out, temp, CHARBITS2);  // 資料位(7)

  }

  else if (length <= CHARNUMS2 + CHARNUMS1)

  {

   // 優化輸出,最大額外輸出位(21)

   outcodec(out, buffer, CHARNUMS2);

   outcodec(out, buffer + CHARNUMS2, length - CHARNUMS2);

   return;

  }

  else

  {

   // 輸出(15)表示未壓縮位元組數由後面兩位元組決定,額外輸出位(23)

   temp = 0x07 | (15 << 3);

   PutBits(out, temp, 3 + CHARBITS1); // 标志位(3),資料位(4)

   // 輸出(0-65535)+18個連續位元組未壓縮

   temp = length - CHARNUMS2 - 1;

   PutBits(out, temp, CHARBITS3);  // 資料位(16)

  }

  {

   UCHAR *s = out->pbuf + out->bytenum;

   UCHAR x = out->bitnum;

   temp = buffer[0];

   PutBits(out, temp, 8 - x);

   // 拷貝連續的未壓縮位元組

   for (i = 1; i < length; i++)

   {

    s[i] = buffer[i];

   }

   temp >>= 8 - x;

   out->bytenum += length - 1;

   PutBits(out, temp, x);

  }

 }

}

// ---------------------------------------------------------------------------

// 以壓縮格式輸出位元組串比對資訊并針對資訊長度進行優化

// ---------------------------------------------------------------------------

static void outcodex(LZ77_OUTPUTS *out, ULONG offset, ULONG length)

{

 UCHAR i = 0;

 ULONG temp, m, n;

 switch (length)

 {

//  case 1:

//   temp = 0x03 | ((offset - MINOFFSET) << 3);

//   PutBits(out, temp, 3 + 4);    // 标志位(3),資料位(4)

//   return;

 case 3:

  if (offset > 127)

   break;

 case 2:

  temp = 0x03 | ((offset - MINOFFSET) << 3); // 短比對優化

  temp |= (length - 2) << (3 + 7);

  PutBits(out, temp, 3 + 7 + 1);   // 标志位(3),資料位(1+7)

  return;

 }

 // 寫入變長比對偏移

 temp = 0x01 | ((offset - MINOFFSET) << 2);

 PutBits(out, temp, 2 + out->codelen);  // 标志位(2),資料位(log2(資料))

 length -= MINMATCH;

 m = 1 << (M - 1);

 // 計算比對長度最少占用多少位

 do

 {

  n = ~(-1 << i++) << M;

  m <<= 1;

 } while ((m + n) <= length);

 // 寫入比對長度位數

 temp = ~(-1 << (i - 1));

 PutBits(out, temp, i);

 // 寫入變長比對長度

 temp = length - n;

 PutBits(out, temp, i + 3 - 1);

}

// ---------------------------------------------------------------------------

// 從目前字典中查找一個比對位元組串,成功傳回1同時設定比對位元組串所在位置和長度

// ---------------------------------------------------------------------------

static int match(LZ77_RUNSTATE *rs, ULONG strat, LZ77_MATCHINFO *mi)

{

 UCHAR *src, *s, *d, *c, *t;

 USHORT index, *prev;

 ULONG i, m = 0, n, nice, flag = 0;

 src = rs->pwnd;

 index = rs->head[*(USHORT *)(src + strat)];  // 從字典中取出比對資訊

 if (NIL != index)

 {

  c = src + MIN(rs->confine, MAXMATCH);  // 限制最大比對長度

  t = src + strat;

  m = MINMATCH - 1;

  prev = rs->prev;

  nice = rs->nice;

  do

  {

   // 開始尋找相同的位元組串

   s = t;

   d = src + index;

   // 優化速度,一次循環比較8個位元組

   while (s < (c - 8)

    && *(USHORT *)(s += 2) == *(USHORT *)(d += 2)

    && *(USHORT *)(s += 2) == *(USHORT *)(d += 2)

    && *(USHORT *)(s += 2) == *(USHORT *)(d += 2)

    && *(USHORT *)(s += 2) == *(USHORT *)(d += 2));

   while (s < c && *s == *d)

   {

    s++;

    d++;

   }

   if (s >= c)   // 達到限制長度了?

   {

    m = c - t;  // 如果是便是能找到的最好比對了

    n = index;

    break;

   }

   i = s - t;

   if (m < i)   // 是否達到最小長度要求?

   {

    m = i;

    n = index;  // 記錄下找到的資訊

    if (m > nice)

     flag = 1; // 如果達到預設的最優比對長度則設定該标志,然後再次查找最優比對

   }

   else if (flag)  // 如果已達到過預設的最優比對長度則可以退出查找了

    break;

   index = prev[index];  // 取出字典中的下一次比對記錄

  } while (NIL != index);

 }

 if (MINMATCH <= m)   // 是否找到合适的比對資訊?

 {

  mi->len = m;

  mi->off = strat - n;

  return 1;

 }

 else

 {

  if (strat + 2 <= rs->confine)  // 檢查所剩位元組數是否足夠2個位元組

  {

   index = rs->head[*(USHORT *)(src + strat)];  // 找不到合适的比對時嘗試查找2個位元組的短比對

   if (strat - index <= 127)  // 短比對要求所在位置與目前位置之間不能超過127個位元組

   {

    mi->len = 2;

    mi->off = strat - index;

    return 1;

   }

  }

  // 從前面16位元組中查找1位元組比對

 }

 return 0;

}

// ---------------------------------------------------------------------------

// 壓縮算法主循環,将輸入資料壓縮成一個獨立的塊

// ---------------------------------------------------------------------------

static ULONG deflate(LZ77_RUNSTATE rs, UCHAR *dst, ULONG *inbytes)

{

 LZ77_OUTPUTS out, prev_out;

 ULONG strstart = 0, prev_start = 0, count = 0, prev_count = 0;

 LZ77_MATCHINFO mi;

 out.pbuf = dst;

 out.bytenum = 0;

 out.bitnum = 0;

 out.codelen = 1;

 prev_out = out;

 memset(rs.head, NIL, sizeof(USHORT) * 65536);

 do

 {

  if (match(&rs, strstart, &mi))

  {

   if (count > 0)

   {

    outcodec(&out, rs.pwnd + strstart - count, count);  // 輸出無比對位元組

    count = 0;

   }

   if ((ULONG)(1 << out.codelen) <= (ULONG)(strstart - MINOFFSET))

    out.codelen = log2(strstart - MINOFFSET);

   insert(&rs, strstart, mi.len);      // 更新字典記錄

   outcodex(&out, mi.off, mi.len);      // 輸出壓縮代碼

   strstart += mi.len;

  }

  else

  {

   insert(&rs, strstart, 1);       // 更新字典記錄

   count++;           // 增加無比對位元組數量

   strstart += 1;

  }

  if (strstart - prev_start >= 0x1000)     // 跟蹤壓縮率變化

  {

   // 壓縮後的資料是否比原始資料大?

   if (strstart - prev_start + 4 < out.bytenum - prev_out.bytenum)

   {

    // 不壓縮直接輸出原始資料

    out = prev_out;

    outcodec(&out, rs.pwnd + prev_start - prev_count, strstart - prev_start + prev_count);

    count = 0;

   }

   prev_out = out;

   prev_start = strstart;

   prev_count = count;

  }

 } while (strstart < rs.wsize);

 if (count > 0)

 {

  outcodec(&out, rs.pwnd + strstart - count, count);  // 輸出無比對位元組

 }

 // 壓縮後的資料是否比原始資料大?

 if (strstart - prev_start + 4 < out.bytenum - prev_out.bytenum)

 {

  // 不壓縮直接輸出原始資料

  out = prev_out;

  outcodec(&out, rs.pwnd + prev_start - prev_count, strstart - prev_start + prev_count);

 }

 // 壓縮後的全部資料是否比原始資料大?

 if (strstart + 4 < out.bytenum)

 {

  // 直接輸出全部資料

  out.pbuf = dst;

  out.bytenum = 0;

  out.bitnum = 0;

  out.codelen = 1;

  outcodec(&out, rs.pwnd, strstart);

 }

 outcodex(&out, 128, 2);  // 輸出壓縮流結束标記

 if (out.bitnum)

  out.bytenum++;

 *inbytes = strstart;

 return out.bytenum;

}

// ---------------------------------------------------------------------------

// 解壓算法,每次處理一個壓縮塊

// ---------------------------------------------------------------------------

static ULONG inflate(UCHAR *src, UCHAR *dst, ULONG len, ULONG *inbytes)

{

 ULONG offset, length;

 UCHAR i, t;

 UCHAR *out, *s;

 LZ77_INPUTS in;

 in.pbuf = src;

 in.bytenum = 0;

 in.bitnum = 0;

 in.codelen = 1;

 out = dst;

 while (in.bytenum < len)

 {

  if (!GetBits(&in, 1))   // 0表示有一個未壓縮的位元組

  {

   *out++ = (UCHAR)GetBits(&in, 8); // 輸出一個未壓縮位元組

  }

  else

  {

   if (!GetBits(&in, 1))  // 10表示有一個長比對

   {

    if ((ULONG)(1 << in.codelen) <= (ULONG)(out - dst - MINOFFSET))

     in.codelen = log2(out - dst - MINOFFSET);

    offset = GetBits(&in, in.codelen) + MINOFFSET;

    for (i = 0; GetBits(&in, 1); i++); // 計算比對長度位數

    length = GetBits(&in, i + M);

    length += (~(-1 << i) << M) + MINMATCH;  // 計算比對長度

    do

    {

     *out++ = *(out - offset);

    } while (--length);

   }

   else

   {

    if (!GetBits(&in, 1)) // 110表示有一個短比對

    {

     offset = GetBits(&in, 7) + MINOFFSET;

     length = GetBits(&in, 1) + 2;

     if (offset == 128) // offset值128為壓縮流結束标志

      break;

     do

     {

      *out++ = *(out - offset); // 解壓短比對

     } while (--length);

    }

    else     // 111表示有一個未壓縮的位元組串

    {

     length = GetBits(&in, CHARBITS1);

     switch (length)

     {

     case 14:

      length = GetBits(&in, CHARBITS2); // 長度為14時表示實際長度用後面的7位記錄

      length += CHARNUMS1 + 1;

      break;

     case 15:

      length = GetBits(&in, CHARBITS3); // 長度為15時表示實際長度用後面的16位記錄

      length += CHARNUMS2 + 1;

      break;

     default:

      length += CHARNUMS0 + 1;

      break;

     }

     s = in.pbuf + in.bytenum;

     offset = 1;

     i = in.bitnum;

     do

     {

      out[offset] = s[offset]; // 還原位元組串

     } while (++offset < length);

     t = (UCHAR)GetBits(&in, 8 - i);

     in.bytenum += length - 1;

     t |= (UCHAR)(GetBits(&in, i) << (8 - i));

     *out = t;

     out += length;

    }

   }

  }

 }

 *inbytes = in.bytenum;

 *inbytes += in.bitnum == 0 ? 0 : 1;

 return out - dst;

}

// ---------------------------------------------------------------------------

// LZ77壓縮算法

// dst:壓縮輸出緩沖區

// src:原始資料

// len:原始資料長度

// level:壓縮等級(0 - 4)

// return 壓縮後資料大小

// ---------------------------------------------------------------------------

int Lz77Compress(void *dst, void *src, int len, int level)

{

 LZ77_RUNSTATE rs;

 int m, n, count = 0;

 if (len <= 0)

  return 0;

 if (!src || !dst)

  return -1;

 // 設定壓縮等級,等級越高引擎會花越多時間去查找一個更長的比對

 switch (level)

 {

 case 0:

  rs.nice = 3;

  break;

 case 1:

  rs.nice = 30;

  break;

 case 2:

  rs.nice = 70;

  break;

 case 3:

  rs.nice = 150;

  break;

 case 4:

  rs.nice = -1;

  break;

 }

 // 為壓縮字典配置設定記憶體空間

 rs.prev = (USHORT *)malloc(sizeof(USHORT) * 65536);

 rs.head = (USHORT *)malloc(sizeof(USHORT) * 65536);

 if (!rs.prev || !rs.head)

 {

  free(rs.head);

  free(rs.prev);

  return -1;

 }

 do

 {

  rs.wsize = MIN(len, MAXWND);

  rs.pwnd = src;

  // rs.confine = len;

  rs.confine = MIN(len, MAXWND);

  n = deflate(rs, dst, &m);

  len -= m;

  (UCHAR *)src += m;

  (UCHAR *)dst += n;

  count += n;

 } while (len > 0);

 free(rs.head);

 free(rs.prev);

 return count;

}

// ---------------------------------------------------------------------------

// LZ77解壓算法

// dst:解壓輸出緩沖區

// src:壓縮資料

// len:壓縮資料長度

// return 解壓後資料大小

// ---------------------------------------------------------------------------

int Lz77Decompress(void *dst, void *src, int len)

{

 int c = 0, i, o, a = 0;

 if (len <= 0)

  return 0;

 if (!src || !dst)

  return -1;

 do

 {

  o = inflate(src, dst, len, &i);

  (UCHAR *)src += i;

  (UCHAR *)dst += o;

  len -= i;

  c += o;

 } while (len > 0);

 return c;

}

測試代碼:

main.c

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <windows.h>

//---------------------------------------------------------------------------

extern int Lz77Compress(void *dst, void *src, int len, int level);

extern int Lz77Decompress(void *dst, void *src, int len);

//---------------------------------------------------------------------------

int main(int argc, char *argv[])

{

 FILE *input, *output;

 BYTE *inbuf, *outbuf;

 DWORD insize, outsize, len, a = 0;

 SYSTEMTIME t1, t2;

 DWORD milliS;

 if (argc < 4)

  return 0;

 input = fopen(argv[2], "rb");

 output = fopen(argv[3], "wb");

 if (!input || !output)

  return 0;

 fseek(input, 0, SEEK_END);

 insize = ftell(input);

 fseek(input, 0, SEEK_SET);

 inbuf = (BYTE *)malloc(insize);

 GetSystemTime(&t1);

 if (*argv[1] == 'c')

 {

  outbuf = (BYTE *)malloc(insize * 2);

  fread(inbuf, insize, 1, input);

  fwrite(&insize, sizeof(DWORD), 1, output);

  len = Lz77Compress(outbuf, inbuf, insize, 2);

 }

 else if (*argv[1] == 'd')

 {

  fread(&outsize, sizeof(DWORD), 1, input);

  insize -= sizeof(DWORD);

  outbuf = (BYTE *)malloc(outsize * 2);

  //fseek(input, sizeof(DWORD), SEEK_SET);

  fread(inbuf, insize, 1, input);

  Lz77Decompress(outbuf, inbuf, insize);

  len = outsize;

 }

 else

  return 0;

 GetSystemTime(&t2);

 milliS = ((t2.wHour - t1.wHour)*3600 + (t2.wMinute-t1.wMinute)*60 + (t2.wSecond-t1.wSecond)) * 1000 + (t2.wMilliseconds-t1.wMilliseconds);

 printf("Totally %ld milliseconds elapsed!/n/n", milliS);

 fwrite(outbuf, len, 1, output);

 fclose(output);

 fclose(input);

 free(inbuf);

 free(outbuf);

 return 0;

}

//---------------------------------------------------------------------------

繼續閱讀