哈希表--開散列【資料結構】

哈希表：

散清單（Hash table，也叫哈希表），是根據關鍵碼值(Key value)而直接進行通路的資料結構。也就是說，它通過把關鍵碼值映射到表中一個位置來通路記錄，以加快查找的速度。這個映射函數叫做散列函數，存放記錄的數組叫做散清單。———— 百度百科

哈希沖突：

對于兩個資料元素的關鍵字Ki和Kj（i != j），有Ki ！= Kj，但有： HashFun(Ki) == HashFun(Kj)

即不同關鍵字通過相同哈希哈數計算出相同的哈希位址，該種現象稱為哈希沖突或哈希碰撞。把具有不同關鍵碼而具有相同哈希位址的資料元素稱為“同義詞”。

引起哈希沖突的一個原因可能是：哈希函數設計不夠合理。

哈希函數設計原則：

哈希函數的定義域必須包括需要存儲的全部關鍵碼，而如果散清單允許有m個位址時，其值域必須在0到m-1之間

哈希函數計算出來的位址能均勻分布在整個空間中

哈希函數應該比較簡單

哈希函數：

直接定址法：

取關鍵字的某個線性函數為散列位址：Hash（Key）= A*Key + B

優點：簡單、均勻

缺點：需要事先知道關鍵字的分布情況

适合查找比較小且連續的情況

除留餘數法：

設散清單中允許的位址數為m，取一個不大于m，但最接近或者等于m的質數p作為除數，按照哈希函數：Hash(key) = key% p(p<=m),将關鍵碼轉換成哈希位址

平方取中法：

假設關鍵字為1234，對它平方就是1522756，抽取中間的3位227作為哈希位址；再比如關鍵字為4321，對它平方就是18671041，抽取中間的3位671(或710)作為哈希位址平方取中法比較适合：不知道關鍵字的分布，而位數又不是很大的情況

折疊法：

折疊法是将關鍵字從左到右分割成位數相等的幾部分(最後一部分位數可以短些)，然後将這幾部分疊加求和，并按散清單表長，取後幾位作為散列位址折疊法适合事先不需要知道關鍵字的分布，适合關鍵字位數比較多的情況

随機數法：

選擇一個随機函數，取關鍵字的随機函數值為它的哈希位址，即H(key) = random(key),其中random為随機數函數通常應用于關鍵字長度不等時采用此法

數學分析法：

設有n個d位數，每一位可能有r種不同的符号，這r種不同的符号在各位上出現的頻率不一定相同，可能在某些位上分布比較均勻，每種符号出現的機會均等，在某些位上分布不均勻隻有某幾種符号經常出現。可根據散清單的大小，選擇其中各種符号分布均勻的若幹位作為散列位址。

解決哈希沖突的辦法：

1.閉散列（開放定址法）

線性探測：隻要hash表沒存滿，一周之内一定可以找到空餘位置---缺點--->引起資料堆積

擴容----->

負載因子：A = 有效元素個數/哈希表的容量

當A > 0.7時則哈希沖突的可能性大大增加，需對哈希表進行擴容

二次探測：H(i+1)-H(i) addr = addr+ 2*i + 1;-----缺點-------->空間使用率太低

2.開散列（開鍊法）-----------連結清單

首先對關鍵碼集合用散列函數計算散列位址，具有相同位址的關鍵碼歸于同一子集合，每一個子集合稱為一個桶，各個桶中的元素通過一個單連結清單連結起來，各連結清單的頭結點存儲在哈希表中。

閉散列：

1.哈希表的構成：

typedef enum STATUS
{
	EMPTY,		//空
	EXIST,		//有元素
	DELETE		//該元素已經删除
}STATUS;

typedef struct Elem{
	STATUS _status;		//狀态
	HtDataType _data;	//值域
}Elem;

typedef struct HashTable
{
	Elem _array[MAX_SIZE];
	int _size;			//有效元素個數
}HT;

2.基本操作：

插入 ----------此處借順序表實作

int InsertHashTable(HT* ht, HtDataType data)
{
	int Addr = 0;
	int i = 0;
	if (NULL == ht)
	{
		return 0;
	}
	if (ht->_size == MAX_SIZE)			//表滿?    ht->size < max_size （有delete 和exist）
	{
		return 0;
	}
	//找到要插入元素的位置
	Addr = HashFunc(data);

	while (EMPTY != ht->_array[Addr]._status)//檢視表中元素狀态
	{
		if (ht->_array[Addr]._status == EXIST &&		//delete處不插入元素，保證不插入相同元素
			data == ht->_array[Addr]._data) //元素在表中已經存在,且狀态為存在
			return 0;

#ifdef  DETECTIVE_LINE
		Addr = DetectiveLine(Addr);
#else
		++i;
		Addr = DetectiveLine_T(Addr,i);
#endif
	}
	
	//隻有目前位置為空時插入
	ht->_array[Addr]._data = data;
	ht->_array[Addr]._status = EXIST;
	ht->_size++;
	return 1;
}

删除：

int DeleteElemHashTable(HT* ht, HtDataType data)
{
	int addr = 0;
	int Cur = 0;
	if (NULL == ht)
	{
		return 0;
	}


	if (EmptyHashTable(ht))
	{
		return 0;
	}
	//找到要删除元素的位置
	addr = HashFunc(data);
	Cur = FindHashTable(ht, data);


	//删除
	if (-1 != Cur)
	{
		ht->_array[Cur]._status = DELETE;
		ht->_size--;    
		return 1;
	}
	else
	{
		return 0;
	}
}

查找：

int FindHashTable(HT* ht, HtDataType data)//傳回找到元素的位址
{
	int addr = 0;
	int addrCount = 0;
	if (NULL == ht)
	{
		return -1;
	}
	
	if (EmptyHashTable(ht))
	{
		return -1;
	}

	addr = HashFunc(data);            //addr = addrstart = hashfunc(data)
	while (EMPTY != ht->_array[addr]._status)
	{
		if (data == ht->_array[addr]._data)
		{
			if (ht->_array[addr]._status == EXIST)//資料相等 且狀态為存在
			{
				return addr;
			}
		}

		addr++;
		addrCount++;

		if (MAX_SIZE == addr)//越界
		{
			addr = 0;
		}
		if (MAX_SIZE == addrCount)//滿
		{
			break;
		}
	}
	return -1;
}

完整代碼：

Hash.h

#pragma once

typedef int HtDataType;
#define MAX_SIZE 100
#define NULL 0
//#define DETECTIVE_LINE

typedef enum STATUS
{
	EMPTY,		//空
	EXIST,		//有元素
	DELETE		//該元素已經删除
}STATUS;

typedef struct Elem{
	STATUS _status;		//狀态
	HtDataType _data;	//值域
}Elem;

typedef struct HashTable
{
	Elem _array[MAX_SIZE];
	int _size;			//有效元素個數
}HT;

void InitHashTable(HT* ht);
int InsertHashTable(HT* ht, HtDataType data);
int HashFunc(HtDataType data);

int DetectiveLine(int addr); //線性探測
int DetectiveLine_T(int addr, int i);  //二次探測

int FindHashTable(HT* ht, HtDataType data);
int DeleteElemHashTable(HT* ht, HtDataType data);
int EmptyHashTable(HT* ht);
int HashSize(HT* ht);

void TestHash();

Hash.c

#include "Hash.h"
#include <assert.h>
#include <stdio.h>

void InitHashTable(HT* ht)
{
	int i = 0;
	assert(ht);
	
	for (; i < MAX_SIZE; i++)
	{
		ht->_array[i]._status = EMPTY;
	}
	ht->_size = 0;
}

int HashFunc(HtDataType data)	//除留取餘
{
	return data % MAX_SIZE;		//盡量模素數
}

int InsertHashTable(HT* ht, HtDataType data)
{
	int Addr = 0;
	int i = 0;
	if (NULL == ht)
	{
		return 0;
	}
	if (ht->_size == MAX_SIZE)			//表滿?    ht->size < max_size （有delete 和exist）
	{
		return 0;
	}
	//找到要插入元素的位置
	Addr = HashFunc(data);

	while (EMPTY != ht->_array[Addr]._status)//檢視表中元素狀态
	{
		if (ht->_array[Addr]._status == EXIST &&		//delete處不插入元素，保證不插入相同元素
			data == ht->_array[Addr]._data) //元素在表中已經存在,且狀态為存在
			return 0;

#ifdef  DETECTIVE_LINE
		Addr = DetectiveLine(Addr);
#else
		++i;
		Addr = DetectiveLine_T(Addr,i);
#endif
	}
	
	//隻有目前位置為空時插入
	ht->_array[Addr]._data = data;
	ht->_array[Addr]._status = EXIST;
	ht->_size++;
	return 1;
}

int DeleteElemHashTable(HT* ht, HtDataType data)
{
	int addr = 0;
	int Cur = 0;
	if (NULL == ht)
	{
		return 0;
	}

	if (EmptyHashTable(ht))
	{
		return 0;
	}
	//找到要删除元素的位置
	addr = HashFunc(data);
	Cur = FindHashTable(ht, data);

	//删除
	if (-1 != Cur)
	{
		ht->_array[Cur]._status = DELETE;
		ht->_size--;    
		return 1;
	}
	else
	{
		return 0;
	}
}

int FindHashTable(HT* ht, HtDataType data)//傳回找到元素的位址
{
	int addr = 0;
	int addrCount = 0;
	if (NULL == ht)
	{
		return -1;
	}
	
	if (EmptyHashTable(ht))
	{
		return -1;
	}

	addr = HashFunc(data);            //addr = addrstart = hashfunc(data)
	while (EMPTY != ht->_array[addr]._status)
	{
		if (data == ht->_array[addr]._data)
		{
			if (ht->_array[addr]._status == EXIST)//資料相等 且狀态為存在
			{
				return addr;
			}
		}

		addr++;
		addrCount++;

		if (MAX_SIZE == addr)//越界
		{
			addr = 0;
		}
		if (MAX_SIZE == addrCount)//滿
		{
			break;
		}
	}
	return -1;
}

int EmptyHashTable(HT* ht)
{
	assert(ht);
	return 0 == ht->_size;
}

int HashSize(HT* ht)
{
	assert(ht);
	return ht->_size;
}

int DetectiveLine(int addr)		//線性探測
{
	addr++;

	if (MAX_SIZE == addr)//越界
	{
		addr = 0;
	}
	return addr;
}
int DetectiveLine_T(int addr, int i)	//二次探測
{
	addr = addr + 2 * i + 1;		//H(i+1)- H(i)

	if (MAX_SIZE <= addr)//越界
	{
		addr = addr % MAX_SIZE;
	}
	return addr;
}


void TestHash()
{
	HT ht;
	int Cur = 0;

	InitHashTable(&ht);
	InsertHashTable(&ht, 3);
	InsertHashTable(&ht, 4);
	InsertHashTable(&ht, 5);
	InsertHashTable(&ht, 103);
	InsertHashTable(&ht, 199);
	InsertHashTable(&ht, 299);
	InsertHashTable(&ht, 399);
	InsertHashTable(&ht, 499);
	InsertHashTable(&ht, 599);
	DeleteElemHashTable(&ht, 599);
	FindHashTable(&ht, 599);

	Cur = FindHashTable(&ht, 599);
	if (-1 != Cur)
	{
		printf("yes!\n");
	}
	else
	{
		printf("no!\n");
	}
}

Common.c

#include "Common.h"
// 使用素數表對齊做哈希表的容量，降低哈希沖突
unsigned long _PrimeList[_PrimeSize] =
{
	53ul, 97ul, 193ul, 389ul, 769ul,
	1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
	49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
	1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
	50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
	1610612741ul, 3221225473ul, 4294967291ul
};

unsigned long GetNextPrime(unsigned long capacity)
{
	int i = 0;
	for (; i < _PrimeSize; i++)
	{
		if (capacity < _PrimeList[i])
		{
			return _PrimeList[i];
		}
	}
	return _PrimeList[_PrimeSize - 1];
}

Common.h

#pragma once
// 使用素數表對齊做哈希表的容量，降低哈希沖突
#define _PrimeSize  28
unsigned long GetNextPrime(unsigned long capacity);

Test.c

#include"Hash.h"
#include "Hash_D.h"
#include <stdlib.h>

int main()
{
	TestHash();
	
	system("pause");
	return 0;
}

動态哈希表完整代碼：

Hash_D.h

#pragma once 

typedef char* HtDataType;
typedef unsigned long (*PHF)(HtDataType);  

#define MAX_SIZE 100
#define NULL 0
#define DETECTIVE_LINE

typedef enum STATUS
{
	EMPTY,		//空
	EXIST,		//有元素
	DELETE		//該元素已經删除
}STATUS;

typedef struct Elem{
	STATUS _status;
	HtDataType _data;		//《鍵值對，《key, value》》-----結構體（key唯一，value可能重複）
}Elem;

typedef struct HashTable
{
	Elem* _array;		//哈希表空間
	int _capacity;		//容量
	int _size;		//有效元素個數
	PHF _pToInt;		//輸入為字元或整形
}HT;

void InitHashTable_D(HT* ht, unsigned long capacity, PHF pTo);	//初始化
int InsertHashTable_D(HT* ht, HtDataType data);			//插入
int FindHashTable_D(HT* ht, HtDataType data);			//查找
int DeleteHashTable_D(HT* ht, HtDataType data);			//删除
void DestoryHashTable_D(HT*ht);					//銷毀

unsigned long IntHash(int data);				//整形資料
int HashEmpty(HT* ht);
unsigned long BKDRHash(const char * str);			//字元串轉化為整形
int Chackcapacity(HT* ht);					//容量檢測
int HashFunc(HT* ht, HtDataType data);				//哈希位址

int Detective(HT* ht, int addr);				//線性探測
int Detective_T(HT* ht,int addr, int i);			//二次探測

void TestHash_D();

Hash_D.c

#include "Hash_D.h"
#include "Common.h"
#include <assert.h>
#include <malloc.h>
#include <stdio.h>

void Swap(HT* left, HT* right)
{
	HT tmp = *left;
	*left = *right;
	*right = tmp;
}

int HashEmpty(HT* ht)
{
	assert(ht);
	return 0 == ht->_size;
}

unsigned long BKDRHash(const char * str)		//字元串轉化為整形	
{
	unsigned int seed = 131; // 31 131 1313 13131 131313
	unsigned int hash = 0;
	while (*str)
	{
		hash = hash * seed + (*str++);
	}
	return (hash & 0x7FFFFFFF);
}

unsigned long IntHash(int data)			//整形
{
	return data;
}


void InitHashTable_D(HT* ht, unsigned long capacity, PHF pTo)
{
	unsigned long i = 0;
	if (NULL == ht)
	{
		return;
	}
	//申請空間
	capacity = GetNextPrime(capacity);
	ht->_array = (Elem*)malloc(sizeof(Elem) * capacity);
	assert(ht->_array);

	for (; i < capacity; i++){
		ht->_array[i]._status = EMPTY;
	}
	ht->_capacity = capacity;
	ht->_size = 0;
	ht->_pToInt = pTo;
}

int InsertHashTable_D(HT* ht, HtDataType data)//插入
{
	int addr = 0;
	int i = 0;
	if (NULL == ht)//判空
	{
		return 0;
	}
	if (!Chackcapacity(ht))	//測容
	{
		return 0;
	}
	addr = HashFunc(ht, data);
	while (EMPTY != ht->_array[addr]._status)
	{
		if (EXIST == ht->_array[addr]._status &&//避免插入相同元素
			data == ht->_array[addr]._data)
		{
			return 0;
		}
#ifdef DETECTIVE_LINE
		addr = Detective(ht, addr);
#else
		i++
		addr = Detective_T(ht, addr);
#endif

	}
	ht->_array[addr]._data = data;
	ht->_array[addr]._status = EXIST;
	ht->_size++;
	return 1;
}

int FindHashTable_D(HT* ht, HtDataType data)
{
	int addr = 0;
	int addrStart = 0;
	int count = 0;//記錄二次探測次數
	//判空
	if (NULL == ht)
	{
		return -1;
	}
	//在hash表中找起始位址
	addr = HashFunc(ht, data);
	addrStart = addr;
	//在狀态為EXIST中找data，找到傳回位址沒找到繼續進行線性探測或二次探測
	while (EMPTY != ht->_array[addr]._status)
	{
		if (data == ht->_array[addr]._data &&
			EXIST == ht->_array[addr]._status)
			return addr;

#ifdef DETECTIVE_LINE
		addr = Detective(ht, addr);
#else
		i++
		addr = Detective_T(ht, addr,i);
		
#endif
	}
	//沒找到傳回-1
	return -1;
}

int DeleteHashTable_D(HT* ht, HtDataType data)
{
	int addr = 0;
	if (NULL == ht || HashEmpty(ht)){
		return 0;
	}
	//找到删除的結點
	addr = FindHashTable_D(ht, data);//判斷是否找到
	if (-1 == addr)
	{
		return 0;
	}
	ht->_array[addr]._status = DELETE;
	ht->_size--;
	return 1;

}

int Chackcapacity(HT* ht)
{
	int i = 0;
	if (NULL == ht)
	{
		return 0;
	}
	if ((ht->_size * 10 / ht->_capacity) > 7)   //負載因子0.7
	{
		HT tmp;
		InitHashTable_D(&tmp, ht->_capacity, ht->_pToInt);  //初始化新的哈希表
		for (; i < ht->_capacity; i++)
		{
			InsertHashTable_D(&tmp, ht->_array[i]._data);//将舊空間的元素散列到新空間
		}
		Swap(&tmp, ht);			//交換新舊空間
		DestoryHashTable_D(&tmp);//釋放新空間
	}
	return 1;
}

int HashFunc(HT* ht, HtDataType data)
{
	return ht->_pToInt(data) % ht->_capacity;
}

int Detective(HT* ht, int addr)
{
	addr++;
	if (addr > ht->_capacity)	//越界判斷
	{
		addr = 0;
	}
	return addr;
}

int Detective_T(HT* ht, int addr, int i)
{
	addr = addr + 2 * i + 1;
	if (addr >= ht->_capacity)
	{
		addr = addr % ht->_capacity;//防止進入死循環
	}
	return addr;
}

void DestoryHashTable_D(HT*ht)
{
	if (NULL == ht){
		return;
	}
	free(ht->_array);
	ht->_array = NULL;
	ht->_capacity = 0;
	ht->_size = 0;
}

void TestHash_D()
{
	int tmp = 0;
	HT ht;
	InitHashTable_D(&ht, 10, BKDRHash);
	InsertHashTable_D(&ht,"陳");
	InsertHashTable_D(&ht, "陳2");
	DeleteHashTable_D(&ht, "陳");
	tmp = FindHashTable_D(&ht, "陳");
	if (-1 == tmp)
	{
		printf("no!\n");
	}
	else
	{
		printf("yes!\n");
	}
}

哈希表--開散列【資料結構】

繼續閱讀

樹的基本概念(定義、基本術語、性質)

GSL--GNU Scientific Library

D5|哈希表，善用資料結構

如何成為一名.net 工程師?

BMP檔案結構及圖像每行位元組計算方法

D. Ehab the Xorcist(構造+思維)

查找算法之二分查找查找算法之二分查找

JAVA高效程式設計指南

CQ V1.0分詞bates(基于雙數組tire樹)—應該是目前最快的中文分詞算法

【資料結構】醫院選址

[轉]ISUP信令的IAM消息詳細内容

筆試面試題目：滑動視窗(二)

交通/城市相關的公開的資料集上學時整理的Xie et al., 2020收集的

資料結構與算法（27）——排序（二）

GitHub連夜封殺！這份阿裡 10W 字内部 Java 字面試手冊到底有多強？

Linux裝置模型（中）之上層容器