- 哈希表及其碰撞解决策略
- 1. 引言
- 2. 哈希表简介
- 3. 哈希函数
- 4. 碰撞解决策略
- 4.1 分离链接法(拉链法)
- 4.2 开放寻址法
- 4.2.1 线性探测
- 4.2.2 二次探测
- 4.2.3 双重哈希
- 5. 总结
1. 引言
哈希表是一种高效的数据结构,用于将键映射到值(也称为表或映射抽象数据类型/ADT)。哈希表利用哈希函数将大的或非整数键映射到一个小的整数索引范围(通常是 [0..hash_table_size-1])。
2. 哈希表简介
- 哈希函数:计算键的哈希值。
- 索引计算:将哈希值映射到数组的索引范围内。
index = hash_function(key) % hash_table_size
3. 哈希函数
- 快速计算:时间复杂度为O(1)。
- 最小冲突:尽可能减少碰撞。
- 均匀分布:键均匀地分散到哈希表中。
h(v) = v % M
其中 M 为哈希表的大小,N 为已存储的键的数量。负载因子 α = N/M 越小,发生碰撞的概率越低。
4. 碰撞解决策略
4.1 分离链接法(拉链法)
分离链接法(Separate Chaining,简称SC)是最简单的碰撞解决方法之一。每个哈希表位置都包含一个链表,所有碰撞到同一位置的键值对都存储在这个链表中。这种方法将冲突键值对存储在链表中,避免了冲突带来的问题。
- 搜索(v):遍历链表,检查是否存在v。
- 插入(v):将v插入链表尾部。
- 删除(v):遍历链表,删除v。
class Node:
    """Singly linked list node holding one stored value."""

    def __init__(self, value: int):
        self.value = value
        self.next: Node = None


class SeparateChaining:
    """Hash table that resolves collisions with one linked list per bucket.

    Every bucket starts with a sentinel node (value ``None``), so insertion
    and removal never have to special-case the head of the list.

    Args:
        size: Number of buckets.
        same: When True, duplicate values may be stored; when False, adding
            a value that is already present is a no-op.
    """

    def __init__(self, size: int, same: bool = True):
        self.size = size
        self.same = same
        self.nums = 0  # number of values currently stored
        # One sentinel node per bucket.
        self.table: List[Node] = [Node(None) for _ in range(self.size)]

    def hash_func(self, value: int) -> int:
        """Return the bucket index for ``value``."""
        return value % self.size

    def add(self, value: int) -> None:
        """Append ``value`` to its bucket, honouring the ``same`` flag."""
        node = self._search(value, self.same)
        if not self.same and node.next is not None:
            # Duplicates are disabled and _search stopped right before an
            # existing copy of the value: nothing to insert.
            return
        new_node = Node(value)
        new_node.next = node.next
        node.next = new_node
        self.nums += 1

    def remove(self, value: int) -> None:
        """Unlink the first occurrence of ``value``; do nothing if absent."""
        node = self._search(value, False)
        if node.next and node.next.value == value:
            node.next = node.next.next
            self.nums -= 1

    def search(self, value: int) -> bool:
        """Return True when ``value`` is stored in the table."""
        # _search(..., False) returns the tail node (next is None) when the
        # value is missing, otherwise the node right before the match.
        return self._search(value, False).next is not None

    def _search(self, value: int, same: bool = True) -> Node:
        """Return the node after which ``value`` lives or should be linked.

        With ``same=True`` the bucket is walked to its tail. With
        ``same=False`` the walk stops at the predecessor of the first node
        holding ``value``; if there is none, the tail is returned.
        """
        index = self.hash_func(value)
        node = self.table[index]
        while node.next:
            next_node = node.next
            if not same and next_node.value == value:
                break
            node = next_node
        return node

    def __repr__(self) -> str:
        out = []
        for node in self.table:
            s = []
            while node:
                s.append(node.value)
                node = node.next
            out.append(repr(s))
        return repr(out)


if __name__ == "__main__":
    keys = [72, 7, 53, 71, 40, 2, 45, 41, 42, 19, 42, 88, 42, 95, 38, 57, 16, 33, 89, 76]
    sc = SeparateChaining(25, True)
    for i in keys:
        sc.add(i)
    print(sc)
    for i in keys:
        sc.remove(i)
    print(sc)
# Printed output:
['[None]', '[None, 76]', '[None, 2]', '[None, 53]', '[None]', '[None]', '[None]', '[None, 7, 57]', '[None, 33]', '[None]', '[None]', '[None]', '[None]', '[None, 88, 38]', '[None, 89]', '[None, 40]', '[None, 41, 16]', '[None, 42, 42, 42]', '[None]', '[None, 19]', '[None, 45, 95]', '[None, 71]', '[None, 72]', '[None]', '[None]']
['[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]', '[None]']
4.2 开放寻址法
开放寻址法(Open Addressing)通过探测序列来解决碰撞。常见的探测方法有线性探测、二次探测和双重哈希。
from abc import abstractmethod
from typing import List


class Probing:
    """Base class for open-addressing hash tables.

    Subclasses supply the probe sequence by overriding ``hash_func``. Note
    that this class does not derive from ``abc.ABC``, so ``@abstractmethod``
    is not enforced at instantiation time.

    Args:
        size: Number of slots in the table.
        max_collision: Highest probe step tried before giving up.
        same: Whether ``add`` may store a value that is already present.
    """

    def __init__(self, size: int, max_collision: int = 10, same=True):
        self.size = size  # storage capacity
        self.table: List[int] = [None] * size  # the slots; None means empty
        self.max_collision = max_collision  # maximum number of probe steps
        self.nums = 0  # number of stored values
        self.same = same  # tolerate duplicate values on insertion

    @abstractmethod
    def hash_func(self, value: int, step: int = 0) -> int:
        """Return the slot index probed for ``value`` at probe ``step``."""
        ...

    def _search(self, value: int, same=True) -> int:
        """Find the slot to use for ``value``.

        With ``same=True`` (insert, duplicates allowed) the first empty slot
        on the probe sequence is returned. With ``same=False`` the slot that
        already holds ``value`` is returned if one is met; otherwise the
        first empty slot seen. An empty slot does not end the lookup,
        because it may have been emptied by ``remove`` while the value sits
        further along the probe sequence.

        Returns:
            A slot index, or -1 when no usable slot was found within
            ``max_collision`` probe steps.
        """
        first_none = -1
        for step in range(self.max_collision + 1):
            slot = self.hash_func(value, step)
            occupant = self.table[slot]
            if occupant is None:
                if same:
                    # Insert mode only needs the first free slot.
                    return slot
                if first_none == -1:
                    first_none = slot
            elif not same and occupant == value:
                return slot
        # -1 here means the collision limit was exceeded.
        return first_none

    def search(self, value: int) -> int:
        """Return the slot index holding ``value``, or -1 if it is absent."""
        index = self._search(value, same=False)
        # _search may hand back a free slot; only report a real hit.
        if index != -1 and self.table[index] == value:
            return index
        return -1

    @property
    def alpha(self) -> float:
        """Load factor: stored values divided by table size."""
        return self.nums / self.size

    def add(self, value: int) -> None:
        """Store ``value``.

        Raises:
            ValueError: If no slot is available within ``max_collision``
                probe steps.
        """
        index = self._search(value, self.same)
        if index == -1:
            raise ValueError("超出hash碰撞的最大次数")
        if self.table[index] == value:
            # Duplicates are disabled and the value is already stored.
            return
        self.table[index] = value
        self.nums += 1

    def remove(self, value: int) -> None:
        """Remove one occurrence of ``value``; do nothing if it is absent."""
        index = self._search(value, False)
        if index == -1 or self.table[index] is None:
            return
        self.table[index] = None
        self.nums -= 1

    def __repr__(self) -> str:
        return f"Probing({self.table!r}), nums:{self.nums}, alpha:{self.alpha}"
4.2.1 线性探测
线性探测(Linear Probing)采用固定步长进行探测:
i = (base + step * 1) % M
- h(v):基地址
- (h(v) + 1 * 1) % M
- (h(v) + 2 * 1) % M
- …
class LinearProbing(Probing):
    """Open addressing that advances one slot per probe step.

    Args:
        size: Number of slots in the table.
        max_collision: Highest probe step tried before giving up.
        same: Whether duplicate values may be stored; forwarded to the base
            class (defaults to True, the previous fixed behaviour).
    """

    def __init__(self, size: int, max_collision: int = 10, same: bool = True):
        super().__init__(size, max_collision, same)

    def hash_func(self, value: int, step: int = 0) -> int:
        """Return ``(value + step) % size``."""
        return (value + step * 1) % self.size

    def __repr__(self) -> str:
        return (
            f"LinearProbing({self.table!r}), "
            f"nums:{self.nums}, alpha:{self.alpha}"
        )
if __name__ == "__main__":
    # Fill a 25-slot linear-probing table, print it, then empty it again.
    keys = [72, 7, 53, 71, 40, 2, 45, 41, 42, 19, 42, 88, 42, 95, 38, 57, 16, 33, 89, 76]
    lp = LinearProbing(25, 20)
    for key in keys:
        lp.add(key)
    print(lp)
    for key in keys:
        lp.remove(key)
    print(lp)
# print 输出
LinearProbing([16, 89, 2, 53, 76, None, None, 7, 57, 33, None, None, None, 88, 38, 40, 41, 42, 42, 19, 45, 71, 72, 42, 95]), nums:20, alpha:0.8
LinearProbing([None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]), nums:0, alpha:0.0
线性探测的优点是实现简单,但容易出现主聚类(Primary Clustering)问题,即已占用的槽位连成越来越长的连续区段,新键落入该区段后只能探测到区段末尾才能找到空位,导致探测序列变长,性能下降。
4.2.2 二次探测
二次探测(Quadratic Probing)采用平方步长进行探测,第 step 次探测的位置为:
i = (base + step * step) % M
- h(v):基地址
- (h(v) + 1 * 1) % M
- (h(v) + 2 * 2) % M
- (h(v) + 3 * 3) % M
- …
class QuadraticProbing(Probing):
    """Open addressing whose probe offset is the square of the step.

    Args:
        size: Number of slots in the table.
        max_collision: Highest probe step tried before giving up.
        same: Whether duplicate values may be stored; forwarded to the base
            class (defaults to True, the previous fixed behaviour).
    """

    def __init__(self, size: int, max_collision: int = 10, same: bool = True):
        super().__init__(size, max_collision, same)

    def hash_func(self, value: int, step: int = 0) -> int:
        """Return ``(value + step**2) % size``."""
        return (value + step * step) % self.size

    def __repr__(self) -> str:
        return (
            f"QuadraticProbing({self.table!r}), "
            f"nums:{self.nums}, alpha:{self.alpha}"
        )
if __name__ == "__main__":
    # Fill a 25-slot quadratic-probing table, print it, then empty it again.
    keys = [72, 7, 53, 71, 40, 2, 45, 41, 42, 19, 42, 88, 42, 95, 38, 57, 16, 33, 89, 76]
    qp = QuadraticProbing(25, 20)
    for key in keys:
        qp.add(key)
    print(qp)
    for key in keys:
        qp.remove(key)
    print(qp)
# Printed output:
QuadraticProbing([16, 42, 2, 53, None, 76, None, 7, 57, 33, None, None, None, 88, 38, 40, 41, 42, 42, 19, 45, 71, 72, 89, 95]), nums:20, alpha:0.8
QuadraticProbing([None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]), nums:0, alpha:0.0
i = (base + (-1)**step * ((step+1)//2)**2) % M
- h(v):基地址
- (h(v) - 1 * 1) % M
- (h(v) + 1 * 1) % M
- (h(v) - 2 * 2) % M
- (h(v) + 2 * 2) % M
- …
class QuadraticProbing2(Probing):
    """Quadratic probing that alternates around the base slot.

    The offsets for steps 0, 1, 2, 3, 4, ... are 0, -1, +1, -4, +4, ...

    Args:
        size: Number of slots in the table.
        max_collision: Highest probe step tried before giving up.
        same: Whether duplicate values may be stored; forwarded to the base
            class (defaults to True, the previous fixed behaviour).
    """

    def __init__(self, size: int, max_collision: int = 10, same: bool = True):
        super().__init__(size, max_collision, same)

    def hash_func(self, value: int, step: int = 0) -> int:
        """Return the slot probed for ``value`` at ``step``."""
        sign = (-1) ** step  # odd steps go left, even steps go right
        distance = ((step + 1) // 2) ** 2
        return (value + sign * distance) % self.size

    def __repr__(self) -> str:
        return (
            f"QuadraticProbing2({self.table!r}), "
            f"nums:{self.nums}, alpha:{self.alpha}"
        )


if __name__ == "__main__":
    keys = [72, 7, 53, 71, 40, 2, 45, 41, 42, 19, 42, 88, 42, 95, 38, 57, 16, 33, 89, 76]
    qp2 = QuadraticProbing2(25, 20)
    for i in keys:
        qp2.add(i)
    print(qp2)
    for i in keys:
        qp2.remove(i)
    print(qp2)
# Printed output:
QuadraticProbing2([16, 76, 2, 53, None, None, 57, 7, 42, 33, None, None, 38, 88, 89, 40, 41, 42, 42, 19, 45, 71, 72, None, 95]), nums:20, alpha:0.8
QuadraticProbing2([None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]), nums:0, alpha:0.0
如果负载因子 α < 0.5 且 M 是质数,那么前 M/2 个二次探测索引都是唯一的。
二次探测减少了主聚类问题,但仍可能出现次聚类(Secondary Clustering)问题,即初始位置相同的键会沿用完全相同的探测序列,从而形成新的聚类。
4.2.3 双重哈希
双重哈希(Double Hashing)采用两个哈希函数进行探测:
i = (base + step * secondary) % M
其中secondary = smaller_prime - key % smaller_prime
- h(v):基地址
- (h(v) + 1 * h2(v)) % M
- (h(v) + 2 * h2(v)) % M
- (h(v) + 3 * h2(v)) % M
- …
h2(v) = smaller_prime - (v % smaller_prime)
class DoubleProbing(Probing):
    """Open addressing whose stride comes from a second hash function.

    The stride is ``h2(v) = smaller_prime - (v % smaller_prime)``, which is
    always in ``1..smaller_prime``.

    Args:
        size: Number of slots in the table.
        max_collision: Highest probe step tried before giving up.
        same: Whether duplicate values may be stored; forwarded to the base
            class (defaults to True, the previous fixed behaviour).
        smaller_prime: Modulus of the secondary hash. Defaults to 23, the
            value that used to be hard-coded; it is intended to be a prime
            smaller than ``size``.
    """

    def __init__(
        self,
        size: int,
        max_collision: int = 10,
        same: bool = True,
        smaller_prime: int = 23,
    ):
        super().__init__(size, max_collision, same)
        self.smaller_prime = smaller_prime

    def hash_func(self, value: int, step: int = 0) -> int:
        """Return ``(h1 + step * h2) % size`` for ``value``."""
        h1 = value % self.size
        h2 = self.smaller_prime - (value % self.smaller_prime)
        return (h1 + step * h2) % self.size

    def __repr__(self) -> str:
        return (
            f"DoubleProbing({self.table!r}), "
            f"nums:{self.nums}, alpha:{self.alpha}"
        )


if __name__ == "__main__":
    keys = [72, 7, 53, 71, 40, 2, 45, 41, 42, 19, 42, 88, 42, 95, 38, 57, 16, 33, 89, 76]
    dp = DoubleProbing(25, 20)
    for i in keys:
        dp.add(i)
    print(dp)
    for i in keys:
        dp.remove(i)
    print(dp)
# Printed output:
DoubleProbing([42, 76, 2, 53, 42, None, 57, 7, 33, None, 95, None, 38, 88, 89, 40, 41, 42, None, 19, 45, 71, 72, 16, None]), nums:20, alpha:0.8
DoubleProbing([None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]), nums:0, alpha:0.0
5. 总结
from abc import abstractmethod
from typing import List


class Probing:
    """Base class for open-addressing hash tables.

    Subclasses supply the probe sequence by overriding ``hash_func``. Note
    that this class does not derive from ``abc.ABC``, so ``@abstractmethod``
    is not enforced at instantiation time.

    Args:
        size: Number of slots in the table.
        max_collision: Highest probe step tried before giving up.
        same: Whether ``add`` may store a value that is already present.
    """

    def __init__(self, size: int, max_collision: int = 10, same: bool = True):
        self.size = size
        self.table: List[int] = [None] * size  # None marks an empty slot
        self.max_collision = max_collision
        self.nums = 0  # number of stored values
        self.same = same

    @abstractmethod
    def hash_func(self, value: int, step: int = 0) -> int:
        """Return the slot index probed for ``value`` at probe ``step``."""
        ...

    def _search(self, value: int, same: bool = True) -> int:
        """Find the slot to use for ``value``.

        With ``same=True`` the first empty slot on the probe sequence is
        returned. With ``same=False`` the slot already holding ``value`` is
        returned if one is met; otherwise the first empty slot seen. An
        empty slot does not end the lookup, because it may have been
        emptied by ``remove`` while the value sits further along.

        Returns:
            A slot index, or -1 when no usable slot was found within
            ``max_collision`` probe steps.
        """
        first_none = -1
        for step in range(self.max_collision + 1):
            slot = self.hash_func(value, step)
            occupant = self.table[slot]
            if occupant is None:
                if same:
                    # Insert mode only needs the first free slot.
                    return slot
                if first_none == -1:
                    first_none = slot
            elif not same and occupant == value:
                return slot
        # -1 here means the collision limit was exceeded.
        return first_none

    def search(self, value: int) -> int:
        """Return the slot index holding ``value``, or -1 if it is absent."""
        index = self._search(value, same=False)
        # _search may hand back a free slot; only report a real hit.
        if index != -1 and self.table[index] == value:
            return index
        return -1

    @property
    def alpha(self) -> float:
        """Load factor: stored values divided by table size."""
        return self.nums / self.size

    def add(self, value: int) -> None:
        """Store ``value``.

        Raises:
            ValueError: If no slot is available within ``max_collision``
                probe steps.
        """
        index = self._search(value, self.same)
        if index == -1:
            raise ValueError("超出hash碰撞的最大次数")
        if self.table[index] == value:
            # Duplicates are disabled and the value is already stored.
            return
        self.table[index] = value
        self.nums += 1

    def remove(self, value: int) -> None:
        """Remove one occurrence of ``value``; do nothing if it is absent."""
        index = self._search(value, False)
        if index == -1 or self.table[index] is None:
            return
        self.table[index] = None
        self.nums -= 1

    def __repr__(self) -> str:
        return f"Probing({self.table!r}), nums:{self.nums}, alpha:{self.alpha}"


class LinearProbing(Probing):
    """Open addressing that advances one slot per probe step."""

    def __init__(self, size: int, max_collision: int = 10, same: bool = True):
        super().__init__(size, max_collision, same)

    def hash_func(self, value: int, step: int = 0) -> int:
        """Return ``(value + step) % size``."""
        return (value + step * 1) % self.size

    def __repr__(self) -> str:
        return (
            f"LinearProbing({self.table!r}), "
            f"nums:{self.nums}, alpha:{self.alpha}"
        )


class QuadraticProbing(Probing):
    """Open addressing whose probe offset is the square of the step."""

    def __init__(self, size: int, max_collision: int = 10, same: bool = True):
        super().__init__(size, max_collision, same)

    def hash_func(self, value: int, step: int = 0) -> int:
        """Return ``(value + step**2) % size``."""
        return (value + step * step) % self.size

    def __repr__(self) -> str:
        return (
            f"QuadraticProbing({self.table!r}), "
            f"nums:{self.nums}, alpha:{self.alpha}"
        )


class QuadraticProbing2(Probing):
    """Quadratic probing alternating around the base: 0, -1, +1, -4, +4, ..."""

    def __init__(self, size: int, max_collision: int = 10, same: bool = True):
        super().__init__(size, max_collision, same)

    def hash_func(self, value: int, step: int = 0) -> int:
        """Return the slot probed for ``value`` at ``step``."""
        sign = (-1) ** step  # odd steps go left, even steps go right
        distance = ((step + 1) // 2) ** 2
        return (value + sign * distance) % self.size

    def __repr__(self) -> str:
        return (
            f"QuadraticProbing2({self.table!r}), "
            f"nums:{self.nums}, alpha:{self.alpha}"
        )


class DoubleProbing(Probing):
    """Open addressing whose stride comes from a second hash function.

    The stride is ``h2(v) = smaller_prime - (v % smaller_prime)``.
    ``smaller_prime`` defaults to 23, the value that used to be hard-coded;
    it is intended to be a prime smaller than ``size``.
    """

    def __init__(
        self,
        size: int,
        max_collision: int = 10,
        same: bool = True,
        smaller_prime: int = 23,
    ):
        super().__init__(size, max_collision, same)
        self.smaller_prime = smaller_prime

    def hash_func(self, value: int, step: int = 0) -> int:
        """Return ``(h1 + step * h2) % size`` for ``value``."""
        h1 = value % self.size
        h2 = self.smaller_prime - (value % self.smaller_prime)
        return (h1 + step * h2) % self.size

    def __repr__(self) -> str:
        return (
            f"DoubleProbing({self.table!r}), "
            f"nums:{self.nums}, alpha:{self.alpha}"
        )


class Node:
    """Singly linked list node holding one stored value."""

    def __init__(self, value: int):
        self.value = value
        self.next: Node = None


class SeparateChaining:
    """Hash table that resolves collisions with one linked list per bucket.

    Every bucket starts with a sentinel node (value ``None``), so insertion
    and removal never have to special-case the head of the list.

    Args:
        size: Number of buckets.
        same: When True, duplicate values may be stored; when False, adding
            a value that is already present is a no-op.
    """

    def __init__(self, size: int, same: bool = True):
        self.size = size
        self.same = same
        self.nums = 0  # number of values currently stored
        # One sentinel node per bucket.
        self.table: List[Node] = [Node(None) for _ in range(self.size)]

    def hash_func(self, value: int) -> int:
        """Return the bucket index for ``value``."""
        return value % self.size

    def add(self, value: int) -> None:
        """Append ``value`` to its bucket, honouring the ``same`` flag."""
        node = self._search(value, self.same)
        if not self.same and node.next is not None:
            # Duplicates are disabled and the value is already present.
            return
        new_node = Node(value)
        new_node.next = node.next
        node.next = new_node
        self.nums += 1

    def remove(self, value: int) -> None:
        """Unlink the first occurrence of ``value``; do nothing if absent."""
        node = self._search(value, False)
        if node.next and node.next.value == value:
            node.next = node.next.next
            self.nums -= 1

    def search(self, value: int) -> bool:
        """Return True when ``value`` is stored in the table."""
        return self._search(value, False).next is not None

    def _search(self, value: int, same: bool = True) -> Node:
        """Return the node after which ``value`` lives or should be linked.

        With ``same=True`` the bucket is walked to its tail. With
        ``same=False`` the walk stops at the predecessor of the first node
        holding ``value``; if there is none, the tail is returned.
        """
        index = self.hash_func(value)
        node = self.table[index]
        while node.next:
            next_node = node.next
            if not same and next_node.value == value:
                break
            node = next_node
        return node

    def __repr__(self) -> str:
        out = []
        for node in self.table:
            s = []
            while node:
                s.append(node.value)
                node = node.next
            out.append(repr(s))
        return repr(out)


if __name__ == "__main__":
    keys = [72, 7, 53, 71, 40, 2, 45, 41, 42, 19, 42, 88, 42, 95, 38, 57, 16, 33, 89, 76]
    # Same sequence for every strategy: fill, print, empty, print.
    for make_table in (
        lambda: LinearProbing(25, 20),
        lambda: QuadraticProbing(25, 20),
        lambda: QuadraticProbing2(25, 20),
        lambda: DoubleProbing(25, 20),
        lambda: SeparateChaining(25, True),
    ):
        table = make_table()
        for key in keys:
            table.add(key)
        print(table)
        for key in keys:
            table.remove(key)
        print(table)