在C++标准模板库中,无序容器(unordered_set/unordered_map)是基于哈希表实现的高效数据结构。与红黑树实现的有序容器不同,哈希容器通过哈希函数直接将键映射到存储位置,使得查找、插入操作在平均情况下达到O(1)时间复杂度。
哈希表的核心设计包含三个关键组件:哈希函数(把键映射为整数)、桶数组(按哈希值取模定位存储位置)以及冲突解决策略(处理多个键落入同一个桶的情况)。
我们选择链地址法实现是因为:
首先定义哈希节点的基本结构:
/// Singly linked node stored in one bucket chain of the hash table.
///
/// @tparam T  Type of the element held by the node.
template <typename T>
struct HashNode {
    T data;          ///< The stored element.
    HashNode* next;  ///< Next node in the same chain, or nullptr.

    /// Creates a node holding a copy of `val`, linked in front of `nxt`.
    ///
    /// @param val  Element to copy into the node.
    /// @param nxt  Node that should follow this one (defaults to nullptr).
    HashNode(const T& val, HashNode* nxt = nullptr)
        : data(val), next(nxt) {}
};
迭代器需要支持跨桶遍历,核心成员包括:
/// Iterator sketch for a chained hash table.
///
/// @tparam HashTable  Table type; it must expose a nested `node_type` whose
///                    objects carry a `next` pointer.
template <typename HashTable>
class HashIterator {
    using node_type = typename HashTable::node_type;

    // All members are value-initialised so that a default-constructed
    // iterator is in a well-defined "no current node" state.
    node_type* current = nullptr;          // node the iterator refers to
    const HashTable* hashtable = nullptr;  // table being traversed
    size_t bucket_idx = 0;                 // bucket index stored alongside `current`

public:
    // Regular iterator operations are elided in this excerpt.

    /// Steps to the next node of the current chain when there is one.
    /// Calling it while there is no current node is a no-op instead of a
    /// null-pointer dereference.
    void advance() {
        if (current == nullptr) {
            return;
        }
        if (current->next) {
            current = current->next;
        } else {
            // TODO: cross-bucket jump logic is elided in this excerpt; until
            // it is filled in, the iterator stays on the last node of its chain.
        }
    }
};
默认哈希函数对整数类型直接返回其值,对字符串等类型使用FNV-1a算法:
/// Default hasher: converts the key itself to size_t.
///
/// Only usable for key types that `static_cast<size_t>` accepts.
///
/// @tparam Key  Key type to hash.
template <typename Key>
struct hash {
    size_t operator()(const Key& key) const {
        return static_cast<size_t>(key);
    }
};

/// std::string specialization implementing 64-bit FNV-1a.
template <>
struct hash<std::string> {
    /// @param str  String to hash (may be empty, may contain any byte value).
    /// @return The FNV-1a digest, converted to size_t.
    size_t operator()(const std::string& str) const {
        // Accumulate in a type that is at least 64 bits wide so the constants
        // are not truncated when size_t is narrower.
        unsigned long long digest = 14695981039346656037ULL;  // FNV offset basis
        for (const char c : str) {
            // FNV-1a XORs the raw octet. Going through unsigned char prevents
            // sign extension of bytes >= 0x80 where plain char is signed.
            digest ^= static_cast<unsigned char>(c);
            digest *= 1099511628211ULL;  // FNV prime
        }
        return static_cast<size_t>(digest);
    }
};
桶数组采用vector存储桶头指针,初始大小通常选择质数以减少冲突:
cpp复制std::vector<Node*> buckets;
static const size_t DEFAULT_BUCKET_SIZE = 53;
插入操作需要考虑三点:负载因子超限时的扩容、重复元素检查,以及新节点采用头插法链入桶中:
cpp复制std::pair<iterator, bool> insert(const value_type& value) {
// 检查扩容
if (load_factor() > max_load_factor()) {
rehash(buckets.size() * 2 + 1);
}
size_t idx = bucket(value);
Node* curr = buckets[idx];
// 检查重复
while (curr) {
if (equal_(curr->data, value)) {
return {iterator(curr, this, idx), false};
}
curr = curr->next;
}
// 头插法
buckets[idx] = new Node(value, buckets[idx]);
++size_;
return {iterator(buckets[idx], this, idx), true};
}
扩容时的重新哈希操作:
cpp复制void rehash(size_t new_size) {
std::vector<Node*> new_buckets(next_prime(new_size), nullptr);
for (size_t i = 0; i < buckets.size(); ++i) {
Node* curr = buckets[i];
while (curr) {
Node* next = curr->next;
size_t new_idx = hash_(curr->data) % new_buckets.size();
curr->next = new_buckets[new_idx];
new_buckets[new_idx] = curr;
curr = next;
}
}
buckets.swap(new_buckets);
}
查找操作利用哈希快速定位桶,然后线性搜索链表:
cpp复制iterator find(const key_type& key) {
size_t idx = bucket(key);
Node* curr = buckets[idx];
while (curr) {
if (equal_(get_key(curr->data), key)) {
return iterator(curr, this, idx);
}
curr = curr->next;
}
return end();
}
删除操作需要注意维护链表完整性:
cpp复制size_t erase(const key_type& key) {
size_t idx = bucket(key);
Node* prev = nullptr;
Node* curr = buckets[idx];
while (curr) {
if (equal_(get_key(curr->data), key)) {
if (prev) {
prev->next = curr->next;
} else {
buckets[idx] = curr->next;
}
delete curr;
--size_;
return 1;
}
prev = curr;
curr = curr->next;
}
return 0;
}
频繁的节点分配释放会影响性能,可以采用对象池等方式复用已释放的节点,减少对堆分配器的调用。
对象池实现示例:
cpp复制class NodePool {
std::vector<Node*> free_list;
public:
Node* allocate(const T& val, Node* next) {
if (free_list.empty()) {
return new Node(val, next);
}
Node* node = free_list.back();
free_list.pop_back();
new (&node->data) T(val);
node->next = next;
return node;
}
void deallocate(Node* node) {
node->data.~T();
free_list.push_back(node);
}
};
好的哈希函数应满足:
对于复合类型可采用组合哈希:
cpp复制struct PairHash {
template <typename T1, typename T2>
size_t operator()(const std::pair<T1, T2>& p) const {
return hash<T1>()(p.first) ^ (hash<T2>()(p.second) << 1);
}
};
哈希表操作可能导致迭代器失效的场景:rehash(包括 insert 触发的扩容)会把节点重新链接到新的桶数组中,迭代器中保存的桶下标随之过期,因此所有已有迭代器都会失效;erase 会释放被删除的节点,使指向该元素的迭代器失效。
解决方案:
// Forward declarations so that this skeleton can name its node and iterator
// types on its own; they are harmless redeclarations when the definitions are
// already visible.
template <typename T>
struct HashNode;
template <typename HashTable>
class HashIterator;

/// Skeleton of a chained-hashing map (declarations only).
///
/// @tparam Key       Key type.
/// @tparam Value     Mapped type.
/// @tparam Hash      Hash functor applied to keys.
/// @tparam KeyEqual  Equality predicate applied to keys.
template <
    typename Key,
    typename Value,
    typename Hash = std::hash<Key>,
    typename KeyEqual = std::equal_to<Key>
>
class unordered_map {
public:
    // Member types that the declarations below (and the member-function
    // snippets elsewhere in this file) refer to.
    using key_type = Key;
    using mapped_type = Value;
    using value_type = std::pair<const Key, Value>;
    using size_type = size_t;
    // NOTE(review): presumably the iterator is HashIterator over this table,
    // matching the iterator(node, table, index) calls in insert()/find() —
    // confirm.
    using iterator = HashIterator<unordered_map>;

private:
    using node_type = HashNode<value_type>;
    using Node = node_type;  // spelling used by the member-function snippets
    friend iterator;         // the iterator reads the private node_type alias

    std::vector<node_type*> buckets;  // head pointer of each chain
    size_t size_ = 0;                 // number of stored elements
    float max_load_factor_ = 1.0f;
    Hash hash_;
    KeyEqual equal_;
    // NOTE(review): the bucket array holds raw node pointers and no destructor
    // or copy control is declared in this excerpt — confirm that the elided
    // members release the nodes.

public:
    // Interface declarations...
    iterator begin();
    iterator end();
    std::pair<iterator, bool> insert(const value_type& value);
    iterator find(const key_type& key);
    size_t erase(const key_type& key);
    void rehash(size_t count);
    // ...other member functions
};
验证核心功能的测试用例应包括:
// Inserts two entries, looks one up, erases the other and checks it is gone.
TEST(UnorderedMapTest, InsertAndFind) {
    unordered_map<std::string, int> map;
    // Both keys are new, so both insertions must report success.
    EXPECT_TRUE(map.insert({"apple", 5}).second);
    EXPECT_TRUE(map.insert({"banana", 3}).second);

    auto it = map.find("apple");
    ASSERT_NE(it, map.end());  // stop here instead of dereferencing end()
    EXPECT_EQ(it->second, 5);

    // erase() returns an unsigned count; an unsigned literal avoids a
    // signed/unsigned comparison inside EXPECT_EQ.
    EXPECT_EQ(map.erase("banana"), 1u);
    EXPECT_EQ(map.find("banana"), map.end());
}
// Inserts enough entries to force growth and checks that every entry is still
// reachable afterwards.
TEST(UnorderedMapTest, RehashOperation) {
    unordered_map<int, int> map;
    const size_t initial_buckets = map.bucket_count();

    for (int i = 0; i < 1000; ++i) {
        map.insert({i, i * 2});
    }

    EXPECT_GT(map.bucket_count(), initial_buckets);
    EXPECT_EQ(map.size(), 1000u);  // unsigned literal: size() is unsigned

    for (int i = 0; i < 1000; ++i) {
        auto it = map.find(i);
        // A missing key must fail the test, not dereference end().
        ASSERT_NE(it, map.end()) << "missing key " << i;
        EXPECT_EQ(it->second, i * 2);
    }
}
内存泄漏问题:
迭代器失效异常:
性能瓶颈分析:
为保持与std::unordered_map接口一致,需要:
// 标准库兼容接口示例
/// Reports how many buckets the table currently has.
///
/// @return The size of the bucket array.
size_type bucket_count() const noexcept {
    const size_type total = buckets.size();
    return total;
}
/// Counts the nodes chained in bucket `n`.
///
/// @param n  Bucket index; it is used to index the bucket array without a
///           range check.
/// @return Number of nodes reachable from the head of bucket `n`.
size_type bucket_size(size_type n) const {
    size_type length = 0;
    for (Node* node = buckets[n]; node != nullptr; node = node->next) {
        ++length;
    }
    return length;
}
实际项目中可考虑添加:
// 并行访问示例
template <typename Key, typename Value>
class ConcurrentUnorderedMap {
using MapType = unordered_map<Key, Value>;
std::vector<MapType> segments;
std::vector<std::mutex> mutexes;
public:
Value& operator[](const Key& key) {
size_t seg = hash(key) % segments.size();
std::lock_guard<std::mutex> lock(mutexes[seg]);
return segments[seg][key];
}
// ...其他线程安全操作
};
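实现哈希容器时,选择质数作为桶大小可以减少冲突。以下是常用的质数表(每一项约为前一项的两倍)。注意:若 next_prime 是在该表中查找不小于参数的最小质数,则前文 insert 中的 rehash(buckets.size() * 2 + 1) 会让容量从 53 直接跳到 193 而不是 97;若希望按下表逐级扩容,应传入 buckets.size() + 1: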
| 当前容量 | 下一个扩容容量 |
|---|---|
| 53 | 97 |
| 97 | 193 |
| 193 | 389 |
| 389 | 769 |
| 769 | 1543 |
| 1543 | 3079 |
在调试过程中,可以添加以下统计信息帮助优化:
对于性能要求极高的场景,可以考虑以下优化方向: