哈希表算法:快速查找的艺术

算法原理

哈希表(Hash Table),也称为散列表,是根据关键字值直接进行访问的数据结构。它通过哈希函数将关键字映射到表中的位置来访问记录,以加快查找的速度。哈希表的核心思想是用空间换时间。

基本思想

  1. 哈希函数:将任意大小的数据映射到固定大小的值
  2. 直接寻址:通过哈希值直接定位到存储位置
  3. 冲突处理:处理不同键映射到相同位置的情况
  4. 动态调整:根据负载因子调整表大小

核心组件

  1. 哈希函数:h(key) → index
  2. 存储结构:数组或链表
  3. 冲突解决策略:开放寻址或链式哈希
  4. 负载因子控制:维持性能的关键

优缺点分析

优点

  1. 查找快速:平均O(1)时间复杂度
  2. 插入删除高效:平均O(1)操作
  3. 空间利用率高:合理的负载因子下空间效率好
  4. 实现相对简单:基本操作逻辑清晰

缺点

  1. 最坏情况性能差:退化为O(n)
  2. 不支持有序操作:无法范围查询或排序
  3. 空间开销:需要额外空间处理冲突
  4. 哈希函数依赖:性能heavily依赖哈希函数质量

使用场景

适用场景

  1. 缓存系统:LRU缓存、数据库缓存
  2. 编程语言:符号表、变量查找
  3. 数据库索引:快速记录定位
  4. 分布式系统:一致性哈希、负载均衡
  5. 密码学:数字签名、完整性校验

C语言实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

#define INITIAL_SIZE 16
#define MAX_LOAD_FACTOR 0.75
#define MIN_LOAD_FACTOR 0.25

/**
* 链式哈希表实现
*/
typedef struct HashNode {
char* key;
int value;
struct HashNode* next;
} HashNode;

typedef struct HashTable {
HashNode** buckets;
int size; // 桶的数量
int count; // 元素数量
double load_factor; // 负载因子
} HashTable;

// 哈希函数实现
unsigned int hash_function(const char* key, int table_size) {
unsigned int hash = 0;

// 简单的字符串哈希函数
while (*key) {
hash = hash * 31 + *key;
key++;
}

return hash % table_size;
}

// DJB2哈希函数(更好的分布性)
unsigned int djb2_hash(const char* key, int table_size) {
unsigned int hash = 5381;

while (*key) {
hash = ((hash << 5) + hash) + *key;
key++;
}

return hash % table_size;
}

// 创建哈希表
HashTable* create_hash_table() {
HashTable* table = (HashTable*)malloc(sizeof(HashTable));
table->size = INITIAL_SIZE;
table->count = 0;
table->load_factor = 0.0;

table->buckets = (HashNode**)calloc(table->size, sizeof(HashNode*));

return table;
}

// 创建节点
HashNode* create_node(const char* key, int value) {
HashNode* node = (HashNode*)malloc(sizeof(HashNode));
node->key = (char*)malloc(strlen(key) + 1);
strcpy(node->key, key);
node->value = value;
node->next = NULL;

return node;
}

// 调整表大小
void resize_hash_table(HashTable* table, int new_size) {
HashNode** old_buckets = table->buckets;
int old_size = table->size;

// 创建新的桶数组
table->buckets = (HashNode**)calloc(new_size, sizeof(HashNode*));
table->size = new_size;
table->count = 0;

// 重新插入所有元素
for (int i = 0; i < old_size; i++) {
HashNode* current = old_buckets[i];
while (current != NULL) {
HashNode* next = current->next;

// 重新计算哈希值并插入
unsigned int new_index = djb2_hash(current->key, new_size);
current->next = table->buckets[new_index];
table->buckets[new_index] = current;
table->count++;

current = next;
}
}

free(old_buckets);
table->load_factor = (double)table->count / table->size;
}

// 插入操作
void hash_table_insert(HashTable* table, const char* key, int value) {
unsigned int index = djb2_hash(key, table->size);

// 检查是否已存在相同的键
HashNode* current = table->buckets[index];
while (current != NULL) {
if (strcmp(current->key, key) == 0) {
current->value = value; // 更新值
return;
}
current = current->next;
}

// 创建新节点并插入到链表头部
HashNode* new_node = create_node(key, value);
new_node->next = table->buckets[index];
table->buckets[index] = new_node;
table->count++;

// 更新负载因子
table->load_factor = (double)table->count / table->size;

// 检查是否需要扩容
if (table->load_factor > MAX_LOAD_FACTOR) {
resize_hash_table(table, table->size * 2);
}
}

// 查找操作
bool hash_table_get(HashTable* table, const char* key, int* value) {
unsigned int index = djb2_hash(key, table->size);

HashNode* current = table->buckets[index];
while (current != NULL) {
if (strcmp(current->key, key) == 0) {
*value = current->value;
return true;
}
current = current->next;
}

return false;
}

// 删除操作
bool hash_table_delete(HashTable* table, const char* key) {
unsigned int index = djb2_hash(key, table->size);

HashNode* current = table->buckets[index];
HashNode* prev = NULL;

while (current != NULL) {
if (strcmp(current->key, key) == 0) {
if (prev == NULL) {
table->buckets[index] = current->next;
} else {
prev->next = current->next;
}

free(current->key);
free(current);
table->count--;

// 更新负载因子
table->load_factor = (double)table->count / table->size;

// 检查是否需要缩容
if (table->load_factor < MIN_LOAD_FACTOR && table->size > INITIAL_SIZE) {
resize_hash_table(table, table->size / 2);
}

return true;
}

prev = current;
current = current->next;
}

return false;
}

// 打印哈希表
void print_hash_table(HashTable* table) {
printf("哈希表状态:\n");
printf("大小: %d, 元素数量: %d, 负载因子: %.2f\n",
table->size, table->count, table->load_factor);

for (int i = 0; i < table->size; i++) {
printf("桶[%d]: ", i);
HashNode* current = table->buckets[i];

if (current == NULL) {
printf("空\n");
} else {
while (current != NULL) {
printf("(%s: %d)", current->key, current->value);
if (current->next != NULL) printf(" -> ");
current = current->next;
}
printf("\n");
}
}
printf("\n");
}

/**
* 开放寻址哈希表实现
*/
typedef enum {
EMPTY,
OCCUPIED,
DELETED
} SlotStatus;

typedef struct OpenHashSlot {
char* key;
int value;
SlotStatus status;
} OpenHashSlot;

typedef struct OpenHashTable {
OpenHashSlot* slots;
int size;
int count;
int deleted_count;
} OpenHashTable;

OpenHashTable* create_open_hash_table(int initial_size) {
OpenHashTable* table = (OpenHashTable*)malloc(sizeof(OpenHashTable));
table->size = initial_size;
table->count = 0;
table->deleted_count = 0;

table->slots = (OpenHashSlot*)malloc(table->size * sizeof(OpenHashSlot));
for (int i = 0; i < table->size; i++) {
table->slots[i].status = EMPTY;
table->slots[i].key = NULL;
}

return table;
}

// 线性探测
int linear_probe(OpenHashTable* table, const char* key, bool for_insertion) {
unsigned int hash = djb2_hash(key, table->size);
int index = hash;

while (table->slots[index].status != EMPTY) {
if (table->slots[index].status == OCCUPIED &&
strcmp(table->slots[index].key, key) == 0) {
return index; // 找到匹配的键
}

// 如果是插入操作,可以使用已删除的槽
if (for_insertion && table->slots[index].status == DELETED) {
return index;
}

index = (index + 1) % table->size;

// 防止无限循环
if (index == hash) {
return -1;
}
}

return index; // 返回空槽的索引
}

// 二次探测
int quadratic_probe(OpenHashTable* table, const char* key, bool for_insertion) {
unsigned int hash = djb2_hash(key, table->size);

for (int i = 0; i < table->size; i++) {
int index = (hash + i * i) % table->size;

if (table->slots[index].status == EMPTY) {
return index;
}

if (table->slots[index].status == OCCUPIED &&
strcmp(table->slots[index].key, key) == 0) {
return index;
}

if (for_insertion && table->slots[index].status == DELETED) {
return index;
}
}

return -1;
}

// 双重哈希
int double_hash_probe(OpenHashTable* table, const char* key, bool for_insertion) {
unsigned int hash1 = djb2_hash(key, table->size);
unsigned int hash2 = 7 - (djb2_hash(key, 7) % 7); // 第二个哈希函数

for (int i = 0; i < table->size; i++) {
int index = (hash1 + i * hash2) % table->size;

if (table->slots[index].status == EMPTY) {
return index;
}

if (table->slots[index].status == OCCUPIED &&
strcmp(table->slots[index].key, key) == 0) {
return index;
}

if (for_insertion && table->slots[index].status == DELETED) {
return index;
}
}

return -1;
}

void open_hash_insert(OpenHashTable* table, const char* key, int value) {
// 检查负载因子,可能需要重新哈希
if ((double)(table->count + table->deleted_count) / table->size > 0.7) {
printf("需要重新哈希\n");
return;
}

int index = linear_probe(table, key, true);

if (index == -1) {
printf("哈希表已满\n");
return;
}

if (table->slots[index].status == OCCUPIED) {
// 更新现有值
table->slots[index].value = value;
} else {
// 插入新值
if (table->slots[index].status == DELETED) {
table->deleted_count--;
}

table->slots[index].key = (char*)malloc(strlen(key) + 1);
strcpy(table->slots[index].key, key);
table->slots[index].value = value;
table->slots[index].status = OCCUPIED;
table->count++;
}
}

bool open_hash_get(OpenHashTable* table, const char* key, int* value) {
int index = linear_probe(table, key, false);

if (index != -1 && table->slots[index].status == OCCUPIED) {
*value = table->slots[index].value;
return true;
}

return false;
}

bool open_hash_delete(OpenHashTable* table, const char* key) {
int index = linear_probe(table, key, false);

if (index != -1 && table->slots[index].status == OCCUPIED) {
free(table->slots[index].key);
table->slots[index].status = DELETED;
table->count--;
table->deleted_count++;
return true;
}

return false;
}

void print_open_hash_table(OpenHashTable* table) {
printf("开放寻址哈希表:\n");
printf("大小: %d, 元素数量: %d, 删除数量: %d\n",
table->size, table->count, table->deleted_count);

for (int i = 0; i < table->size; i++) {
printf("槽[%d]: ", i);
switch (table->slots[i].status) {
case EMPTY:
printf("空\n");
break;
case OCCUPIED:
printf("(%s: %d)\n", table->slots[i].key, table->slots[i].value);
break;
case DELETED:
printf("已删除\n");
break;
}
}
printf("\n");
}

/**
* 布隆过滤器实现(哈希的应用)
*/
typedef struct BloomFilter {
unsigned char* bit_array;
int size;
int hash_count;
} BloomFilter;

BloomFilter* create_bloom_filter(int size, int hash_count) {
BloomFilter* filter = (BloomFilter*)malloc(sizeof(BloomFilter));
filter->size = size;
filter->hash_count = hash_count;
filter->bit_array = (unsigned char*)calloc((size + 7) / 8, sizeof(unsigned char));

return filter;
}

void set_bit(BloomFilter* filter, int index) {
int byte_index = index / 8;
int bit_index = index % 8;
filter->bit_array[byte_index] |= (1 << bit_index);
}

bool get_bit(BloomFilter* filter, int index) {
int byte_index = index / 8;
int bit_index = index % 8;
return (filter->bit_array[byte_index] & (1 << bit_index)) != 0;
}

unsigned int bloom_hash(const char* key, int seed) {
unsigned int hash = seed;
while (*key) {
hash = hash * 31 + *key;
key++;
}
return hash;
}

void bloom_add(BloomFilter* filter, const char* key) {
for (int i = 0; i < filter->hash_count; i++) {
unsigned int hash = bloom_hash(key, i) % filter->size;
set_bit(filter, hash);
}
}

bool bloom_might_contain(BloomFilter* filter, const char* key) {
for (int i = 0; i < filter->hash_count; i++) {
unsigned int hash = bloom_hash(key, i) % filter->size;
if (!get_bit(filter, hash)) {
return false;
}
}
return true;
}

// 测试函数
void test_chain_hash_table() {
printf("=== 链式哈希表测试 ===\n");

HashTable* table = create_hash_table();

// 插入测试数据
hash_table_insert(table, "apple", 5);
hash_table_insert(table, "banana", 7);
hash_table_insert(table, "orange", 3);
hash_table_insert(table, "grape", 12);
hash_table_insert(table, "watermelon", 8);

print_hash_table(table);

// 查找测试
int value;
if (hash_table_get(table, "banana", &value)) {
printf("找到 banana: %d\n", value);
}

// 删除测试
hash_table_delete(table, "orange");
printf("删除 orange 后:\n");
print_hash_table(table);
}

void test_open_hash_table() {
printf("=== 开放寻址哈希表测试 ===\n");

OpenHashTable* table = create_open_hash_table(8);

open_hash_insert(table, "key1", 10);
open_hash_insert(table, "key2", 20);
open_hash_insert(table, "key3", 30);

print_open_hash_table(table);

int value;
if (open_hash_get(table, "key2", &value)) {
printf("找到 key2: %d\n", value);
}

open_hash_delete(table, "key2");
printf("删除 key2 后:\n");
print_open_hash_table(table);
}

void test_bloom_filter() {
printf("=== 布隆过滤器测试 ===\n");

BloomFilter* filter = create_bloom_filter(100, 3);

// 添加元素
bloom_add(filter, "hello");
bloom_add(filter, "world");
bloom_add(filter, "bloom");

// 测试查询
printf("'hello' 可能存在: %s\n", bloom_might_contain(filter, "hello") ? "是" : "否");
printf("'world' 可能存在: %s\n", bloom_might_contain(filter, "world") ? "是" : "否");
printf("'filter' 可能存在: %s\n", bloom_might_contain(filter, "filter") ? "是" : "否");
}

int main() {
test_chain_hash_table();
test_open_hash_table();
test_bloom_filter();

return 0;
}

复杂度分析

时间复杂度

  • 平均情况:O(1) - 插入、删除、查找
  • 最坏情况:O(n) - 所有元素哈希到同一位置

空间复杂度

  • 链式哈希:O(n + m),n为元素数,m为桶数
  • 开放寻址:O(m),m为表大小

实际应用

  1. 数据库:索引结构、查询优化
  2. 编程语言:字典、映射数据结构
  3. 缓存系统:Redis、Memcached
  4. 分布式系统:一致性哈希、DHT
  5. 安全领域:密码存储、数字指纹

总结

哈希表是计算机科学中最重要的数据结构之一,其O(1)的平均时间复杂度使其在需要快速查找的场景中不可替代。理解哈希表的原理和实现细节,掌握不同的冲突解决策略,对于系统设计和性能优化具有重要意义。

版权所有,如有侵权请联系我