哈希表算法：快速查找的艺术

算法原理

哈希表（Hash Table），也称为散列表，是根据关键字值直接进行访问的数据结构。它通过哈希函数将关键字映射到表中的位置来访问记录，以加快查找的速度。哈希表的核心思想是用空间换时间。

基本思想

哈希函数：将任意大小的数据映射到固定大小的值
直接寻址：通过哈希值直接定位到存储位置
冲突处理：处理不同键映射到相同位置的情况
动态调整：根据负载因子调整表大小

核心组件

哈希函数：h(key) → index
存储结构：桶数组（链式哈希中每个桶再挂接一条链表）
冲突解决策略：开放寻址或链式哈希
负载因子控制：维持性能的关键

优缺点分析

优点

查找快速：平均O(1)时间复杂度
插入删除高效：平均O(1)操作
空间利用率高：合理的负载因子下空间效率好
实现相对简单：基本操作逻辑清晰

缺点

最坏情况性能差：退化为O(n)
不支持有序操作：元素无序存储，无法高效地进行范围查询或按序遍历
空间开销：需要额外空间处理冲突
哈希函数依赖：性能高度依赖哈希函数的质量

使用场景

适用场景

缓存系统：LRU缓存、数据库缓存
编程语言：符号表、变量查找
数据库索引：快速记录定位
分布式系统：一致性哈希、负载均衡
密码学：数字签名、完整性校验

C语言实现

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Number of buckets a freshly created chained hash table starts with. */
#define INITIAL_SIZE 16
/* hash_table_insert doubles the bucket array once count/size exceeds this. */
#define MAX_LOAD_FACTOR 0.75
/* hash_table_delete halves the bucket array once count/size drops below this,
 * but only while the table is larger than INITIAL_SIZE. */
#define MIN_LOAD_FACTOR 0.25

/**
 * Chained (separate-chaining) hash table implementation.
 *
 * HashNode is one entry in a bucket's singly linked list.
 */
typedef struct HashNode {
    char* key;              // heap-allocated copy of the key string (made by create_node)
    int value;              // value stored under key
    struct HashNode* next;  // next node in the same bucket, NULL at the end of the chain
} HashNode;

/* Chained hash table: an array of bucket heads plus bookkeeping counters. */
typedef struct HashTable {
    HashNode** buckets; // array of `size` chain heads; NULL marks an empty bucket
    int size;           // number of buckets
    int count;          // number of stored elements
    double load_factor; // count / size, refreshed after insert, delete and resize
} HashTable;

/**
 * Simple polynomial (base-31) string hash.
 *
 * @param key         NUL-terminated string to hash; must not be NULL.
 * @param table_size  Number of slots; the hash is reduced modulo this value.
 * @return An index in [0, table_size), or 0 when table_size is not positive.
 */
unsigned int hash_function(const char* key, int table_size) {
    // A non-positive size would make the modulo below divide by zero (or by a
    // huge wrapped-around value), so answer with a fixed index instead.
    if (table_size <= 0) {
        return 0;
    }

    unsigned int hash = 0;

    // Read every byte as unsigned char: plain char may be signed, in which
    // case bytes >= 0x80 (e.g. UTF-8 text) would sign-extend and the same key
    // would hash differently from one platform to the next.
    for (const unsigned char* p = (const unsigned char*)key; *p != '\0'; p++) {
        hash = hash * 31u + *p;
    }

    return hash % (unsigned int)table_size;
}

/**
 * DJB2 string hash (hash = hash * 33 + byte, starting from 5381).
 *
 * @param key         NUL-terminated string to hash; must not be NULL.
 * @param table_size  Number of slots; the hash is reduced modulo this value.
 * @return An index in [0, table_size), or 0 when table_size is not positive.
 */
unsigned int djb2_hash(const char* key, int table_size) {
    // Avoid a modulo by zero (or by a wrapped-around negative size).
    if (table_size <= 0) {
        return 0;
    }

    unsigned int hash = 5381u;

    // Bytes are read as unsigned char so that keys containing bytes >= 0x80
    // hash identically whether plain char is signed or unsigned.
    for (const unsigned char* p = (const unsigned char*)key; *p != '\0'; p++) {
        hash = ((hash << 5) + hash) + *p;   // hash * 33 + byte
    }

    return hash % (unsigned int)table_size;
}

/**
 * Allocates an empty chained hash table with INITIAL_SIZE buckets.
 *
 * @return The new table, or NULL if either allocation fails. The caller owns
 *         the table and is responsible for releasing its nodes, its bucket
 *         array and the table itself.
 */
HashTable* create_hash_table(void) {
    HashTable* table = malloc(sizeof *table);
    if (table == NULL) {
        return NULL;
    }

    // calloc zero-fills the array, so every bucket starts as an empty chain.
    table->buckets = calloc(INITIAL_SIZE, sizeof *table->buckets);
    if (table->buckets == NULL) {
        free(table);
        return NULL;
    }

    table->size = INITIAL_SIZE;
    table->count = 0;
    table->load_factor = 0.0;

    return table;
}

/**
 * Allocates a detached chain node holding a private copy of key.
 *
 * @param key    NUL-terminated key string; must not be NULL. It is copied.
 * @param value  Value to store in the node.
 * @return The new node with next == NULL, or NULL if an allocation fails
 *         (nothing is leaked in that case).
 */
HashNode* create_node(const char* key, int value) {
    HashNode* node = malloc(sizeof *node);
    if (node == NULL) {
        return NULL;
    }

    size_t key_bytes = strlen(key) + 1;   // include the terminating NUL
    node->key = malloc(key_bytes);
    if (node->key == NULL) {
        free(node);
        return NULL;
    }
    memcpy(node->key, key, key_bytes);

    node->value = value;
    node->next = NULL;

    return node;
}

/**
 * Rebuilds the table with new_size buckets, relinking the existing nodes
 * (no node is reallocated or copied).
 *
 * If new_size is not positive or the new bucket array cannot be allocated,
 * the table is left exactly as it was.
 *
 * @param table     Table to resize; must not be NULL.
 * @param new_size  Desired number of buckets.
 */
void resize_hash_table(HashTable* table, int new_size) {
    if (new_size <= 0) {
        return;
    }

    // Allocate first: only commit to the new layout once we know it exists,
    // otherwise a failed allocation would lose every stored element.
    HashNode** new_buckets = calloc((size_t)new_size, sizeof *new_buckets);
    if (new_buckets == NULL) {
        return;
    }

    HashNode** old_buckets = table->buckets;
    int old_size = table->size;
    int moved = 0;

    // Walk every old chain and push each node onto the head of its new bucket.
    for (int i = 0; i < old_size; i++) {
        HashNode* current = old_buckets[i];
        while (current != NULL) {
            HashNode* next = current->next;   // saved before relinking overwrites it

            unsigned int new_index = djb2_hash(current->key, new_size);
            current->next = new_buckets[new_index];
            new_buckets[new_index] = current;
            moved++;

            current = next;
        }
    }

    free(old_buckets);

    table->buckets = new_buckets;
    table->size = new_size;
    table->count = moved;
    table->load_factor = (double)table->count / table->size;
}

/**
 * Inserts key -> value, or overwrites the value if key is already present.
 *
 * New entries go to the head of their bucket chain. After inserting, the
 * bucket array is doubled when the load factor exceeds MAX_LOAD_FACTOR.
 *
 * @param table  Table to modify; must not be NULL.
 * @param key    NUL-terminated key; must not be NULL. The table keeps its own copy.
 * @param value  Value to associate with key.
 */
void hash_table_insert(HashTable* table, const char* key, int value) {
    unsigned int index = djb2_hash(key, table->size);

    // Existing key: update in place, the element count does not change.
    for (HashNode* node = table->buckets[index]; node != NULL; node = node->next) {
        if (strcmp(node->key, key) == 0) {
            node->value = value;
            return;
        }
    }

    HashNode* new_node = create_node(key, value);
    if (new_node == NULL) {
        // No node to link (allocation failure); leave the table untouched
        // rather than dereferencing NULL.
        return;
    }

    new_node->next = table->buckets[index];
    table->buckets[index] = new_node;
    table->count++;

    table->load_factor = (double)table->count / table->size;

    // Grow when too full. Skip the doubling if it would overflow int; the
    // table keeps working, just with longer chains.
    if (table->load_factor > MAX_LOAD_FACTOR && table->size <= INT_MAX / 2) {
        resize_hash_table(table, table->size * 2);
    }
}

/**
 * Looks up key in the chained table.
 *
 * @param table  Table to search.
 * @param key    NUL-terminated key to look for.
 * @param value  Receives the stored value when the key is found; left
 *               untouched otherwise.
 * @return true if the key was found, false if not.
 */
bool hash_table_get(HashTable* table, const char* key, int* value) {
    unsigned int bucket = djb2_hash(key, table->size);

    // Scan the bucket's chain for an exact string match.
    for (HashNode* node = table->buckets[bucket]; node != NULL; node = node->next) {
        if (strcmp(node->key, key) != 0) {
            continue;
        }
        *value = node->value;
        return true;
    }

    return false;
}

/**
 * Removes key from the chained table and frees its node.
 *
 * After a successful removal the bucket array is halved when the load factor
 * falls below MIN_LOAD_FACTOR and the table is larger than INITIAL_SIZE.
 *
 * @param table  Table to modify.
 * @param key    NUL-terminated key to remove.
 * @return true if an entry was removed, false if the key was not present.
 */
bool hash_table_delete(HashTable* table, const char* key) {
    unsigned int bucket = djb2_hash(key, table->size);

    // `link` points at whichever pointer currently leads to the node under
    // inspection (the bucket head or a predecessor's next field), so
    // unlinking is the same assignment for head and interior nodes.
    HashNode** link = &table->buckets[bucket];
    while (*link != NULL && strcmp((*link)->key, key) != 0) {
        link = &(*link)->next;
    }

    HashNode* victim = *link;
    if (victim == NULL) {
        return false;
    }

    *link = victim->next;
    free(victim->key);
    free(victim);
    table->count--;

    table->load_factor = (double)table->count / table->size;

    // Shrink when sparsely filled, but never below the initial size.
    if (table->size > INITIAL_SIZE && table->load_factor < MIN_LOAD_FACTOR) {
        resize_hash_table(table, table->size / 2);
    }

    return true;
}

/**
 * Prints the table's size, element count and load factor, followed by one
 * line per bucket listing its chain from head to tail.
 *
 * @param table  Table to print.
 */
void print_hash_table(HashTable* table) {
    printf("哈希表状态:\n");
    printf("大小: %d, 元素数量: %d, 负载因子: %.2f\n",
           table->size, table->count, table->load_factor);

    for (int bucket = 0; bucket < table->size; bucket++) {
        const HashNode* head = table->buckets[bucket];

        printf("桶[%d]: ", bucket);
        if (head == NULL) {
            printf("空\n");
            continue;
        }

        // Entries are separated by an arrow; the separator is emitted before
        // every node except the first.
        for (const HashNode* node = head; node != NULL; node = node->next) {
            if (node != head) {
                printf(" -> ");
            }
            printf("(%s: %d)", node->key, node->value);
        }
        printf("\n");
    }

    printf("\n");
}

/**
 * Open-addressing hash table implementation.
 *
 * SlotStatus is the state of a single slot.
 */
typedef enum {
    EMPTY,      // holds no entry; linear_probe stops scanning when it reaches one
    OCCUPIED,   // holds a live key/value pair
    DELETED     // tombstone left by open_hash_delete; probing continues past it
} SlotStatus;

/* One slot of the open-addressing table. */
typedef struct OpenHashSlot {
    char* key;          // heap-allocated key copy while OCCUPIED; NULL in a freshly created table
    int value;          // stored value; only meaningful while OCCUPIED
    SlotStatus status;  // EMPTY, OCCUPIED or DELETED
} OpenHashSlot;

/* Open-addressing hash table: a flat slot array plus counters. */
typedef struct OpenHashTable {
    OpenHashSlot* slots;  // array of `size` slots
    int size;             // number of slots
    int count;            // number of OCCUPIED slots
    int deleted_count;    // number of DELETED slots; counted together with count in open_hash_insert's load check
} OpenHashTable;

/**
 * Allocates an open-addressing hash table with every slot EMPTY.
 *
 * @param initial_size  Number of slots; must be positive, because the probe
 *                      functions reduce hashes modulo the table size.
 * @return The new table, or NULL if initial_size is not positive or an
 *         allocation fails. The caller owns the table, its slot array and
 *         the keys of OCCUPIED slots.
 */
OpenHashTable* create_open_hash_table(int initial_size) {
    if (initial_size <= 0) {
        return NULL;
    }

    OpenHashTable* table = malloc(sizeof *table);
    if (table == NULL) {
        return NULL;
    }

    // calloc checks the count * size multiplication for overflow and also
    // gives every `value` field a defined starting value.
    table->slots = calloc((size_t)initial_size, sizeof *table->slots);
    if (table->slots == NULL) {
        free(table);
        return NULL;
    }

    // Set the status and pointer explicitly instead of relying on
    // all-bits-zero being EMPTY / NULL.
    for (int i = 0; i < initial_size; i++) {
        table->slots[i].status = EMPTY;
        table->slots[i].key = NULL;
    }

    table->size = initial_size;
    table->count = 0;
    table->deleted_count = 0;

    return table;
}

/**
 * Linear probing: starts at djb2_hash(key) and steps by one slot, wrapping
 * around, until the key or an EMPTY slot is found.
 *
 * @param table          Table to probe.
 * @param key            NUL-terminated key.
 * @param for_insertion  When true, a DELETED slot may be returned for reuse.
 * @return - the index of the OCCUPIED slot holding key, if it exists;
 *         - otherwise, for insertion, the first DELETED slot seen on the probe
 *           path, or the terminating EMPTY slot if there was none;
 *         - otherwise, for lookup, the terminating EMPTY slot (callers
 *           distinguish a hit by checking the slot's status);
 *         - -1 if the table has no slots or no usable slot was found after
 *           visiting every slot.
 */
int linear_probe(OpenHashTable* table, const char* key, bool for_insertion) {
    if (table->size <= 0) {
        return -1;
    }

    int index = (int)djb2_hash(key, table->size);
    int first_deleted = -1;

    for (int probes = 0; probes < table->size; probes++) {
        const OpenHashSlot* slot = &table->slots[index];

        if (slot->status == EMPTY) {
            // End of the probe path: the key is not in the table.
            return (for_insertion && first_deleted != -1) ? first_deleted : index;
        }

        if (slot->status == OCCUPIED) {
            if (strcmp(slot->key, key) == 0) {
                return index;
            }
        } else if (slot->status == DELETED && first_deleted == -1) {
            // Remember the tombstone but keep scanning: returning it right
            // away would insert a second copy of a key stored further along.
            first_deleted = index;
        }

        index = (index + 1) % table->size;
    }

    // Every slot was visited without meeting an EMPTY one.
    return for_insertion ? first_deleted : -1;
}

/**
 * Quadratic probing: visits (hash + i*i) mod size for i = 0 .. size-1.
 *
 * Note that this sequence is not guaranteed to reach every slot, so -1 can be
 * returned even though some slot is still free.
 *
 * @param table          Table to probe.
 * @param key            NUL-terminated key.
 * @param for_insertion  When true, a DELETED slot may be returned for reuse.
 * @return The slot holding key if present; otherwise the first reusable
 *         DELETED slot on the probe path (insertion only) or the first EMPTY
 *         slot reached; -1 if neither was found or the table has no slots.
 */
int quadratic_probe(OpenHashTable* table, const char* key, bool for_insertion) {
    if (table->size <= 0) {
        return -1;
    }

    unsigned int size = (unsigned int)table->size;
    unsigned int hash = djb2_hash(key, table->size);
    int first_deleted = -1;

    // Unsigned arithmetic throughout: i * i as a signed int overflows
    // (undefined behaviour) once i exceeds 46340.
    for (unsigned int i = 0; i < size; i++) {
        int index = (int)((hash + i * i) % size);
        const OpenHashSlot* slot = &table->slots[index];

        if (slot->status == EMPTY) {
            return (for_insertion && first_deleted != -1) ? first_deleted : index;
        }

        if (slot->status == OCCUPIED) {
            if (strcmp(slot->key, key) == 0) {
                return index;
            }
        } else if (slot->status == DELETED && first_deleted == -1) {
            // Keep probing past the tombstone so that an existing copy of the
            // key further along is found instead of being duplicated.
            first_deleted = index;
        }
    }

    return for_insertion ? first_deleted : -1;
}

/**
 * Double hashing: visits (hash1 + i * step) mod size for i = 0 .. size-1,
 * where step is derived from a second hash of the key and lies in [1, 7].
 *
 * The sequence only covers the whole table when step and the table size are
 * coprime; otherwise -1 can be returned although a free slot exists.
 *
 * @param table          Table to probe.
 * @param key            NUL-terminated key.
 * @param for_insertion  When true, a DELETED slot may be returned for reuse.
 * @return The slot holding key if present; otherwise the first reusable
 *         DELETED slot on the probe path (insertion only) or the first EMPTY
 *         slot reached; -1 if neither was found or the table has no slots.
 */
int double_hash_probe(OpenHashTable* table, const char* key, bool for_insertion) {
    enum { SECOND_HASH_MOD = 7 };

    if (table->size <= 0) {
        return -1;
    }

    unsigned int size = (unsigned int)table->size;
    unsigned int hash1 = djb2_hash(key, table->size);
    unsigned int step = SECOND_HASH_MOD - (djb2_hash(key, SECOND_HASH_MOD) % SECOND_HASH_MOD);

    // A step that is a multiple of the table size would probe the same slot
    // over and over; fall back to a step of one in that case.
    if (step % size == 0) {
        step = 1;
    }

    int first_deleted = -1;

    for (unsigned int i = 0; i < size; i++) {
        int index = (int)((hash1 + i * step) % size);
        const OpenHashSlot* slot = &table->slots[index];

        if (slot->status == EMPTY) {
            return (for_insertion && first_deleted != -1) ? first_deleted : index;
        }

        if (slot->status == OCCUPIED) {
            if (strcmp(slot->key, key) == 0) {
                return index;
            }
        } else if (slot->status == DELETED && first_deleted == -1) {
            // Keep probing past the tombstone so that an existing copy of the
            // key further along is found instead of being duplicated.
            first_deleted = index;
        }
    }

    return for_insertion ? first_deleted : -1;
}

/*
 * Moves every OCCUPIED entry into a fresh slot array of new_size slots,
 * dropping all tombstones. Key pointers are moved, not copied.
 *
 * new_size must be greater than the number of OCCUPIED slots, otherwise the
 * placement loop below could not terminate.
 *
 * Returns false, leaving the table untouched, if the allocation fails.
 */
static bool open_hash_rehash(OpenHashTable* table, int new_size) {
    OpenHashSlot* fresh = calloc((size_t)new_size, sizeof *fresh);
    if (fresh == NULL) {
        return false;
    }
    for (int i = 0; i < new_size; i++) {
        fresh[i].key = NULL;
        fresh[i].status = EMPTY;
    }

    int live = 0;
    for (int i = 0; i < table->size; i++) {
        const OpenHashSlot* old = &table->slots[i];
        if (old->status != OCCUPIED) {
            continue;
        }

        // Same probe sequence as linear_probe: start at the hash, step by one.
        unsigned int index = djb2_hash(old->key, new_size);
        while (fresh[index].status != EMPTY) {
            index = (index + 1) % (unsigned int)new_size;
        }
        fresh[index] = *old;
        live++;
    }

    free(table->slots);
    table->slots = fresh;
    table->size = new_size;
    table->count = live;
    table->deleted_count = 0;
    return true;
}

/**
 * Inserts key -> value using linear probing, or overwrites the value if key
 * is already present.
 *
 * When live entries plus tombstones exceed 70% of the slots, the table is
 * rehashed first: at the same size if tombstones outnumber live entries
 * (purging them is enough), otherwise at double the size. If rehashing is
 * impossible the insert is abandoned with a message, as before.
 *
 * @param table  Table to modify.
 * @param key    NUL-terminated key; the table stores its own copy.
 * @param value  Value to associate with key.
 */
void open_hash_insert(OpenHashTable* table, const char* key, int value) {
    const double max_load = 0.7;

    if ((double)(table->count + table->deleted_count) / table->size > max_load) {
        int new_size = table->size;
        if (table->deleted_count <= table->count) {
            if (table->size > INT_MAX / 2) {
                printf("需要重新哈希\n");
                return;
            }
            new_size = table->size * 2;
        }
        if (!open_hash_rehash(table, new_size)) {
            printf("需要重新哈希\n");
            return;
        }
    }

    int index = linear_probe(table, key, true);

    if (index == -1) {
        printf("哈希表已满\n");
        return;
    }

    OpenHashSlot* slot = &table->slots[index];

    if (slot->status == OCCUPIED) {
        // linear_probe only returns an OCCUPIED slot when its key matches.
        slot->value = value;
        return;
    }

    // Copy the key before touching the slot, so a failed allocation leaves
    // the table and its counters unchanged.
    size_t key_bytes = strlen(key) + 1;
    char* key_copy = malloc(key_bytes);
    if (key_copy == NULL) {
        fprintf(stderr, "内存分配失败\n");
        return;
    }
    memcpy(key_copy, key, key_bytes);

    if (slot->status == DELETED) {
        table->deleted_count--;   // the tombstone is being reused
    }

    slot->key = key_copy;
    slot->value = value;
    slot->status = OCCUPIED;
    table->count++;
}

/**
 * Looks up key in the open-addressing table via linear probing.
 *
 * @param table  Table to search.
 * @param key    NUL-terminated key to look for.
 * @param value  Receives the stored value on success; untouched otherwise.
 * @return true if the key was found, false if not.
 */
bool open_hash_get(OpenHashTable* table, const char* key, int* value) {
    int slot_index = linear_probe(table, key, false);

    // The probe reports -1 on failure, or may land on a slot that is not
    // OCCUPIED; in both cases the key is absent.
    if (slot_index == -1) {
        return false;
    }
    if (table->slots[slot_index].status != OCCUPIED) {
        return false;
    }

    *value = table->slots[slot_index].value;
    return true;
}

/**
 * Removes key from the open-addressing table, leaving a DELETED tombstone so
 * that probe paths running through the slot stay intact.
 *
 * @param table  Table to modify.
 * @param key    NUL-terminated key to remove.
 * @return true if an entry was removed, false if the key was not present.
 */
bool open_hash_delete(OpenHashTable* table, const char* key) {
    int index = linear_probe(table, key, false);

    if (index == -1 || table->slots[index].status != OCCUPIED) {
        return false;
    }

    OpenHashSlot* slot = &table->slots[index];

    free(slot->key);
    slot->key = NULL;   // do not leave a dangling pointer behind in the tombstone
    slot->status = DELETED;
    table->count--;
    table->deleted_count++;

    return true;
}

/**
 * Prints the table's size, live count and tombstone count, followed by one
 * line per slot describing its state.
 *
 * @param table  Table to print.
 */
void print_open_hash_table(OpenHashTable* table) {
    printf("开放寻址哈希表:\n");
    printf("大小: %d, 元素数量: %d, 删除数量: %d\n",
           table->size, table->count, table->deleted_count);

    for (int slot_no = 0; slot_no < table->size; slot_no++) {
        const OpenHashSlot* slot = &table->slots[slot_no];

        printf("槽[%d]: ", slot_no);
        if (slot->status == EMPTY) {
            printf("空\n");
        } else if (slot->status == OCCUPIED) {
            printf("(%s: %d)\n", slot->key, slot->value);
        } else if (slot->status == DELETED) {
            printf("已删除\n");
        }
    }

    printf("\n");
}

/**
 * Bloom filter implementation (an application of hashing).
 */
typedef struct BloomFilter {
    unsigned char* bit_array;  // packed bits, 8 per byte; (size + 7) / 8 bytes are allocated
    int size;                  // number of bits; hashes are reduced modulo this value
    int hash_count;            // number of hash values (seeds 0 .. hash_count-1) used per key
} BloomFilter;

/**
 * Allocates a Bloom filter with all bits cleared.
 *
 * @param size        Number of bits; must be positive, because bit positions
 *                    are computed modulo this value.
 * @param hash_count  Number of hash values derived per key.
 * @return The new filter, or NULL if size is not positive or an allocation
 *         fails. The caller owns the filter and its bit array.
 */
BloomFilter* create_bloom_filter(int size, int hash_count) {
    if (size <= 0) {
        return NULL;
    }

    BloomFilter* filter = malloc(sizeof *filter);
    if (filter == NULL) {
        return NULL;
    }

    // Round up to whole bytes; do the arithmetic in size_t so that size + 7
    // cannot overflow int.
    size_t byte_count = ((size_t)size + 7) / 8;
    filter->bit_array = calloc(byte_count, sizeof *filter->bit_array);
    if (filter->bit_array == NULL) {
        free(filter);
        return NULL;
    }

    filter->size = size;
    filter->hash_count = hash_count;

    return filter;
}

/**
 * Sets bit number index in the filter's packed bit array.
 *
 * @param filter  Filter whose bit array is modified.
 * @param index   Bit position; bit index % 8 of byte index / 8 is set.
 */
void set_bit(BloomFilter* filter, int index) {
    unsigned char* cell = &filter->bit_array[index / 8];
    *cell |= (1 << (index % 8));
}

/**
 * Reports whether bit number index is set in the filter's packed bit array.
 *
 * @param filter  Filter to inspect.
 * @param index   Bit position; bit index % 8 of byte index / 8 is read.
 * @return true if the bit is set, false otherwise.
 */
bool get_bit(BloomFilter* filter, int index) {
    int mask = 1 << (index % 8);
    if ((filter->bit_array[index / 8] & mask) == 0) {
        return false;
    }
    return true;
}

/**
 * Seeded polynomial (base-31) string hash used by the Bloom filter.
 *
 * @param key   NUL-terminated string to hash; must not be NULL.
 * @param seed  Initial hash value; different seeds give different hashes of
 *              the same key.
 * @return The full-width hash; callers reduce it to a bit position themselves.
 */
unsigned int bloom_hash(const char* key, int seed) {
    unsigned int hash = (unsigned int)seed;

    // Read bytes as unsigned char so that keys with bytes >= 0x80 hash the
    // same whether plain char is signed or unsigned on this platform.
    for (const unsigned char* p = (const unsigned char*)key; *p != '\0'; p++) {
        hash = hash * 31u + *p;
    }

    return hash;
}

/**
 * Adds key to the filter by setting one bit per hash seed
 * (seeds 0 .. hash_count-1).
 *
 * @param filter  Filter to update.
 * @param key     NUL-terminated key to add.
 */
void bloom_add(BloomFilter* filter, const char* key) {
    int seed = 0;
    while (seed < filter->hash_count) {
        unsigned int bit = bloom_hash(key, seed) % filter->size;
        set_bit(filter, bit);
        seed++;
    }
}

/**
 * Tests whether key may have been added to the filter.
 *
 * @param filter  Filter to query.
 * @param key     NUL-terminated key to test.
 * @return false if at least one of the key's bits is clear (the key was
 *         definitely never added); true if all of them are set (the key was
 *         possibly added).
 */
bool bloom_might_contain(BloomFilter* filter, const char* key) {
    int seed = 0;
    while (seed < filter->hash_count) {
        unsigned int bit = bloom_hash(key, seed) % filter->size;
        if (get_bit(filter, bit) == false) {
            return false;
        }
        seed++;
    }
    return true;
}

// 测试函数
void test_chain_hash_table() {
    printf("=== 链式哈希表测试 ===\n");
    
    HashTable* table = create_hash_table();
    
    // 插入测试数据
    hash_table_insert(table, "apple", 5);
    hash_table_insert(table, "banana", 7);
    hash_table_insert(table, "orange", 3);
    hash_table_insert(table, "grape", 12);
    hash_table_insert(table, "watermelon", 8);
    
    print_hash_table(table);
    
    // 查找测试
    int value;
    if (hash_table_get(table, "banana", &value)) {
        printf("找到 banana: %d\n", value);
    }
    
    // 删除测试
    hash_table_delete(table, "orange");
    printf("删除 orange 后:\n");
    print_hash_table(table);
}

void test_open_hash_table() {
    printf("=== 开放寻址哈希表测试 ===\n");
    
    OpenHashTable* table = create_open_hash_table(8);
    
    open_hash_insert(table, "key1", 10);
    open_hash_insert(table, "key2", 20);
    open_hash_insert(table, "key3", 30);
    
    print_open_hash_table(table);
    
    int value;
    if (open_hash_get(table, "key2", &value)) {
        printf("找到 key2: %d\n", value);
    }
    
    open_hash_delete(table, "key2");
    printf("删除 key2 后:\n");
    print_open_hash_table(table);
}

void test_bloom_filter() {
    printf("=== 布隆过滤器测试 ===\n");
    
    BloomFilter* filter = create_bloom_filter(100, 3);
    
    // 添加元素
    bloom_add(filter, "hello");
    bloom_add(filter, "world");
    bloom_add(filter, "bloom");
    
    // 测试查询
    printf("'hello' 可能存在: %s\n", bloom_might_contain(filter, "hello") ? "是" : "否");
    printf("'world' 可能存在: %s\n", bloom_might_contain(filter, "world") ? "是" : "否");
    printf("'filter' 可能存在: %s\n", bloom_might_contain(filter, "filter") ? "是" : "否");
}

int main() {
    test_chain_hash_table();
    test_open_hash_table();
    test_bloom_filter();
    
    return 0;
}

复杂度分析

时间复杂度

平均情况：O(1) - 插入、删除、查找
最坏情况：O(n) - 所有元素哈希到同一位置

空间复杂度

链式哈希：O(n + m)，n为元素数，m为桶数
开放寻址：O(m)，m为表大小

实际应用

数据库：索引结构、查询优化
编程语言：字典、映射数据结构
缓存系统：Redis、Memcached
分布式系统：一致性哈希、DHT
安全领域：密码存储、数字指纹

总结

哈希表是计算机科学中最重要的数据结构之一，其O(1)的平均时间复杂度使其在需要快速查找的场景中不可替代。理解哈希表的原理和实现细节，掌握不同的冲突解决策略，对于系统设计和性能优化具有重要意义。

编外计划 - 日志

To be, or not to be, that is the question.

哈希表算法：快速查找的艺术

算法原理

基本思想

核心组件

优缺点分析

优点

缺点

使用场景

适用场景

C语言实现

复杂度分析

时间复杂度

空间复杂度

实际应用

总结