2008-07-30

字典树算法：高效的字符串检索结构

算法原理

字典树（Trie Tree），又称前缀树或单词查找树，是一种树形结构，用于高效地存储和检索字符串数据集中的键。这一数据结构有相当多的应用情景，例如自动补完和拼写检查。

基本思想

共享前缀：具有公共前缀的字符串共享存储空间
树形结构：每个节点代表一个字符
路径表示字符串：从根到某节点的路径代表一个字符串
标记结束：标记哪些节点是单词的结尾

C语言实现

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>

#define ALPHABET_SIZE 26

typedef struct TrieNode {
    struct TrieNode* children[ALPHABET_SIZE];
    bool is_end_of_word;
    int word_count; // 以此节点结尾的单词数量
    int prefix_count; // 经过此节点的单词数量
} TrieNode;

// 创建新节点
TrieNode* create_trie_node() {
    TrieNode* node = (TrieNode*)malloc(sizeof(TrieNode));
    
    for (int i = 0; i < ALPHABET_SIZE; i++) {
        node->children[i] = NULL;
    }
    
    node->is_end_of_word = false;
    node->word_count = 0;
    node->prefix_count = 0;
    
    return node;
}

// 插入单词
void insert(TrieNode* root, const char* word) {
    TrieNode* current = root;
    
    for (int i = 0; word[i] != '\0'; i++) {
        int index = word[i] - 'a';
        
        if (current->children[index] == NULL) {
            current->children[index] = create_trie_node();
        }
        
        current = current->children[index];
        current->prefix_count++;
    }
    
    current->is_end_of_word = true;
    current->word_count++;
}

// 搜索单词
bool search(TrieNode* root, const char* word) {
    TrieNode* current = root;
    
    for (int i = 0; word[i] != '\0'; i++) {
        int index = word[i] - 'a';
        
        if (current->children[index] == NULL) {
            return false;
        }
        
        current = current->children[index];
    }
    
    return current != NULL && current->is_end_of_word;
}

// 检查是否有以prefix开头的单词
bool starts_with(TrieNode* root, const char* prefix) {
    TrieNode* current = root;
    
    for (int i = 0; prefix[i] != '\0'; i++) {
        int index = prefix[i] - 'a';
        
        if (current->children[index] == NULL) {
            return false;
        }
        
        current = current->children[index];
    }
    
    return true;
}

// 删除单词
bool delete_word(TrieNode* root, const char* word) {
    TrieNode* current = root;
    TrieNode* path[strlen(word) + 1];
    int path_length = 0;
    
    // 记录路径
    path[path_length++] = current;
    
    for (int i = 0; word[i] != '\0'; i++) {
        int index = word[i] - 'a';
        
        if (current->children[index] == NULL) {
            return false; // 单词不存在
        }
        
        current = current->children[index];
        path[path_length++] = current;
    }
    
    if (!current->is_end_of_word) {
        return false; // 单词不存在
    }
    
    // 标记为非单词结尾
    current->is_end_of_word = false;
    current->word_count--;
    
    // 更新prefix_count并清理不需要的节点
    for (int i = path_length - 1; i > 0; i--) {
        path[i]->prefix_count--;
        
        // 如果节点没有被使用，删除它
        if (path[i]->prefix_count == 0 && !path[i]->is_end_of_word) {
            int index = word[i - 1] - 'a';
            free(path[i]);
            path[i - 1]->children[index] = NULL;
        }
    }
    
    return true;
}

// 获取所有以prefix开头的单词
void dfs_collect_words(TrieNode* node, char* current_word, int depth, char** results, int* count) {
    if (node->is_end_of_word) {
        current_word[depth] = '\0';
        results[*count] = (char*)malloc(strlen(current_word) + 1);
        strcpy(results[*count], current_word);
        (*count)++;
    }
    
    for (int i = 0; i < ALPHABET_SIZE; i++) {
        if (node->children[i] != NULL) {
            current_word[depth] = 'a' + i;
            dfs_collect_words(node->children[i], current_word, depth + 1, results, count);
        }
    }
}

void auto_complete(TrieNode* root, const char* prefix, char** suggestions, int* count) {
    TrieNode* current = root;
    *count = 0;
    
    // 找到前缀对应的节点
    for (int i = 0; prefix[i] != '\0'; i++) {
        int index = prefix[i] - 'a';
        
        if (current->children[index] == NULL) {
            return; // 没有以此前缀开头的单词
        }
        
        current = current->children[index];
    }
    
    // 收集所有可能的单词
    char word[100];
    strcpy(word, prefix);
    dfs_collect_words(current, word, strlen(prefix), suggestions, count);
}

int main() {
    TrieNode* root = create_trie_node();
    
    // 插入一些单词
    insert(root, "cat");
    insert(root, "cats");
    insert(root, "car");
    insert(root, "card");
    insert(root, "care");
    insert(root, "careful");
    insert(root, "can");
    
    // 搜索测试
    printf("搜索 'car': %s\n", search(root, "car") ? "找到" : "未找到");
    printf("搜索 'card': %s\n", search(root, "card") ? "找到" : "未找到");
    printf("搜索 'care': %s\n", search(root, "care") ? "找到" : "未找到");
    printf("搜索 'careful': %s\n", search(root, "careful") ? "找到" : "未找到");
    printf("搜索 'cards': %s\n", search(root, "cards") ? "找到" : "未找到");
    
    // 前缀测试
    printf("以 'car' 开头: %s\n", starts_with(root, "car") ? "是" : "否");
    printf("以 'pos' 开头: %s\n", starts_with(root, "pos") ? "是" : "否");
    
    // 自动补全测试
    char* suggestions[100];
    int suggestion_count;
    auto_complete(root, "car", suggestions, &suggestion_count);
    
    printf("以 'car' 开头的单词:\n");
    for (int i = 0; i < suggestion_count; i++) {
        printf("  %s\n", suggestions[i]);
        free(suggestions[i]);
    }
    
    return 0;
}

应用场景

搜索引擎：自动补全功能
文本编辑器：拼写检查
IP路由：最长前缀匹配
生物信息学：DNA序列匹配

编外计划 - 日志

To be or not to be,--that is question.

字典树算法：高效的字符串检索结构

算法原理

基本思想

C语言实现

应用场景