字典树算法:高效的字符串检索结构

算法原理

字典树(Trie Tree),又称前缀树或单词查找树,是一种树形结构,用于高效地存储和检索字符串数据集中的键。这一数据结构有相当多的应用情景,例如自动补完和拼写检查。

基本思想

  1. 共享前缀:具有公共前缀的字符串共享存储空间
  2. 树形结构:每个节点代表一个字符
  3. 路径表示字符串:从根到某节点的路径代表一个字符串
  4. 标记结束:标记哪些节点是单词的结尾

C语言实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>

#define ALPHABET_SIZE 26

typedef struct TrieNode {
struct TrieNode* children[ALPHABET_SIZE];
bool is_end_of_word;
int word_count; // 以此节点结尾的单词数量
int prefix_count; // 经过此节点的单词数量
} TrieNode;

// 创建新节点
TrieNode* create_trie_node() {
TrieNode* node = (TrieNode*)malloc(sizeof(TrieNode));

for (int i = 0; i < ALPHABET_SIZE; i++) {
node->children[i] = NULL;
}

node->is_end_of_word = false;
node->word_count = 0;
node->prefix_count = 0;

return node;
}

// 插入单词
void insert(TrieNode* root, const char* word) {
TrieNode* current = root;

for (int i = 0; word[i] != '\0'; i++) {
int index = word[i] - 'a';

if (current->children[index] == NULL) {
current->children[index] = create_trie_node();
}

current = current->children[index];
current->prefix_count++;
}

current->is_end_of_word = true;
current->word_count++;
}

// 搜索单词
bool search(TrieNode* root, const char* word) {
TrieNode* current = root;

for (int i = 0; word[i] != '\0'; i++) {
int index = word[i] - 'a';

if (current->children[index] == NULL) {
return false;
}

current = current->children[index];
}

return current != NULL && current->is_end_of_word;
}

// 检查是否有以prefix开头的单词
bool starts_with(TrieNode* root, const char* prefix) {
TrieNode* current = root;

for (int i = 0; prefix[i] != '\0'; i++) {
int index = prefix[i] - 'a';

if (current->children[index] == NULL) {
return false;
}

current = current->children[index];
}

return true;
}

// 删除单词
bool delete_word(TrieNode* root, const char* word) {
TrieNode* current = root;
TrieNode* path[strlen(word) + 1];
int path_length = 0;

// 记录路径
path[path_length++] = current;

for (int i = 0; word[i] != '\0'; i++) {
int index = word[i] - 'a';

if (current->children[index] == NULL) {
return false; // 单词不存在
}

current = current->children[index];
path[path_length++] = current;
}

if (!current->is_end_of_word) {
return false; // 单词不存在
}

// 标记为非单词结尾
current->is_end_of_word = false;
current->word_count--;

// 更新prefix_count并清理不需要的节点
for (int i = path_length - 1; i > 0; i--) {
path[i]->prefix_count--;

// 如果节点没有被使用,删除它
if (path[i]->prefix_count == 0 && !path[i]->is_end_of_word) {
int index = word[i - 1] - 'a';
free(path[i]);
path[i - 1]->children[index] = NULL;
}
}

return true;
}

// 获取所有以prefix开头的单词
void dfs_collect_words(TrieNode* node, char* current_word, int depth, char** results, int* count) {
if (node->is_end_of_word) {
current_word[depth] = '\0';
results[*count] = (char*)malloc(strlen(current_word) + 1);
strcpy(results[*count], current_word);
(*count)++;
}

for (int i = 0; i < ALPHABET_SIZE; i++) {
if (node->children[i] != NULL) {
current_word[depth] = 'a' + i;
dfs_collect_words(node->children[i], current_word, depth + 1, results, count);
}
}
}

void auto_complete(TrieNode* root, const char* prefix, char** suggestions, int* count) {
TrieNode* current = root;
*count = 0;

// 找到前缀对应的节点
for (int i = 0; prefix[i] != '\0'; i++) {
int index = prefix[i] - 'a';

if (current->children[index] == NULL) {
return; // 没有以此前缀开头的单词
}

current = current->children[index];
}

// 收集所有可能的单词
char word[100];
strcpy(word, prefix);
dfs_collect_words(current, word, strlen(prefix), suggestions, count);
}

int main() {
TrieNode* root = create_trie_node();

// 插入一些单词
insert(root, "cat");
insert(root, "cats");
insert(root, "car");
insert(root, "card");
insert(root, "care");
insert(root, "careful");
insert(root, "can");

// 搜索测试
printf("搜索 'car': %s\n", search(root, "car") ? "找到" : "未找到");
printf("搜索 'card': %s\n", search(root, "card") ? "找到" : "未找到");
printf("搜索 'care': %s\n", search(root, "care") ? "找到" : "未找到");
printf("搜索 'careful': %s\n", search(root, "careful") ? "找到" : "未找到");
printf("搜索 'cards': %s\n", search(root, "cards") ? "找到" : "未找到");

// 前缀测试
printf("以 'car' 开头: %s\n", starts_with(root, "car") ? "是" : "否");
printf("以 'pos' 开头: %s\n", starts_with(root, "pos") ? "是" : "否");

// 自动补全测试
char* suggestions[100];
int suggestion_count;
auto_complete(root, "car", suggestions, &suggestion_count);

printf("以 'car' 开头的单词:\n");
for (int i = 0; i < suggestion_count; i++) {
printf(" %s\n", suggestions[i]);
free(suggestions[i]);
}

return 0;
}

应用场景

  1. 搜索引擎:自动补全功能
  2. 文本编辑器:拼写检查
  3. IP路由:最长前缀匹配
  4. 生物信息学:DNA序列匹配

版权所有,如有侵权请联系我