C语言字符串处理高级技巧与实战

字符串处理是C语言编程中的重要组成部分,掌握高级的字符串处理技巧对于文本处理、数据解析和系统编程至关重要。本文将深入探讨C语言中的字符串处理技术和实用技巧。

1. 字符串基础与内存管理

1.1 字符串表示与存储

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#include <ctype.h>
#include <limits.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Growable string buffer. dstring_create and the dstring_append* functions
// keep `data` NUL-terminated.
typedef struct {
char *data; // heap-allocated character buffer
size_t length; // number of characters stored, not counting the terminator
size_t capacity; // size in bytes of the allocation behind `data`
} DynamicString;

// Allocate an empty dynamic string whose buffer holds `initial_capacity`
// bytes (16 when 0 is passed).
// Returns NULL when either allocation fails.
DynamicString* dstring_create(size_t initial_capacity) {
    DynamicString *str = malloc(sizeof *str);
    if (str == NULL) {
        return NULL;
    }

    const size_t cap = (initial_capacity == 0) ? 16 : initial_capacity;
    char *buf = malloc(cap);
    if (buf == NULL) {
        free(str);
        return NULL;
    }

    buf[0] = '\0';  // start out as the empty string
    str->data = buf;
    str->length = 0;
    str->capacity = cap;
    return str;
}

// Grow the buffer of `ds` to `new_capacity` bytes.
// A request that does not exceed the current capacity is a no-op.
// Returns 0 on success (including the no-op case) and -1 when `ds` is NULL
// or the reallocation fails; on failure the existing buffer is untouched.
int dstring_resize(DynamicString *ds, size_t new_capacity) {
    if (!ds) {
        return -1;  // a NULL string is an error, as in the other dstring_* functions
    }
    if (new_capacity <= ds->capacity) {
        return 0;
    }

    // Keep the old pointer until realloc is known to have succeeded.
    char *grown = realloc(ds->data, new_capacity);
    if (!grown) {
        return -1;
    }

    ds->data = grown;
    ds->capacity = new_capacity;
    return 0;
}

// Make sure the buffer of `ds` is strictly larger than `required` bytes,
// doubling the capacity until it is.
// Returns 0 on success, -1 when `ds` is NULL, the needed capacity cannot be
// represented in size_t, or the reallocation fails.
int dstring_ensure_capacity(DynamicString *ds, size_t required) {
    if (!ds) return -1;

    if (required < ds->capacity) {
        return 0;
    }

    // A zero capacity would never grow by doubling; start from the default.
    size_t new_capacity = ds->capacity > 0 ? ds->capacity : 16;
    while (new_capacity <= required) {
        if (new_capacity > SIZE_MAX / 2) {
            return -1;  // doubling would wrap around and loop forever
        }
        new_capacity *= 2;
    }

    return dstring_resize(ds, new_capacity);
}

// Append the NUL-terminated string `str` to `ds`, growing the buffer when
// needed. Returns 0 on success, -1 on NULL arguments or allocation failure.
int dstring_append(DynamicString *ds, const char *str) {
    if (ds == NULL || str == NULL) {
        return -1;
    }

    const size_t extra = strlen(str);
    if (dstring_ensure_capacity(ds, ds->length + extra + 1) != 0) {
        return -1;
    }

    // Copy the text together with its terminator.
    memcpy(ds->data + ds->length, str, extra + 1);
    ds->length += extra;
    return 0;
}

// Append the single character `c` to `ds` and re-terminate the buffer.
// Returns 0 on success, -1 when `ds` is NULL or the buffer cannot grow.
int dstring_append_char(DynamicString *ds, char c) {
    if (ds == NULL) {
        return -1;
    }
    if (dstring_ensure_capacity(ds, ds->length + 2) != 0) {
        return -1;
    }

    char *tail = ds->data + ds->length;
    tail[0] = c;
    tail[1] = '\0';
    ds->length += 1;
    return 0;
}

// Release the buffer and the DynamicString itself. NULL is accepted.
void dstring_destroy(DynamicString *ds) {
    if (ds == NULL) {
        return;
    }
    free(ds->data);
    free(ds);
}

1.2 安全的字符串操作

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// Bounded string copy: writes at most dest_size - 1 characters of `src`
// into `dest` and always NUL-terminates the result (truncating if needed).
// Returns `dest`, or NULL when an argument is NULL or dest_size is 0.
char* safe_strcpy(char *dest, const char *src, size_t dest_size) {
    if (dest == NULL || src == NULL || dest_size == 0) {
        return NULL;
    }

    const size_t room = dest_size - 1;  // space left once the terminator is reserved
    size_t n = strlen(src);
    if (n > room) {
        n = room;
    }

    memcpy(dest, src, n);
    dest[n] = '\0';
    return dest;
}

// Bounded string concatenation: appends as much of `src` to `dest` as fits
// in a buffer of `dest_size` bytes and NUL-terminates the result.
// Returns `dest`, or NULL when an argument is NULL or dest_size is 0.
// If `dest` holds no terminator within its first dest_size bytes it is
// returned unchanged, because nothing can be appended without overflowing.
char* safe_strcat(char *dest, const char *src, size_t dest_size) {
    if (!dest || !src || dest_size == 0) return NULL;

    // Search for the terminator inside the buffer only; strlen() would run
    // past the end of an unterminated buffer.
    const char *terminator = memchr(dest, '\0', dest_size);
    if (!terminator) return dest;

    size_t dest_len = (size_t)(terminator - dest);
    size_t remaining = dest_size - dest_len - 1;
    size_t src_len = strlen(src);
    size_t copy_len = (src_len < remaining) ? src_len : remaining;

    memcpy(dest + dest_len, src, copy_len);
    dest[dest_len + copy_len] = '\0';

    return dest;
}

// Bounded printf-style formatting into `buffer` of `buffer_size` bytes.
// Returns what vsnprintf returns (the length the full output would have),
// or -1 when an argument is NULL or buffer_size is 0.
int safe_sprintf(char *buffer, size_t buffer_size, const char *format, ...) {
    if (buffer == NULL || format == NULL || buffer_size == 0) {
        return -1;
    }

    va_list ap;
    va_start(ap, format);
    const int written = vsnprintf(buffer, buffer_size, format, ap);
    va_end(ap);

    // Force a terminator in the last slot regardless of the outcome above.
    buffer[buffer_size - 1] = '\0';
    return written;
}

2. 字符串查找与匹配

2.1 基础查找算法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
// Brute-force substring search.
// Returns the index of the first occurrence of `pattern` in `text`,
// 0 for an empty pattern, and -1 when there is no match or an argument
// is NULL.
int naive_string_search(const char *text, const char *pattern) {
    if (text == NULL || pattern == NULL) {
        return -1;
    }

    int n = strlen(text);
    int m = strlen(pattern);

    if (m == 0) {
        return 0;
    }
    if (m > n) {
        return -1;
    }

    // Try every start position at which the whole pattern still fits.
    const int last_start = n - m;
    for (int start = 0; start <= last_start; ++start) {
        if (memcmp(text + start, pattern, (size_t)m) == 0) {
            return start;
        }
    }
    return -1;
}

// KMP (Knuth-Morris-Pratt) search.

// Fill lps[0 .. pattern_len-1], where lps[i] is the length of the longest
// proper prefix of pattern[0..i] that is also a suffix of it.
// Does nothing when an argument is NULL or pattern_len is not positive
// (lps[0] must not be written for an empty table).
void compute_lps_array(const char *pattern, int pattern_len, int *lps) {
    if (!pattern || !lps || pattern_len <= 0) return;

    int len = 0;  // length of the prefix/suffix currently being extended
    lps[0] = 0;
    int i = 1;

    while (i < pattern_len) {
        if (pattern[i] == pattern[len]) {
            lps[i++] = ++len;
        } else if (len != 0) {
            // Fall back to the next shorter border; i is not advanced.
            len = lps[len - 1];
        } else {
            lps[i++] = 0;
        }
    }
}

// Return the index of the first occurrence of `pattern` in `text`,
// 0 for an empty pattern, and -1 when there is no match or an argument
// is NULL.
int kmp_search(const char *text, const char *pattern) {
    if (!text || !pattern) return -1;

    int text_len = (int)strlen(text);
    int pattern_len = (int)strlen(pattern);

    if (pattern_len == 0) return 0;
    if (pattern_len > text_len) return -1;

    int *lps = malloc((size_t)pattern_len * sizeof *lps);
    if (!lps) {
        // No memory for the table: fall back to strstr so the result is
        // still correct and -1 never means "out of memory".
        const char *hit = strstr(text, pattern);
        return hit ? (int)(hit - text) : -1;
    }
    compute_lps_array(pattern, pattern_len, lps);

    int result = -1;
    int i = 0;  // index into text
    int j = 0;  // index into pattern

    while (i < text_len) {
        if (pattern[j] == text[i]) {
            i++;
            j++;
            if (j == pattern_len) {
                result = i - j;
                break;
            }
        } else if (j != 0) {
            j = lps[j - 1];  // reuse the already matched border
        } else {
            i++;
        }
    }

    free(lps);
    return result;
}

// Boyer-Moore search (simplified: bad-character rule only).
#define ALPHABET_SIZE 256

// Fill bad_char[b] with the last index at which byte value b occurs in
// pattern[0 .. pattern_len-1], or -1 when it does not occur.
void compute_bad_char_heuristic(const char *pattern, int pattern_len, int bad_char[ALPHABET_SIZE]) {
    for (int i = 0; i < ALPHABET_SIZE; i++) {
        bad_char[i] = -1;
    }

    for (int i = 0; i < pattern_len; i++) {
        // Index by unsigned byte value: plain char may be negative for
        // bytes >= 0x80, which would index before the table.
        bad_char[(unsigned char)pattern[i]] = i;
    }
}

// Return the index of the first occurrence of `pattern` in `text`,
// 0 for an empty pattern, and -1 when there is no match or an argument
// is NULL.
int boyer_moore_search(const char *text, const char *pattern) {
    if (!text || !pattern) return -1;

    int text_len = (int)strlen(text);
    int pattern_len = (int)strlen(pattern);

    if (pattern_len == 0) return 0;
    if (pattern_len > text_len) return -1;

    int bad_char[ALPHABET_SIZE];
    compute_bad_char_heuristic(pattern, pattern_len, bad_char);

    int shift = 0;
    while (shift <= text_len - pattern_len) {
        // Compare right to left.
        int j = pattern_len - 1;
        while (j >= 0 && pattern[j] == text[shift + j]) {
            j--;
        }

        if (j < 0) {
            return shift;
        }

        // Align the mismatching text byte with its last occurrence in the
        // pattern, always moving forward by at least one position.
        int bad_char_shift = j - bad_char[(unsigned char)text[shift + j]];
        shift += (bad_char_shift > 1) ? bad_char_shift : 1;
    }

    return -1;
}

2.2 多模式匹配

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
// Simplified Aho-Corasick automaton.
// MAX_STATES: number of state slots in every table of the automaton.
#define MAX_STATES 500
// MAX_CHARS: number of columns per state; the functions in this file compute
// the column as (character - 'a').
#define MAX_CHARS 26

typedef struct {
int go[MAX_STATES][MAX_CHARS]; // transition table; ac_init fills it with -1 (no transition)
int fail[MAX_STATES]; // failure link of each state
int output[MAX_STATES]; // pattern id stored for a state; 0 is treated as "no pattern"
int states; // number of states in use; ac_init sets it to 1 (state 0 is the root)
} AhoCorasick;

// Reset the automaton: no transitions, all failure links and outputs 0,
// and a single state (the root).
void ac_init(AhoCorasick *ac) {
    for (int state = 0; state < MAX_STATES; state++) {
        for (int ch = 0; ch < MAX_CHARS; ch++) {
            ac->go[state][ch] = -1;
        }
        ac->fail[state] = 0;
        ac->output[state] = 0;
    }
    ac->states = 1;
}

// Insert `pattern` into the trie and record `pattern_id` on its final state.
// Only characters that map into the MAX_CHARS columns ('a' + 0 ..
// 'a' + MAX_CHARS - 1) are supported. The pattern is ignored as a whole,
// leaving the automaton unchanged, when an argument is NULL, when it
// contains an unsupported character, or when it would need more than
// MAX_STATES states.
void ac_add_pattern(AhoCorasick *ac, const char *pattern, int pattern_id) {
    if (!ac || !pattern) return;
    if (ac->states < 1 || ac->states > MAX_STATES) return;

    // Dry run: validate every character and count the states that would
    // have to be created, so a rejected pattern leaves no partial path.
    int probe = 0;  // current trie state, or -1 once the path leaves the trie
    int needed = 0;
    for (int i = 0; pattern[i]; i++) {
        int c = pattern[i] - 'a';
        if (c < 0 || c >= MAX_CHARS) return;

        if (probe != -1 && ac->go[probe][c] != -1) {
            probe = ac->go[probe][c];
        } else {
            probe = -1;
            needed++;
        }
    }
    if (needed > MAX_STATES - ac->states) return;

    int current_state = 0;
    for (int i = 0; pattern[i]; i++) {
        int c = pattern[i] - 'a';
        if (ac->go[current_state][c] == -1) {
            ac->go[current_state][c] = ac->states++;
        }
        current_state = ac->go[current_state][c];
    }

    ac->output[current_state] = pattern_id;
}

// Compute the failure links breadth-first and OR each state's output with
// the output of the state its failure link points to.
// Missing transitions of the root are turned into self-loops (0).
void ac_build_failure_function(AhoCorasick *ac) {
    int pending[MAX_STATES];  // BFS queue of states still to be expanded
    int head = 0;
    int tail = 0;

    // First level: children of the root fail back to the root.
    for (int ch = 0; ch < MAX_CHARS; ch++) {
        const int child = ac->go[0][ch];
        if (child != -1) {
            ac->fail[child] = 0;
            pending[tail++] = child;
        } else {
            ac->go[0][ch] = 0;
        }
    }

    while (head < tail) {
        const int state = pending[head++];

        for (int ch = 0; ch < MAX_CHARS; ch++) {
            const int child = ac->go[state][ch];
            if (child == -1) {
                continue;
            }

            // Follow failure links until a state with a transition on ch is
            // found; the root has one for every ch after the loop above.
            int fallback = ac->fail[state];
            while (ac->go[fallback][ch] == -1) {
                fallback = ac->fail[fallback];
            }

            const int target = ac->go[fallback][ch];
            ac->fail[child] = target;
            ac->output[child] |= ac->output[target];

            pending[tail++] = child;
        }
    }
}

// Run `text` through the automaton and print one line for every position
// at which the current state carries a non-zero output.
// Characters outside the supported MAX_CHARS columns cannot be part of any
// pattern, so they reset the automaton to the root instead of being used as
// an out-of-range table index. Does nothing when an argument is NULL.
void ac_search(AhoCorasick *ac, const char *text) {
    if (!ac || !text) return;

    int current_state = 0;

    for (int i = 0; text[i]; i++) {
        int c = text[i] - 'a';
        if (c < 0 || c >= MAX_CHARS) {
            current_state = 0;
            continue;
        }

        // Follow failure links; stopping at the root keeps the loop finite
        // even if the root has no transition for c.
        while (current_state != 0 && ac->go[current_state][c] == -1) {
            current_state = ac->fail[current_state];
        }

        int next_state = ac->go[current_state][c];
        current_state = (next_state == -1) ? 0 : next_state;

        if (ac->output[current_state] != 0) {
            printf("在位置 %d 找到模式 %d\n", i, ac->output[current_state]);
        }
    }
}

3. 字符串解析与处理

3.1 字符串分割

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
// Result of a split: a growable array of heap-allocated token strings.
typedef struct {
    char **tokens;  // array with `capacity` slots; the first `count` hold strings
    int count;      // number of tokens stored
    int capacity;   // number of slots allocated in `tokens`
} StringTokens;

// Declared ahead of its definition because string_split and
// string_split_advanced call it on their error paths.
void tokens_destroy(StringTokens *tokens);

// Allocate an empty token list with room for 10 tokens.
// Returns NULL when an allocation fails.
StringTokens* tokens_create() {
    StringTokens *list = malloc(sizeof *list);
    if (list == NULL) {
        return NULL;
    }

    list->count = 0;
    list->capacity = 10;
    list->tokens = malloc(list->capacity * sizeof *list->tokens);
    if (list->tokens == NULL) {
        free(list);
        return NULL;
    }

    return list;
}

// Append a private copy of `token` to the list, doubling the slot array
// when it is full.
// Returns 0 on success and -1 on NULL arguments or allocation failure; on
// failure the list is left exactly as it was.
int tokens_add(StringTokens *tokens, const char *token) {
    if (!tokens || !token) return -1;

    if (tokens->count >= tokens->capacity) {
        if (tokens->capacity > INT_MAX / 2) return -1;
        int new_capacity = tokens->capacity > 0 ? tokens->capacity * 2 : 10;

        char **new_tokens = realloc(tokens->tokens,
                                    (size_t)new_capacity * sizeof *new_tokens);
        if (!new_tokens) return -1;

        // Publish the larger capacity only after the array really grew;
        // otherwise a later call would write past the old allocation.
        tokens->tokens = new_tokens;
        tokens->capacity = new_capacity;
    }

    char *copy = strdup(token);
    if (!copy) return -1;

    tokens->tokens[tokens->count++] = copy;
    return 0;
}

// Split `str` at any character contained in `delimiters`.
// Runs of delimiters count as one separator, so no empty tokens are
// produced. Returns a new token list (release with tokens_destroy), or NULL
// on NULL arguments or allocation failure.
StringTokens* string_split(const char *str, const char *delimiters) {
    if (!str || !delimiters) return NULL;

    StringTokens *tokens = tokens_create();
    if (!tokens) return NULL;

    // Work on a private copy because tokens are terminated in place.
    char *str_copy = strdup(str);
    if (!str_copy) {
        tokens_destroy(tokens);
        return NULL;
    }

    // strspn/strcspn instead of strtok: same tokenisation, but no hidden
    // static state, so the function is reentrant and does not disturb a
    // strtok sequence the caller may have in progress.
    char *cursor = str_copy;
    for (;;) {
        cursor += strspn(cursor, delimiters);  // skip leading delimiters
        if (*cursor == '\0') break;

        char *token = cursor;
        cursor += strcspn(cursor, delimiters);  // advance to the token's end
        if (*cursor != '\0') {
            *cursor++ = '\0';
        }

        if (tokens_add(tokens, token) != 0) {
            free(str_copy);
            tokens_destroy(tokens);
            return NULL;
        }
    }

    free(str_copy);
    return tokens;
}

// Split `str` at `delimiter`, with quoting and escaping:
//   - the character after `escape_char` is taken literally,
//   - `quote_char` toggles quoted mode, in which `delimiter` is literal,
//   - quote and escape characters themselves are not copied to the tokens.
// Empty fields between two delimiters are kept; the text after the last
// delimiter is added only when it is non-empty.
// Returns a new token list (release with tokens_destroy), or NULL when
// `str` is NULL or an allocation fails.
StringTokens* string_split_advanced(const char *str, char delimiter,
                                    char quote_char, char escape_char) {
    if (!str) return NULL;

    StringTokens *tokens = tokens_create();
    if (!tokens) return NULL;

    DynamicString *current_token = dstring_create(0);
    if (!current_token) {
        tokens_destroy(tokens);
        return NULL;
    }

    bool in_quotes = false;
    bool escaped = false;
    bool failed = false;  // set as soon as any append or add fails

    for (size_t i = 0; str[i] && !failed; i++) {
        char c = str[i];

        if (escaped) {
            failed = dstring_append_char(current_token, c) != 0;
            escaped = false;
        } else if (c == escape_char) {
            escaped = true;
        } else if (c == quote_char) {
            in_quotes = !in_quotes;
        } else if (c == delimiter && !in_quotes) {
            // Finish the current token and start a new, empty one.
            failed = tokens_add(tokens, current_token->data) != 0;
            current_token->length = 0;
            current_token->data[0] = '\0';
        } else {
            failed = dstring_append_char(current_token, c) != 0;
        }
    }

    // Add the trailing token.
    if (!failed && current_token->length > 0) {
        failed = tokens_add(tokens, current_token->data) != 0;
    }

    dstring_destroy(current_token);

    if (failed) {
        // Never hand back a list with silently missing characters or tokens.
        tokens_destroy(tokens);
        return NULL;
    }
    return tokens;
}

// Free every stored token, the slot array and the list itself.
// NULL is accepted.
void tokens_destroy(StringTokens *tokens) {
    if (tokens == NULL) {
        return;
    }

    int idx = 0;
    while (idx < tokens->count) {
        free(tokens->tokens[idx]);
        idx++;
    }
    free(tokens->tokens);
    free(tokens);
}

3.2 字符串替换

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
// Return a newly allocated copy of `str` in which every non-overlapping
// occurrence of `old_substr` (scanning left to right) is replaced by
// `new_substr`. An empty `old_substr` matches nothing, so the result is a
// plain copy. The caller frees the result.
// Returns NULL on NULL arguments, when the result size would overflow
// size_t, or when the allocation fails.
char* string_replace(const char *str, const char *old_substr, const char *new_substr) {
    if (!str || !old_substr || !new_substr) return NULL;

    // Lengths are kept in size_t: int arithmetic could overflow on long
    // inputs and under-allocate the result.
    const size_t str_len = strlen(str);
    const size_t old_len = strlen(old_substr);
    const size_t new_len = strlen(new_substr);

    // Count the occurrences to size the result exactly.
    size_t count = 0;
    if (old_len > 0) {
        const char *scan = str;
        while ((scan = strstr(scan, old_substr)) != NULL) {
            count++;
            scan += old_len;
        }
    }

    size_t result_len = str_len;
    if (new_len >= old_len) {
        const size_t growth = new_len - old_len;
        if (growth != 0 && count > (SIZE_MAX - 1 - str_len) / growth) {
            return NULL;
        }
        result_len += count * growth;
    } else {
        // count * old_len <= str_len, so this cannot underflow.
        result_len -= count * (old_len - new_len);
    }

    char *result = malloc(result_len + 1);
    if (!result) return NULL;

    char *dest = result;
    const char *src = str;

    if (count > 0) {
        const char *hit;
        while ((hit = strstr(src, old_substr)) != NULL) {
            // Copy the text in front of the match, then the replacement.
            const size_t prefix_len = (size_t)(hit - src);
            memcpy(dest, src, prefix_len);
            dest += prefix_len;

            memcpy(dest, new_substr, new_len);
            dest += new_len;

            src = hit + old_len;
        }
    }

    // Copy the remainder together with its terminator.
    memcpy(dest, src, strlen(src) + 1);

    return result;
}

// "Regex-style" replacement (simplified).
// Despite the name, `pattern` is matched as a literal substring: no regex
// or wildcard syntax is interpreted. A real implementation could integrate
// a regex library such as PCRE.
// Returns a newly allocated string in which every occurrence of `pattern`
// is replaced by `replacement` (the caller frees it), or NULL on NULL
// arguments or allocation failure. An empty pattern yields a plain copy.
char* string_replace_regex(const char *str, const char *pattern, const char *replacement) {
    if (!str || !pattern || !replacement) return NULL;

    const size_t str_len = strlen(str);
    const size_t pattern_len = strlen(pattern);

    // An empty pattern "matches" at every position without consuming any
    // input, which would append the replacement forever.
    if (pattern_len == 0) return strdup(str);

    DynamicString *result = dstring_create(str_len * 2);
    if (!result) return NULL;

    size_t i = 0;
    while (i < str_len) {
        int rc;
        if (strncmp(str + i, pattern, pattern_len) == 0) {
            rc = dstring_append(result, replacement);
            i += pattern_len;
        } else {
            rc = dstring_append_char(result, str[i]);
            i++;
        }

        if (rc != 0) {
            // Out of memory: do not return a silently truncated result.
            dstring_destroy(result);
            return NULL;
        }
    }

    char *final_result = strdup(result->data);
    dstring_destroy(result);
    return final_result;
}

3.3 字符串格式化与解析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
// printf-style formatting into a freshly allocated string of exactly the
// needed size. The caller frees the result.
// Returns NULL when formatting reports an error or the allocation fails.
char* string_format(const char *format, ...) {
    va_list measure_args;
    va_list write_args;
    va_start(measure_args, format);
    va_copy(write_args, measure_args);

    // First pass: measure only.
    const int needed = vsnprintf(NULL, 0, format, measure_args);
    va_end(measure_args);

    char *text = NULL;
    if (needed >= 0) {
        text = malloc((size_t)needed + 1);
        if (text != NULL) {
            // Second pass: produce the text.
            vsnprintf(text, (size_t)needed + 1, format, write_args);
        }
    }

    va_end(write_args);
    return text;
}

// CSV parser
// Table produced by csv_parse.
typedef struct {
char ***rows; // rows[r][c] is the heap-allocated text of column c in row r
int *column_counts; // column_counts[r] is the number of columns stored in rows[r]
int row_count; // number of rows stored
int capacity; // number of slots allocated in rows and column_counts
} CSVData;

// Free everything owned by a (possibly partially built) CSVData.
static void csv_discard(CSVData *csv) {
    if (!csv) return;

    if (csv->rows && csv->column_counts) {
        for (int r = 0; r < csv->row_count; r++) {
            for (int c = 0; c < csv->column_counts[r]; c++) {
                free(csv->rows[r][c]);
            }
            free(csv->rows[r]);
        }
    }
    free(csv->rows);
    free(csv->column_counts);
    free(csv);
}

// Duplicate the tokens of one parsed line into a new row array.
// Returns NULL when an allocation fails; nothing is leaked in that case.
static char **csv_copy_row(const StringTokens *columns) {
    // Allocate at least one slot so an empty row is not mistaken for failure.
    size_t slots = columns->count > 0 ? (size_t)columns->count : 1;
    char **row = malloc(slots * sizeof *row);
    if (!row) return NULL;

    for (int j = 0; j < columns->count; j++) {
        row[j] = strdup(columns->tokens[j]);
        if (!row[j]) {
            while (j-- > 0) free(row[j]);
            free(row);
            return NULL;
        }
    }
    return row;
}

// Parse CSV text into rows of columns.
// Lines are separated by '\n'; within a line, columns are separated by ','
// with '"' as the quote character and '\\' as the escape character (see
// string_split_advanced).
// Returns a new CSVData, or NULL when `csv_string` is NULL or any
// allocation fails (everything built so far is released).
CSVData* csv_parse(const char *csv_string) {
    if (!csv_string) return NULL;

    CSVData *csv = malloc(sizeof *csv);
    if (!csv) return NULL;

    csv->capacity = 10;
    csv->row_count = 0;
    csv->rows = malloc((size_t)csv->capacity * sizeof *csv->rows);
    csv->column_counts = malloc((size_t)csv->capacity * sizeof *csv->column_counts);

    if (!csv->rows || !csv->column_counts) {
        csv_discard(csv);
        return NULL;
    }

    // Split into lines.
    StringTokens *lines = string_split(csv_string, "\n");
    if (!lines) {
        csv_discard(csv);
        return NULL;
    }

    for (int i = 0; i < lines->count; i++) {
        if (lines->tokens[i][0] == '\0') continue;

        // Grow both arrays together; keep the old pointers until each
        // realloc has succeeded.
        if (csv->row_count >= csv->capacity) {
            if (csv->capacity > INT_MAX / 2) goto fail;
            int new_capacity = csv->capacity * 2;

            char ***new_rows = realloc(csv->rows, (size_t)new_capacity * sizeof *new_rows);
            if (!new_rows) goto fail;
            csv->rows = new_rows;

            int *new_counts = realloc(csv->column_counts, (size_t)new_capacity * sizeof *new_counts);
            if (!new_counts) goto fail;
            csv->column_counts = new_counts;

            csv->capacity = new_capacity;
        }

        // Parse one CSV line (quotes supported).
        StringTokens *columns = string_split_advanced(lines->tokens[i], ',', '"', '\\');
        if (!columns) goto fail;

        char **row = csv_copy_row(columns);
        if (!row) {
            tokens_destroy(columns);
            goto fail;
        }

        csv->rows[csv->row_count] = row;
        csv->column_counts[csv->row_count] = columns->count;
        csv->row_count++;
        tokens_destroy(columns);
    }

    tokens_destroy(lines);
    return csv;

fail:
    tokens_destroy(lines);
    csv_discard(csv);
    return NULL;
}

// JSON parser (simplified)
// Tag that says which kind of value a JsonValue holds.
typedef enum {
JSON_NULL,
JSON_BOOL,
JSON_NUMBER,
JSON_STRING,
JSON_ARRAY,
JSON_OBJECT
} JsonType;

// One JSON value: a type tag plus a union with the payload.
// json_parse_number stores into number_value with type JSON_NUMBER and
// json_parse_string into string_value with type JSON_STRING; the other
// members presumably pair with the like-named JsonType tags, but no code in
// this file sets them — TODO confirm.
typedef struct JsonValue {
JsonType type;
union {
int bool_value;
double number_value;
char *string_value; // heap-allocated, NUL-terminated text
struct {
struct JsonValue **items;
int count;
} array_value;
struct {
char **keys;
struct JsonValue **values;
int count;
} object_value;
} data;
} JsonValue;

// Return a pointer to the first non-whitespace character of `json` (which
// is the terminator when only whitespace remains). NULL is passed through.
const char* skip_whitespace(const char *json) {
    if (!json) return NULL;

    // isspace() requires an unsigned char value (or EOF); plain char can be
    // negative for bytes >= 0x80, which is undefined behaviour.
    while (*json && isspace((unsigned char)*json)) {
        json++;
    }
    return json;
}

// Parse a JSON string (simplified implementation).
// On entry *json must point at the opening quote. On success *json is moved
// past the closing quote and a new JSON_STRING value is returned whose text
// is the raw content between the quotes: escape sequences are copied
// verbatim, not decoded.
// Returns NULL when the input is NULL or empty, the closing quote is
// missing, or an allocation fails (*json may already have been advanced).
JsonValue* json_parse_string(const char **json) {
    if (!json || !*json || **json == '\0') return NULL;

    const char *start = ++(*json);  // skip the opening quote

    while (**json && **json != '"') {
        // Skip the character after a backslash, but never step over the
        // terminator when the backslash is the last character of the input.
        if (**json == '\\' && (*json)[1] != '\0') {
            (*json)++;
        }
        (*json)++;
    }

    if (**json != '"') return NULL;

    size_t len = (size_t)(*json - start);
    JsonValue *value = malloc(sizeof *value);
    if (!value) return NULL;

    value->type = JSON_STRING;
    value->data.string_value = malloc(len + 1);
    if (!value->data.string_value) {
        free(value);
        return NULL;
    }

    memcpy(value->data.string_value, start, len);
    value->data.string_value[len] = '\0';

    (*json)++;  // skip the closing quote
    return value;
}

// Parse a number at *json with strtod.
// On success returns a new JSON_NUMBER value and moves *json past the
// consumed characters. Returns NULL when nothing could be parsed or the
// allocation fails; *json is left unchanged in that case.
JsonValue* json_parse_number(const char **json) {
    char *stop = NULL;
    const double parsed = strtod(*json, &stop);
    if (stop == *json) {
        return NULL;  // strtod consumed nothing
    }

    JsonValue *node = malloc(sizeof *node);
    if (node == NULL) {
        return NULL;
    }

    node->type = JSON_NUMBER;
    node->data.number_value = parsed;
    *json = stop;
    return node;
}

4. 字符编码与转换

4.1 UTF-8处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
// Length in bytes of the UTF-8 sequence introduced by `first_byte`,
// determined from its leading bit pattern, or -1 when the byte cannot
// start a sequence.
int utf8_char_length(unsigned char first_byte) {
    if (first_byte < 0x80) {
        return 1;  // 0xxxxxxx
    }
    if (first_byte >= 0xC0 && first_byte <= 0xDF) {
        return 2;  // 110xxxxx
    }
    if (first_byte >= 0xE0 && first_byte <= 0xEF) {
        return 3;  // 1110xxxx
    }
    if (first_byte >= 0xF0 && first_byte <= 0xF7) {
        return 4;  // 11110xxx
    }
    return -1;  // continuation byte (10xxxxxx) or 11111xxx
}

// Check that `str` is well-formed UTF-8.
// Besides the lead/continuation byte structure, this rejects overlong
// encodings, UTF-16 surrogates (U+D800..U+DFFF) and code points above
// U+10FFFF. Returns false for NULL; the empty string is valid.
bool is_valid_utf8(const char *str) {
    if (!str) return false;

    const unsigned char *bytes = (const unsigned char*)str;

    while (*bytes) {
        const unsigned char lead = *bytes;
        int char_len;
        unsigned long code_point;
        unsigned long smallest;  // smallest code point that needs char_len bytes

        if (lead < 0x80) {
            bytes++;
            continue;
        } else if ((lead & 0xE0) == 0xC0) {
            char_len = 2; code_point = lead & 0x1Fu; smallest = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {
            char_len = 3; code_point = lead & 0x0Fu; smallest = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {
            char_len = 4; code_point = lead & 0x07u; smallest = 0x10000;
        } else {
            return false;  // stray continuation byte or invalid lead byte
        }

        // Check the continuation bytes. A terminator inside the sequence
        // fails the test too, so the scan never runs past the string.
        for (int i = 1; i < char_len; i++) {
            if ((bytes[i] & 0xC0) != 0x80) {
                return false;
            }
            code_point = (code_point << 6) | (bytes[i] & 0x3Fu);
        }

        if (code_point < smallest) return false;                          // overlong
        if (code_point > 0x10FFFF) return false;                          // out of range
        if (code_point >= 0xD800 && code_point <= 0xDFFF) return false;   // surrogate

        bytes += char_len;
    }

    return true;
}

// Count the UTF-8 encoded characters in `str`.
// Returns 0 for NULL and -1 when a byte cannot start a sequence or a
// sequence is cut short or has a bad continuation byte.
int utf8_strlen(const char *str) {
    if (!str) return 0;

    int count = 0;
    const unsigned char *bytes = (const unsigned char*)str;

    while (*bytes) {
        int char_len = utf8_char_length(*bytes);
        if (char_len < 0) return -1;

        // Verify the continuation bytes before skipping them; otherwise a
        // truncated sequence would move the cursor past the terminator.
        for (int i = 1; i < char_len; i++) {
            if ((bytes[i] & 0xC0) != 0x80) return -1;
        }

        bytes += char_len;
        count++;
    }

    return count;
}

// Decode the UTF-8 sequence at `utf8_char` into a code point.
// On success stores the code point in *unicode and returns the number of
// bytes consumed (1..4). Returns -1, leaving *unicode untouched, when an
// argument is NULL, the first byte cannot start a sequence, or a
// continuation byte is missing or malformed.
// Overlong encodings and surrogate values are not rejected here.
int utf8_to_unicode(const char *utf8_char, int *unicode) {
    if (!utf8_char || !unicode) return -1;

    const unsigned char *bytes = (const unsigned char*)utf8_char;
    int char_len = utf8_char_length(bytes[0]);

    if (char_len < 1 || char_len > 4) return -1;

    // Payload bits carried by the lead byte, indexed by sequence length.
    static const unsigned char lead_mask[5] = { 0x00, 0x7F, 0x1F, 0x0F, 0x07 };

    int code_point = bytes[0] & lead_mask[char_len];
    for (int i = 1; i < char_len; i++) {
        // Also stops at the terminator, so truncated input is never read
        // beyond its end.
        if ((bytes[i] & 0xC0) != 0x80) return -1;
        code_point = (code_point << 6) | (bytes[i] & 0x3F);
    }

    *unicode = code_point;
    return char_len;
}

// Encode the code point `unicode` as UTF-8 into `utf8_char`, which must
// have room for 5 bytes (up to 4 encoded bytes plus the terminator).
// Returns the number of encoded bytes (1..4), or -1 when `utf8_char` is
// NULL or `unicode` is not a Unicode scalar value (negative, a surrogate
// U+D800..U+DFFF, or above U+10FFFF).
int unicode_to_utf8(int unicode, char *utf8_char) {
    if (!utf8_char) return -1;

    if (unicode < 0 || unicode > 0x10FFFF) return -1;
    if (unicode >= 0xD800 && unicode <= 0xDFFF) return -1;  // surrogates are not encodable

    int length;
    if (unicode <= 0x7F) {
        utf8_char[0] = (char)unicode;
        length = 1;
    } else if (unicode <= 0x7FF) {
        utf8_char[0] = (char)(0xC0 | (unicode >> 6));
        utf8_char[1] = (char)(0x80 | (unicode & 0x3F));
        length = 2;
    } else if (unicode <= 0xFFFF) {
        utf8_char[0] = (char)(0xE0 | (unicode >> 12));
        utf8_char[1] = (char)(0x80 | ((unicode >> 6) & 0x3F));
        utf8_char[2] = (char)(0x80 | (unicode & 0x3F));
        length = 3;
    } else {
        utf8_char[0] = (char)(0xF0 | (unicode >> 18));
        utf8_char[1] = (char)(0x80 | ((unicode >> 12) & 0x3F));
        utf8_char[2] = (char)(0x80 | ((unicode >> 6) & 0x3F));
        utf8_char[3] = (char)(0x80 | (unicode & 0x3F));
        length = 4;
    }

    utf8_char[length] = '\0';
    return length;
}

4.2 编码转换

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
// Simple ASCII-to-UTF-8 conversion.
// ASCII is a subset of UTF-8, so the bytes are copied unchanged into a
// newly allocated string that the caller frees. The input is not checked
// for non-ASCII bytes. Returns NULL for NULL input or allocation failure.
char* ascii_to_utf8(const char *ascii_str) {
    if (!ascii_str) return NULL;

    // malloc + memcpy rather than strdup: strdup is POSIX, not ISO C11, and
    // is left undeclared by strict -std=c11 builds.
    const size_t size = strlen(ascii_str) + 1;
    char *copy = malloc(size);
    if (!copy) return NULL;

    memcpy(copy, ascii_str, size);
    return copy;
}

// Base64 encoding
// Alphabet of the standard Base64 encoding, indexed by 6-bit value.
static const char base64_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// Encode `input_length` bytes from `data` as padded Base64.
// Returns a newly allocated NUL-terminated string (the caller frees it), or
// NULL when `data` is NULL, the output size does not fit in size_t, or the
// allocation fails.
char* base64_encode(const unsigned char *data, size_t input_length) {
    if (!data) return NULL;

    // Every started group of 3 input bytes yields 4 output characters.
    const size_t groups = input_length / 3 + (input_length % 3 != 0);
    if (groups > (SIZE_MAX - 1) / 4) return NULL;  // size would wrap around
    const size_t output_length = groups * 4;

    char *encoded_data = malloc(output_length + 1);
    if (!encoded_data) return NULL;

    size_t in = 0;
    size_t out = 0;
    while (in < input_length) {
        const size_t remaining = input_length - in;
        const uint32_t octet_a = data[in];
        const uint32_t octet_b = remaining > 1 ? data[in + 1] : 0;
        const uint32_t octet_c = remaining > 2 ? data[in + 2] : 0;

        const uint32_t triple = (octet_a << 16) | (octet_b << 8) | octet_c;

        encoded_data[out++] = base64_chars[(triple >> 18) & 0x3F];
        encoded_data[out++] = base64_chars[(triple >> 12) & 0x3F];
        // Positions that carry no input bits are padding.
        encoded_data[out++] = remaining > 1 ? base64_chars[(triple >> 6) & 0x3F] : '=';
        encoded_data[out++] = remaining > 2 ? base64_chars[triple & 0x3F] : '=';

        in += remaining >= 3 ? 3 : remaining;
    }

    encoded_data[out] = '\0';
    return encoded_data;
}

// Map one Base64 alphabet character to its 6-bit value, or -1 when the
// character is not part of the alphabet. The range tests assume the
// letters and digits are contiguous, as they are in ASCII.
static int base64_value(unsigned char c) {
    if (c >= 'A' && c <= 'Z') return c - 'A';
    if (c >= 'a' && c <= 'z') return c - 'a' + 26;
    if (c >= '0' && c <= '9') return c - '0' + 52;
    if (c == '+') return 62;
    if (c == '/') return 63;
    return -1;
}

// Decode padded Base64 text.
// On success returns a newly allocated buffer (the caller frees it) and
// stores its length in *output_length; empty input yields a valid buffer
// of length 0. Returns NULL, with *output_length set to 0, when an argument
// is NULL, the length is not a multiple of 4, a character outside the
// alphabet occurs ('=' is accepted only as the last one or two characters),
// or the allocation fails.
unsigned char* base64_decode(const char *data, size_t *output_length) {
    if (!data || !output_length) return NULL;
    *output_length = 0;

    const size_t input_length = strlen(data);
    if (input_length % 4 != 0) return NULL;

    // Look at the padding only when there is input; indexing
    // data[input_length - 1] on an empty string would read out of bounds.
    size_t padding = 0;
    if (input_length > 0 && data[input_length - 1] == '=') {
        padding = (data[input_length - 2] == '=') ? 2 : 1;
    }

    const size_t decoded_length = input_length / 4 * 3 - padding;
    // Request at least one byte so an empty result is still non-NULL.
    unsigned char *decoded_data = malloc(decoded_length > 0 ? decoded_length : 1);
    if (!decoded_data) return NULL;

    const size_t payload_length = input_length - padding;
    size_t out = 0;

    for (size_t i = 0; i < input_length; i += 4) {
        uint32_t triple = 0;

        for (size_t k = 0; k < 4; k++) {
            int sextet = 0;  // padding positions contribute zero bits
            if (i + k < payload_length) {
                sextet = base64_value((unsigned char)data[i + k]);
                if (sextet < 0) {
                    free(decoded_data);
                    return NULL;
                }
            }
            triple = (triple << 6) | (uint32_t)sextet;
        }

        if (out < decoded_length) decoded_data[out++] = (triple >> 16) & 0xFF;
        if (out < decoded_length) decoded_data[out++] = (triple >> 8) & 0xFF;
        if (out < decoded_length) decoded_data[out++] = triple & 0xFF;
    }

    *output_length = decoded_length;
    return decoded_data;
}

5. 字符串性能优化

5.1 字符串池

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
// String pool implementation
// STRING_POOL_SIZE: number of hash buckets; string_hash reduces modulo it.
#define STRING_POOL_SIZE 1024

// One interned string in a bucket's singly linked chain.
typedef struct StringPoolNode {
char *string; // heap-allocated copy owned by the node
int ref_count; // incremented by string_pool_intern, decremented by string_pool_release
struct StringPoolNode *next; // next node in the same bucket, or NULL
} StringPoolNode;

// Hash table of interned strings.
typedef struct {
StringPoolNode *buckets[STRING_POOL_SIZE]; // chain heads, indexed by string_hash
int total_strings; // number of nodes currently in the pool
} StringPool;

// Simple multiplicative string hash (start value 5381, multiplier 33),
// reduced to a bucket index in [0, STRING_POOL_SIZE).
unsigned int string_hash(const char *str) {
    unsigned int hash = 5381;

    for (const char *p = str; *p != '\0'; ++p) {
        int c = *p;
        hash = hash * 33 + c;
    }

    return hash % STRING_POOL_SIZE;
}

// Allocate a pool with all buckets empty and a zero string count.
// Returns NULL when the allocation fails.
StringPool* string_pool_create() {
    StringPool *pool = malloc(sizeof *pool);
    if (pool == NULL) {
        return NULL;
    }

    pool->total_strings = 0;
    memset(pool->buckets, 0, sizeof pool->buckets);
    return pool;
}

// Return the pooled copy of `str`, creating it when it is not there yet.
// Every call that succeeds adds one reference to the entry.
// Returns NULL on NULL arguments or allocation failure.
const char* string_pool_intern(StringPool *pool, const char *str) {
    if (pool == NULL || str == NULL) {
        return NULL;
    }

    const unsigned int bucket = string_hash(str);

    // Reuse an existing entry with the same contents.
    for (StringPoolNode *cur = pool->buckets[bucket]; cur != NULL; cur = cur->next) {
        if (strcmp(cur->string, str) == 0) {
            cur->ref_count += 1;
            return cur->string;
        }
    }

    // Not pooled yet: create a node and push it onto the bucket's chain.
    StringPoolNode *fresh = (StringPoolNode*)malloc(sizeof *fresh);
    if (fresh == NULL) {
        return NULL;
    }

    fresh->string = strdup(str);
    if (fresh->string == NULL) {
        free(fresh);
        return NULL;
    }

    fresh->ref_count = 1;
    fresh->next = pool->buckets[bucket];
    pool->buckets[bucket] = fresh;
    pool->total_strings += 1;

    return fresh->string;
}

// Drop one reference to a pooled string. `str` is compared by address, so
// it has to be the pointer string_pool_intern returned. When the count
// reaches zero the entry is unlinked and freed. Unknown pointers and NULL
// arguments are ignored.
void string_pool_release(StringPool *pool, const char *str) {
    if (pool == NULL || str == NULL) {
        return;
    }

    const unsigned int bucket = string_hash(str);

    // `link` always addresses the pointer that leads to the current node,
    // so unlinking is the same for the chain head and for inner nodes.
    StringPoolNode **link = &pool->buckets[bucket];
    while (*link != NULL) {
        StringPoolNode *cur = *link;

        if (cur->string == str) {
            cur->ref_count -= 1;
            if (cur->ref_count == 0) {
                *link = cur->next;
                free(cur->string);
                free(cur);
                pool->total_strings -= 1;
            }
            return;
        }

        link = &cur->next;
    }
}

5.2 字符串缓存和重用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
// String builder (avoids frequent memory allocation)
typedef struct {
char *buffer; // heap-allocated text; the sb_* functions keep it NUL-terminated
size_t length; // number of characters stored, not counting the terminator
size_t capacity; // size in bytes of the allocation behind `buffer`
size_t growth_factor; // multiplier applied to capacity when growing; sb_create sets 2
} StringBuilder;

// Allocate an empty builder whose buffer holds `initial_capacity` bytes
// (64 when 0 is passed) and whose growth factor is 2.
// Returns NULL when an allocation fails.
StringBuilder* sb_create(size_t initial_capacity) {
    StringBuilder *builder = malloc(sizeof *builder);
    if (builder == NULL) {
        return NULL;
    }

    const size_t cap = (initial_capacity == 0) ? 64 : initial_capacity;
    char *storage = malloc(cap);
    if (storage == NULL) {
        free(builder);
        return NULL;
    }

    storage[0] = '\0';
    builder->buffer = storage;
    builder->length = 0;
    builder->capacity = cap;
    builder->growth_factor = 2;
    return builder;
}

// Make sure the builder's buffer is strictly larger than `required` bytes,
// multiplying the capacity by the growth factor until it is.
// Returns 0 on success, -1 when `sb` is NULL, the needed capacity cannot be
// represented in size_t, or the reallocation fails (the old buffer stays
// valid in that case).
int sb_ensure_capacity(StringBuilder *sb, size_t required) {
    if (!sb) return -1;

    if (required < sb->capacity) {
        return 0;
    }

    // A factor below 2 or a zero capacity would never make progress, so
    // fall back to the defaults used by sb_create.
    const size_t factor = sb->growth_factor >= 2 ? sb->growth_factor : 2;
    size_t new_capacity = sb->capacity > 0 ? sb->capacity : 64;

    while (new_capacity <= required) {
        if (new_capacity > SIZE_MAX / factor) {
            return -1;  // the multiplication would wrap around
        }
        new_capacity *= factor;
    }

    char *new_buffer = realloc(sb->buffer, new_capacity);
    if (!new_buffer) return -1;

    sb->buffer = new_buffer;
    sb->capacity = new_capacity;
    return 0;
}

// Append the NUL-terminated string `str` to the builder.
// Returns 0 on success, -1 on NULL arguments or when the buffer cannot grow.
int sb_append(StringBuilder *sb, const char *str) {
    if (sb == NULL || str == NULL) {
        return -1;
    }

    const size_t extra = strlen(str);
    if (sb_ensure_capacity(sb, sb->length + extra + 1) != 0) {
        return -1;
    }

    char *tail = sb->buffer + sb->length;
    memcpy(tail, str, extra);
    tail[extra] = '\0';
    sb->length += extra;

    return 0;
}

// Append printf-style formatted text to the builder.
// The text is measured first so the buffer can be grown to fit.
// Returns 0 on success, -1 on NULL arguments, a formatting error, or when
// the buffer cannot grow.
int sb_append_format(StringBuilder *sb, const char *format, ...) {
    if (sb == NULL || format == NULL) {
        return -1;
    }

    // Pass 1: measure.
    va_list measure;
    va_start(measure, format);
    const int needed = vsnprintf(NULL, 0, format, measure);
    va_end(measure);

    if (needed < 0) {
        return -1;
    }
    if (sb_ensure_capacity(sb, sb->length + needed + 1) != 0) {
        return -1;
    }

    // Pass 2: write at the current end of the text.
    va_list emit;
    va_start(emit, format);
    vsnprintf(sb->buffer + sb->length, needed + 1, format, emit);
    va_end(emit);

    sb->length += needed;
    return 0;
}

// Return a newly allocated copy of the builder's current text (the caller
// frees it), or NULL when `sb` is NULL or the copy cannot be allocated.
char* sb_to_string(StringBuilder *sb) {
    if (sb == NULL) {
        return NULL;
    }

    char *copy = strdup(sb->buffer);
    return copy;
}

// Reset the builder to the empty string while keeping its buffer.
// NULL is accepted.
void sb_clear(StringBuilder *sb) {
    if (sb == NULL) {
        return;
    }
    sb->buffer[0] = '\0';
    sb->length = 0;
}

// Release the builder's buffer and the builder itself. NULL is accepted.
void sb_destroy(StringBuilder *sb) {
    if (sb == NULL) {
        return;
    }
    free(sb->buffer);
    free(sb);
}

6. 实际应用示例

6.1 配置文件解析器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
// One key/value entry of a configuration, kept in a singly linked list.
typedef struct ConfigItem {
char *key; // heap-allocated key text
char *value; // heap-allocated value text
struct ConfigItem *next; // next entry, or NULL at the end of the list
} ConfigItem;

// A parsed configuration: the head of the entry list and its length.
typedef struct {
ConfigItem *items; // first entry; parse_config_file inserts new entries at the head
int count; // number of entries in the list
} Config;

// Strip leading and trailing whitespace from `text` in place and return a
// pointer to its first remaining character.
static char *config_trim(char *text) {
    // isspace() needs an unsigned char value; plain char may be negative.
    while (isspace((unsigned char)*text)) text++;

    // Walk back from the end without ever forming a pointer before `text`,
    // which also covers the empty string.
    char *end = text + strlen(text);
    while (end > text && isspace((unsigned char)end[-1])) end--;
    *end = '\0';

    return text;
}

// Parse a "key = value" configuration file.
// Blank lines, lines whose first non-blank character is '#', and lines
// without '=' are skipped. Keys and values are trimmed of surrounding
// whitespace. New entries are inserted at the head of the list, so a key
// defined later in the file is found first by a lookup from the head.
// A line is read into a 1024-byte buffer; whatever does not fit is
// discarded instead of being parsed as a separate line.
// Entries whose memory cannot be allocated are dropped (best effort).
// Returns NULL when `filename` is NULL, the file cannot be opened, or the
// Config itself cannot be allocated.
Config* parse_config_file(const char *filename) {
    if (!filename) return NULL;

    FILE *file = fopen(filename, "r");
    if (!file) return NULL;

    Config *config = malloc(sizeof *config);
    if (!config) {
        fclose(file);
        return NULL;
    }

    config->items = NULL;
    config->count = 0;

    char line[1024];
    while (fgets(line, sizeof(line), file)) {
        // Remove the newline; when there is none and the file has not
        // ended, the line was longer than the buffer: skip its remainder.
        size_t line_len = strcspn(line, "\n");
        if (line[line_len] == '\n') {
            line[line_len] = '\0';
        } else {
            int ch;
            while ((ch = fgetc(file)) != EOF && ch != '\n') {
                /* discard */
            }
        }

        // Skip blank lines and comments.
        char *trimmed = line;
        while (isspace((unsigned char)*trimmed)) trimmed++;
        if (*trimmed == '\0' || *trimmed == '#') continue;

        // Find the equals sign.
        char *equals = strchr(trimmed, '=');
        if (!equals) continue;

        // Separate key and value, then trim both.
        *equals = '\0';
        char *key = config_trim(trimmed);
        char *value = config_trim(equals + 1);

        // Create the entry; link it only when it is complete, so the list
        // never holds an entry with a NULL key or value.
        ConfigItem *item = malloc(sizeof *item);
        if (!item) continue;

        item->key = strdup(key);
        item->value = strdup(value);
        if (!item->key || !item->value) {
            free(item->key);
            free(item->value);
            free(item);
            continue;
        }

        item->next = config->items;
        config->items = item;
        config->count++;
    }

    fclose(file);
    return config;
}

// Look up `key` in the configuration.
// Returns the value of the first entry in the list whose key matches
// exactly, or NULL when there is none or an argument is NULL.
const char* config_get(Config *config, const char *key) {
    if (config == NULL || key == NULL) {
        return NULL;
    }

    for (ConfigItem *entry = config->items; entry != NULL; entry = entry->next) {
        if (strcmp(entry->key, key) == 0) {
            return entry->value;
        }
    }

    return NULL;
}

6.2 模板引擎

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// Simple template engine.
// Copies `template` to a new string, replacing every "{{name}}"
// placeholder with the value config_get returns for `name` in `variables`.
// Placeholders without a value are replaced by nothing. The name is used
// exactly as written between the braces (no trimming). A "{{" without a
// closing "}}" is copied literally.
// Returns a newly allocated string (the caller frees it), or NULL when
// `template` is NULL or memory runs out.
char* template_render(const char *template, Config *variables) {
    if (!template) return NULL;

    StringBuilder *result = sb_create(strlen(template) * 2);
    if (!result) return NULL;

    int failed = 0;
    const char *pos = template;

    while (*pos && !failed) {
        // A placeholder starts here only if "{{" is eventually closed.
        const char *var_end = NULL;
        if (pos[0] == '{' && pos[1] == '{') {
            var_end = strstr(pos + 2, "}}");
        }

        if (var_end) {
            // Extract the variable name.
            const char *var_start = pos + 2;
            const size_t var_len = (size_t)(var_end - var_start);
            char *var_name = malloc(var_len + 1);
            if (!var_name) {
                failed = 1;
                break;
            }
            memcpy(var_name, var_start, var_len);
            var_name[var_len] = '\0';

            // Look up its value; the value belongs to `variables`, so it
            // stays valid after the name is freed.
            const char *var_value = config_get(variables, var_name);
            free(var_name);

            if (var_value && sb_append(result, var_value) != 0) {
                failed = 1;
            }

            pos = var_end + 2;
        } else {
            // Literal character. The file defines no single-character
            // append for StringBuilder, so go through sb_append with a
            // one-character string.
            const char literal[2] = { *pos, '\0' };
            if (sb_append(result, literal) != 0) {
                failed = 1;
            }
            pos++;
        }
    }

    char *final_result = failed ? NULL : sb_to_string(result);
    sb_destroy(result);
    return final_result;
}

总结

本文详细介绍了C语言字符串处理的高级技巧,包括:

  1. 内存管理 - 动态字符串、安全操作
  2. 查找匹配 - KMP、Boyer-Moore、Aho-Corasick算法
  3. 解析处理 - 分割、替换、格式化
  4. 编码转换 - UTF-8、Base64处理
  5. 性能优化 - 字符串池、构建器
  6. 实际应用 - 配置解析、模板引擎

关键要点:

  • 注重内存安全,避免缓冲区溢出
  • 选择合适的算法提高性能
  • 使用动态内存管理处理可变长度字符串
  • 考虑字符编码和国际化需求
  • 通过缓存和重用优化性能

掌握这些字符串处理技巧,能够帮助开发者编写更加健壮、高效的C语言程序。

版权所有,如有侵权请联系我