ARM汇编基础:与C语言混合编程

ARM汇编与C语言的混合编程是嵌入式开发中的重要技术,它结合了C语言的高级特性和汇编语言的底层控制能力。本文将详细介绍ARM汇编与C语言混合编程的各种技术和最佳实践。

ARM调用约定(AAPCS)

寄存器使用约定

1
2
3
4
5
6
7
8
9
10
11
// C function declarations
int add_numbers(int a, int b, int c, int d, int e);
int multiply_and_add(int x, int y, int z);

// ARM calling-convention summary:
// r0-r3: argument passing and return value
// r4-r11: callee-saved registers
// r12: scratch register (IP)
// r13: stack pointer (SP)
// r14: link register (LR)
// r15: program counter (PC)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
@ ARM汇编实现
.section .text
.global add_numbers
.global multiply_and_add

add_numbers:
@ Returns the sum of five integer arguments.
@ Arguments: r0=a, r1=b, r2=c, r3=d, [sp]=e (fifth argument is passed on the stack)
@ Returns: r0

@ Save r4 (used as scratch below) together with lr
push {r4, lr}

@ Load the fifth argument from the stack
ldr r4, [sp, #8] @ the push above lowered sp by 8 bytes, so e is now at [sp, #8]

@ Compute a + b + c + d + e
add r0, r0, r1 @ a + b
add r0, r0, r2 @ + c
add r0, r0, r3 @ + d
add r0, r0, r4 @ + e

@ Restore the saved registers and return
pop {r4, lr}
bx lr

multiply_and_add:
@ Leaf function: nothing is saved on the stack.
@ Arguments: r0=x, r1=y, r2=z
@ Returns: r0 = x * y + z

mul r0, r0, r1 @ x * y
add r0, r0, r2 @ + z
bx lr

栈帧管理

1
2
3
4
// C declarations for the stack-frame examples
int complex_calculation(int a, int b, int c, int d, int e, int f, int g, int h);

void stack_frame_demo(void);
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
@ 复杂栈帧管理示例
.section .text
.global complex_calculation
.global stack_frame_demo

complex_calculation:
@ Computes (a*b) + (c*d) + (e*f) + (g*h).
@ Arguments: r0=a, r1=b, r2=c, r3=d, [sp]=e, [sp+4]=f, [sp+8]=g, [sp+12]=h
@ Returns: r0

@ Set up the stack frame
push {r4-r11, lr} @ save callee-saved registers and lr (9 words = 36 bytes)
sub sp, sp, #16 @ reserve 16 bytes for local variables

@ Read the stack-passed arguments
ldr r4, [sp, #52] @ e (16 + 9*4 = 52)
ldr r5, [sp, #56] @ f
ldr r6, [sp, #60] @ g
ldr r7, [sp, #64] @ h

@ Use the local-variable area (these two slots are written but not read back)
str r0, [sp, #0] @ store a in a local slot
str r1, [sp, #4] @ store b in a local slot

@ The calculation itself
mul r8, r0, r1 @ a * b
mul r9, r2, r3 @ c * d
add r8, r8, r9 @ (a*b) + (c*d)

mul r9, r4, r5 @ e * f
mul r10, r6, r7 @ g * h
add r9, r9, r10 @ (e*f) + (g*h)

add r0, r8, r9 @ final result

@ Tear down the stack frame
add sp, sp, #16 @ release the local-variable area
pop {r4-r11, lr} @ restore registers
bx lr

stack_frame_demo:
@ Demonstrates a standard stack-frame layout: saved registers above a
@ 32-byte local-variable area.
push {r4-r11, lr} @ save registers (9 words = 36 bytes)
sub sp, sp, #32 @ reserve the local-variable area

@ Stack-frame layout:
@ [sp + 0] : local variable 1
@ [sp + 4] : local variable 2
@ [sp + 8] : local variable 3
@ [sp + 12] : local variable 4
@ [sp + 16] : reserved
@ [sp + 20] : reserved
@ [sp + 24] : reserved
@ [sp + 28] : reserved
@ [sp + 32] : r4
@ [sp + 36] : r5
@ ...
@ [sp + 64] : lr

@ Use the local variables
mov r0, #100
str r0, [sp, #0] @ local variable 1 = 100

mov r1, #200
str r1, [sp, #4] @ local variable 2 = 200

@ Call another function
@ NOTE(review): sp has moved down 36 + 32 = 68 bytes here, which is not a
@ multiple of 8 - confirm whether 8-byte stack alignment is required at this call.
bl some_function

@ Clean up and return
add sp, sp, #32
pop {r4-r11, lr}
bx lr

some_function:
@ Trivial example callee: returns the constant 42 in r0
mov r0, #42
bx lr

内联汇编

基本内联汇编语法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#include <stdint.h>

// 基本内联汇编示例
void inline_assembly_basics(void) {
    // Minimal extended-asm example: one register input, one register output.
    const int source = 10;
    int sum;

    __asm__ volatile (
        "mov %0, %1\n\t"        // sum = source
        "add %0, %0, #5\n\t"    // sum += 5
        : "=r" (sum)            // %0: output operand
        : "r" (source)          // %1: input operand
    );

    // sum now holds 15; this demo does not use it further
}

// 内联汇编约束说明
void inline_assembly_constraints(void) {
    // Walks through three operand constraints: "r", "I" and "m".
    int lhs = 10, rhs = 20, out;

    // "r": both inputs and the output live in general-purpose registers
    __asm__ volatile (
        "add %0, %1, %2\n\t"    // out = lhs + rhs
        : "=r" (out)
        : "r" (lhs), "r" (rhs)
    );

    // "I": the second source operand is an immediate constant
    __asm__ volatile (
        "add %0, %1, %2\n\t"    // out = lhs + 100
        : "=r" (out)
        : "r" (lhs), "I" (100)
    );

    // "m": the input is a memory operand that the asm loads itself
    int stored = 50;
    __asm__ volatile (
        "ldr %0, %1\n\t"        // out = stored
        "add %0, %0, #10\n\t"   // out += 10
        : "=r" (out)
        : "m" (stored)
    );
}

高级内联汇编技术

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
// 原子操作实现
static inline int atomic_add(volatile int *ptr, int value) {
int result, tmp;

__asm__ volatile (
"1:\n\t"
"ldrex %0, [%3]\n\t" // 独占加载
"add %1, %0, %4\n\t" // 计算新值
"strex %2, %1, [%3]\n\t" // 独占存储
"cmp %2, #0\n\t" // 检查是否成功
"bne 1b\n\t" // 失败则重试
"dmb\n\t" // 内存屏障
: "=&r" (result), "=&r" (tmp), "=&r" (tmp)
: "r" (ptr), "r" (value)
: "memory", "cc"
);

return result;
}

// 位操作优化
static inline int count_leading_zeros(uint32_t value) {
int result;

__asm__ volatile (
"clz %0, %1\n\t" // 计算前导零
: "=r" (result)
: "r" (value)
:
);

return result;
}

// 快速除法(使用移位)
static inline int fast_divide_by_power_of_2(int value, int shift) {
int result;

__asm__ volatile (
"cmp %1, #0\n\t" // 检查符号
"addlt %1, %1, %2\n\t" // 负数补偿
"asr %0, %1, %3\n\t" // 算术右移
: "=r" (result)
: "r" (value), "r" ((1 << shift) - 1), "r" (shift)
: "cc"
);

return result;
}

// SIMD操作示例(NEON)
void vector_add_neon(float *a, float *b, float *result, int count) {
int i;

// 处理4个元素为一组
for (i = 0; i < count - 3; i += 4) {
__asm__ volatile (
"vld1.32 {q0}, [%0]!\n\t" // 加载a[i:i+3]
"vld1.32 {q1}, [%1]!\n\t" // 加载b[i:i+3]
"vadd.f32 q2, q0, q1\n\t" // 向量加法
"vst1.32 {q2}, [%2]!\n\t" // 存储结果
: "+r" (a), "+r" (b), "+r" (result)
:
: "q0", "q1", "q2", "memory"
);
}

// 处理剩余元素
for (; i < count; i++) {
result[i] = a[i] + b[i];
}
}

汇编函数与C语言接口

汇编函数的C语言声明

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
// assembly_functions.h
// C prototypes for routines implemented in assembly.
#ifndef ASSEMBLY_FUNCTIONS_H
#define ASSEMBLY_FUNCTIONS_H

#include <stddef.h>  // size_t, used by the memory routines below
#include <stdint.h>

// Basic arithmetic
extern int asm_add(int a, int b);
extern int asm_multiply(int a, int b);
extern int asm_divide(int dividend, int divisor);

// String operations
extern int asm_strlen(const char *str);
extern char* asm_strcpy(char *dest, const char *src);
extern int asm_strcmp(const char *str1, const char *str2);

// Memory operations
extern void* asm_memcpy(void *dest, const void *src, size_t n);
extern void* asm_memset(void *s, int c, size_t n);
extern int asm_memcmp(const void *s1, const void *s2, size_t n);

// Bit operations
extern int asm_popcount(uint32_t value);  // number of 1 bits
extern int asm_ffs(uint32_t value);       // position of the first set bit
extern uint32_t asm_reverse_bits(uint32_t value);

// System-level operations
extern void asm_enable_interrupts(void);
extern void asm_disable_interrupts(void);
extern uint32_t asm_get_cpsr(void);
extern void asm_set_cpsr(uint32_t cpsr);

// Performance-critical routines
extern void asm_matrix_multiply(const float *a, const float *b, float *c, int n);
extern void asm_fft(float *real, float *imag, int n);

#endif

汇编函数实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
@ assembly_functions.s
.section .text

@ 基本数学运算
.global asm_add
.global asm_multiply
.global asm_divide

asm_add:
@ int asm_add(int a, int b)
@ Arguments: r0 = a, r1 = b
@ Returns: r0 = a + b
add r0, r0, r1
bx lr

asm_multiply:
@ int asm_multiply(int a, int b)
@ Arguments: r0 = a, r1 = b
@ Returns: r0 = a * b
mul r0, r0, r1
bx lr

asm_divide:
@ int asm_divide(int dividend, int divisor)
@ Arguments: r0 = dividend, r1 = divisor
@ Returns: r0 = dividend / divisor, or 0 when divisor is 0

@ Guard against division by zero: return 0 without dividing
cmp r1, #0
moveq r0, #0
bxeq lr

@ Use the hardware divide instruction (if the core supports it)
sdiv r0, r0, r1
bx lr

@ 字符串操作
.global asm_strlen
.global asm_strcpy
.global asm_strcmp

asm_strlen:
@ int asm_strlen(const char *str)
@ Arguments: r0 = str
@ Returns: r0 = length of the string (bytes before the terminating NUL)

mov r1, r0 @ remember the start of the string
strlen_loop:
ldrb r2, [r0], #1 @ load one byte and advance the pointer
cmp r2, #0 @ is it the NUL terminator?
bne strlen_loop

sub r0, r0, r1 @ bytes walked, including the terminator
sub r0, r0, #1 @ undo the final increment past the terminator
bx lr

asm_strcpy:
@ char* asm_strcpy(char *dest, const char *src)
@ Arguments: r0 = dest, r1 = src
@ Returns: r0 = dest

mov r2, r0 @ remember dest for the return value
strcpy_loop:
ldrb r3, [r1], #1 @ load one byte from src
strb r3, [r0], #1 @ store it to dest (the terminator is copied too)
cmp r3, #0 @ was that the NUL terminator?
bne strcpy_loop

mov r0, r2 @ return dest
bx lr

asm_strcmp:
@ int asm_strcmp(const char *str1, const char *str2)
@ Arguments: r0 = str1, r1 = str2
@ Returns: r0 = 0 if the strings are equal, otherwise the difference
@ between the first pair of bytes that differ (str1 byte - str2 byte)

strcmp_loop:
ldrb r2, [r0], #1 @ load one byte from str1
ldrb r3, [r1], #1 @ load one byte from str2

cmp r2, r3 @ compare the two bytes
bne strcmp_diff @ mismatch: go compute the difference

cmp r2, #0 @ equal bytes: was it the end of the strings?
bne strcmp_loop

mov r0, #0 @ strings are equal
bx lr

strcmp_diff:
sub r0, r2, r3 @ return the byte difference
bx lr

@ 内存操作
.global asm_memcpy
.global asm_memset
.global asm_memcmp

asm_memcpy:
    @ void* asm_memcpy(void *dest, const void *src, size_t n)
    @ Arguments: r0 = dest, r1 = src, r2 = n
    @ Returns: r0 = dest
    @ n is an unsigned size_t, so every test on it uses unsigned
    @ conditions (blo/bhs/beq) rather than signed ones.

    push {r4, lr}
    mov r3, r0                  @ remember dest for the return value

    @ Word copy is only used when both pointers are 4-byte aligned
    orr r4, r0, r1
    tst r4, #3                  @ any low address bit set in either pointer?
    bne memcpy_byte             @ yes: copy byte by byte

    @ Need at least one full word
    cmp r2, #4
    blo memcpy_byte

memcpy_word_loop:
    ldr r4, [r1], #4
    str r4, [r0], #4
    sub r2, r2, #4
    cmp r2, #4
    bhs memcpy_word_loop

    @ Copy whatever is left, one byte at a time
memcpy_byte:
    cmp r2, #0
    beq memcpy_done

memcpy_byte_loop:
    ldrb r4, [r1], #1
    strb r4, [r0], #1
    subs r2, r2, #1
    bne memcpy_byte_loop

memcpy_done:
    mov r0, r3                  @ return dest
    pop {r4, lr}
    bx lr

asm_memset:
    @ void* asm_memset(void *s, int c, size_t n)
    @ Arguments: r0 = s, r1 = c, r2 = n
    @ Returns: r0 = s
    @ n is an unsigned size_t, so every test on it uses unsigned
    @ conditions (blo/bhs/beq) rather than signed ones.

    push {r4, lr}
    mov r3, r0                  @ remember s for the return value

    @ Replicate the fill byte into all four bytes of r1
    and r1, r1, #0xFF           @ keep only the low byte of c
    orr r1, r1, r1, lsl #8
    orr r1, r1, r1, lsl #16

    @ Word stores are only used when s is 4-byte aligned
    tst r0, #3
    bne memset_byte

    cmp r2, #4
    blo memset_byte

memset_word_loop:
    str r1, [r0], #4
    sub r2, r2, #4
    cmp r2, #4
    bhs memset_word_loop

    @ Fill whatever is left, one byte at a time
memset_byte:
    cmp r2, #0
    beq memset_done

memset_byte_loop:
    strb r1, [r0], #1
    subs r2, r2, #1
    bne memset_byte_loop

memset_done:
    mov r0, r3                  @ return s
    pop {r4, lr}
    bx lr

asm_memcmp:
@ int asm_memcmp(const void *s1, const void *s2, size_t n)
@ Arguments: r0 = s1, r1 = s2, r2 = n
@ Returns: r0 = 0 if the first n bytes match, otherwise the difference
@ between the first pair of bytes that differ (s1 byte - s2 byte)

@ Nothing to compare: the blocks are trivially equal
cmp r2, #0
moveq r0, #0
bxeq lr

memcmp_loop:
ldrb r3, [r0], #1
ldrb r12, [r1], #1

cmp r3, r12
bne memcmp_diff

subs r2, r2, #1
bne memcmp_loop

mov r0, #0 @ memory blocks are equal
bx lr

memcmp_diff:
sub r0, r3, r12 @ return the byte difference
bx lr

@ 位操作
.global asm_popcount
.global asm_ffs
.global asm_reverse_bits

asm_popcount:
@ int asm_popcount(uint32_t value)
@ Counts the 1 bits in value.
@ Arguments: r0 = value
@ Returns: r0 = number of 1 bits

mov r1, #0 @ counter

popcount_loop:
cmp r0, #0
beq popcount_done

@ Clear the lowest set bit: value &= value - 1
sub r2, r0, #1
and r0, r0, r2
add r1, r1, #1

b popcount_loop

popcount_done:
mov r0, r1
bx lr

asm_ffs:
@ int asm_ffs(uint32_t value)
@ Finds the first (lowest) set bit, counting positions from 1.
@ Arguments: r0 = value
@ Returns: r0 = bit position (0 means no bit is set)

cmp r0, #0
moveq r0, #0
bxeq lr

@ Isolate the lowest set bit, then convert it to a position with CLZ
rsb r1, r0, #0 @ -value
and r0, r0, r1 @ value & (-value) keeps only the lowest set bit
clz r0, r0 @ count leading zeros
rsb r0, r0, #32 @ 32 - clz = 1-based position

bx lr

asm_reverse_bits:
@ uint32_t asm_reverse_bits(uint32_t value)
@ Reverses the order of the 32 bits.
@ Arguments: r0 = value
@ Returns: r0 = bit-reversed value

@ Uses the RBIT instruction (if the core supports it)
rbit r0, r0
bx lr

@ 系统级操作
.global asm_enable_interrupts
.global asm_disable_interrupts
.global asm_get_cpsr
.global asm_set_cpsr

asm_enable_interrupts:
@ void asm_enable_interrupts(void)
@ Read-modify-write of the CPSR control field; r0 is used as scratch.
mrs r0, cpsr
bic r0, r0, #0x80 @ clear the I bit
msr cpsr_c, r0
bx lr

asm_disable_interrupts:
@ void asm_disable_interrupts(void)
@ Read-modify-write of the CPSR control field; r0 is used as scratch.
mrs r0, cpsr
orr r0, r0, #0x80 @ set the I bit
msr cpsr_c, r0
bx lr

asm_get_cpsr:
@ uint32_t asm_get_cpsr(void)
@ Returns: r0 = current CPSR value
mrs r0, cpsr
bx lr

asm_set_cpsr:
@ void asm_set_cpsr(uint32_t cpsr)
@ Arguments: r0 = value to write to the CPSR
msr cpsr, r0
bx lr

性能优化技术

循环优化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
// 循环展开示例
// Loop-unrolling example: fills a 1000-element array with 0..999 and sums
// it in inline assembly, four elements per loop iteration.
// The sum is computed for demonstration only; it is not returned.
void optimized_array_sum(void) {
    int array[1000];
    int sum = 0;
    int i;

    // Initialise the array
    for (i = 0; i < 1000; i++) {
        array[i] = i;
    }

    // Unrolled summation. Each load uses post-indexed addressing
    // ("[r1], #4"), which reads the current element and then advances the
    // cursor by one int. The output is early-clobber ("=&r") because it is
    // written before the input %1 is read; "cc" is clobbered by cmp.
    __asm__ volatile (
        "mov %0, #0\n\t"            // sum = 0
        "mov r1, %1\n\t"            // r1 = cursor into array
        "mov r2, #0\n\t"            // i = 0
        "1:\n\t"                    // loop start
        "ldr r3, [r1], #4\n\t"      // array[i]
        "add %0, %0, r3\n\t"
        "ldr r3, [r1], #4\n\t"      // array[i+1]
        "add %0, %0, r3\n\t"
        "ldr r3, [r1], #4\n\t"      // array[i+2]
        "add %0, %0, r3\n\t"
        "ldr r3, [r1], #4\n\t"      // array[i+3]
        "add %0, %0, r3\n\t"
        "add r2, r2, #4\n\t"        // i += 4
        "cmp r2, #1000\n\t"         // compare i with 1000
        "blt 1b\n\t"                // loop while i < 1000
        : "=&r" (sum)
        : "r" (array)
        : "r1", "r2", "r3", "cc", "memory"
    );
}

// SIMD优化的向量操作
// NEON example: for 16 floats, computes result[k] = (a[k] + b[k]) * a[k],
// four elements per loop iteration.
void simd_vector_operations(void) {
    float a[16] __attribute__((aligned(16)));
    float b[16] __attribute__((aligned(16)));
    float result[16] __attribute__((aligned(16)));
    int i;

    // Initialise the inputs
    for (i = 0; i < 16; i++) {
        a[i] = i * 1.5f;
        b[i] = i * 2.0f;
    }

    // The asm copies the three pointers into r0-r2 so that it can advance
    // them with write-back; r3 is the element counter. cmp changes the
    // condition flags, so "cc" is listed in the clobbers.
    __asm__ volatile (
        "mov r0, %0\n\t"                // r0 = a
        "mov r1, %1\n\t"                // r1 = b
        "mov r2, %2\n\t"                // r2 = result
        "mov r3, #0\n\t"                // i = 0
        "1:\n\t"                        // loop start
        "vld1.32 {q0}, [r0]!\n\t"       // load 4 floats from a
        "vld1.32 {q1}, [r1]!\n\t"       // load 4 floats from b
        "vadd.f32 q2, q0, q1\n\t"       // q2 = a + b
        "vmul.f32 q2, q2, q0\n\t"       // q2 = (a + b) * a
        "vst1.32 {q2}, [r2]!\n\t"       // store 4 results
        "add r3, r3, #4\n\t"            // i += 4
        "cmp r3, #16\n\t"               // compare i with 16
        "blt 1b\n\t"                    // loop while i < 16
        :
        : "r" (a), "r" (b), "r" (result)
        : "r0", "r1", "r2", "r3", "q0", "q1", "q2", "cc", "memory"
    );
}

分支预测优化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// 分支预测优化
// Counts the positive entries of a mostly-positive array twice: once in C
// with a __builtin_expect hint, once in assembly using conditionally
// executed instructions instead of a branch inside the loop body.
// The count is computed for demonstration only; it is not returned.
void branch_prediction_optimization(void) {
    int array[1000];
    int positive_count = 0;
    int i;

    // Initialise the array: every tenth entry is non-positive
    for (i = 0; i < 1000; i++) {
        array[i] = (i % 10 == 0) ? -i : i;
    }

    // C version with a "likely" hint
    for (i = 0; i < 1000; i++) {
        if (__builtin_expect(array[i] > 0, 1)) {    // likely
            positive_count++;
        }
    }

    // Assembly version. The output is early-clobber ("=&r") because it is
    // zeroed before the input %1 is read, and "memory" is clobbered because
    // the asm reads array[] through a pointer the compiler cannot see into.
    __asm__ volatile (
        "mov %0, #0\n\t"                // positive_count = 0
        "mov r1, %1\n\t"                // r1 = array
        "mov r2, #0\n\t"                // i = 0
        "1:\n\t"                        // loop start
        "ldr r3, [r1, r2, lsl #2]\n\t"  // r3 = array[i]
        "cmp r3, #0\n\t"                // compare with 0
        "addle %0, %0, #0\n\t"          // <= 0: leave the count unchanged
        "addgt %0, %0, #1\n\t"          // > 0: increment the count
        "add r2, r2, #1\n\t"            // i++
        "cmp r2, #1000\n\t"             // compare i with 1000
        "blt 1b\n\t"                    // loop while i < 1000
        : "=&r" (positive_count)
        : "r" (array)
        : "r1", "r2", "r3", "cc", "memory"
    );
}

调试和测试

混合编程调试技术

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
// debug_helpers.h
// Declarations for the debugging and cycle-timing helpers.
#ifndef DEBUG_HELPERS_H
#define DEBUG_HELPERS_H

#include <stdio.h>
#include <stdint.h>

// Debug macros: active only when DEBUG is defined, otherwise they expand to nothing
#ifdef DEBUG
#define DBG_PRINT(fmt, ...) printf("[DEBUG] " fmt "\n", ##__VA_ARGS__)
#define DBG_ASM_REGS() debug_print_registers()
#else
#define DBG_PRINT(fmt, ...)
#define DBG_ASM_REGS()
#endif

// Prints a register dump
void debug_print_registers(void);

// Dumps `size` bytes of memory starting at `addr`
void debug_memory_dump(const void *addr, size_t size);

// Cycle-count timer: holds the start and end readings of one measurement
typedef struct {
uint32_t start_cycles;
uint32_t end_cycles;
} perf_timer_t;

void perf_timer_start(perf_timer_t *timer);
void perf_timer_end(perf_timer_t *timer);
uint32_t perf_timer_get_cycles(const perf_timer_t *timer);

#endif
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
// debug_helpers.c
#include "debug_helpers.h"

// Prints a snapshot of r0-r12, sp, lr and the CPSR condition flags.
// The locals are plain `unsigned int` so they match the %x conversions
// exactly (uint32_t is `unsigned long` on some ARM toolchains).
// NOTE(review): the snapshot is best-effort - by the time the asm runs the
// compiler may already have reused some registers, including for the
// output operands themselves; confirm against a debugger if exact values matter.
void debug_print_registers(void) {
    unsigned int r0, r1, r2, r3, r4, r5, r6, r7;
    unsigned int r8, r9, r10, r11, r12, sp, lr;
    unsigned int cpsr;

    // Low registers
    __asm__ volatile (
        "mov %0, r0\n\t"
        "mov %1, r1\n\t"
        "mov %2, r2\n\t"
        "mov %3, r3\n\t"
        "mov %4, r4\n\t"
        "mov %5, r5\n\t"
        "mov %6, r6\n\t"
        "mov %7, r7\n\t"
        : "=r" (r0), "=r" (r1), "=r" (r2), "=r" (r3),
          "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7)
        :
        :
    );

    // High registers, sp, lr and the status register
    __asm__ volatile (
        "mov %0, r8\n\t"
        "mov %1, r9\n\t"
        "mov %2, r10\n\t"
        "mov %3, r11\n\t"
        "mov %4, r12\n\t"
        "mov %5, sp\n\t"
        "mov %6, lr\n\t"
        "mrs %7, cpsr\n\t"
        : "=r" (r8), "=r" (r9), "=r" (r10), "=r" (r11),
          "=r" (r12), "=r" (sp), "=r" (lr), "=r" (cpsr)
        :
        :
    );

    printf("Register dump:\n");
    printf("r0=0x%08x r1=0x%08x r2=0x%08x r3=0x%08x\n", r0, r1, r2, r3);
    printf("r4=0x%08x r5=0x%08x r6=0x%08x r7=0x%08x\n", r4, r5, r6, r7);
    printf("r8=0x%08x r9=0x%08x r10=0x%08x r11=0x%08x\n", r8, r9, r10, r11);
    printf("r12=0x%08x sp=0x%08x lr=0x%08x\n", r12, sp, lr);
    printf("cpsr=0x%08x [N=%d Z=%d C=%d V=%d]\n",
           cpsr,
           (int)((cpsr >> 31) & 1),     // N flag
           (int)((cpsr >> 30) & 1),     // Z flag
           (int)((cpsr >> 29) & 1),     // C flag
           (int)((cpsr >> 28) & 1));    // V flag
}

// Prints `size` bytes starting at `addr` as a hex dump, 16 bytes per row:
// row address, hex bytes, then the same bytes as printable ASCII (non-printable
// bytes are shown as '.').
void debug_memory_dump(const void *addr, size_t size) {
    const uint8_t *ptr = (const uint8_t *)addr;
    size_t i, j;

    // Addresses go through uintptr_t so the pointer-to-integer conversion is
    // well defined, then to unsigned long to match the %lx conversion.
    printf("Memory dump at 0x%08lx (%zu bytes):\n",
           (unsigned long)(uintptr_t)addr, size);

    for (i = 0; i < size; i += 16) {
        printf("%08lx: ", (unsigned long)(uintptr_t)(ptr + i));

        // Hex column
        for (j = 0; j < 16 && i + j < size; j++) {
            printf("%02x ", ptr[i + j]);
        }

        // Pad a short final row: each missing byte occupies three columns
        // ("xx "), so the ASCII column stays aligned.
        for (; j < 16; j++) {
            printf("   ");
        }

        printf(" |");

        // ASCII column
        for (j = 0; j < 16 && i + j < size; j++) {
            unsigned char c = ptr[i + j];
            printf("%c", (c >= 32 && c <= 126) ? c : '.');
        }

        printf("|\n");
    }
}

// Records the starting reading of a measurement in timer->start_cycles.
void perf_timer_start(perf_timer_t *timer) {
    uint32_t now;

    // Read the cycle counter through coprocessor 15 (c9, c13, 0)
    __asm__ volatile (
        "mrc p15, 0, %0, c9, c13, 0\n\t"
        : "=r" (now)
    );

    timer->start_cycles = now;
}

// Records the ending reading of a measurement in timer->end_cycles.
void perf_timer_end(perf_timer_t *timer) {
    uint32_t now;

    // Read the cycle counter through coprocessor 15 (c9, c13, 0)
    __asm__ volatile (
        "mrc p15, 0, %0, c9, c13, 0\n\t"
        : "=r" (now)
    );

    timer->end_cycles = now;
}

// Returns the number of cycles between the recorded start and end readings.
uint32_t perf_timer_get_cycles(const perf_timer_t *timer) {
    const uint32_t begin = timer->start_cycles;
    const uint32_t finish = timer->end_cycles;

    return finish - begin;
}

单元测试框架

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// test_framework.h
// Minimal assertion macros and counters for the test suite.
#ifndef TEST_FRAMEWORK_H
#define TEST_FRAMEWORK_H

#include <stdio.h>
#include <string.h>
#include <stdint.h>

// Test statistics (declared here, defined elsewhere)
extern int test_count;
extern int test_passed;
extern int test_failed;

// Core assertion: counts the test, then prints and counts a PASS or FAIL line
#define TEST_ASSERT(condition, message) \
do { \
test_count++; \
if (condition) { \
test_passed++; \
printf("[PASS] %s\n", message); \
} else { \
test_failed++; \
printf("[FAIL] %s\n", message); \
} \
} while(0)

// Passes when expected == actual
#define TEST_ASSERT_EQ(expected, actual, message) \
TEST_ASSERT((expected) == (actual), message)

// Passes when the two strings compare equal with strcmp
#define TEST_ASSERT_STR_EQ(expected, actual, message) \
TEST_ASSERT(strcmp(expected, actual) == 0, message)

// Passes when the first `size` bytes compare equal with memcmp
#define TEST_ASSERT_MEM_EQ(expected, actual, size, message) \
TEST_ASSERT(memcmp(expected, actual, size) == 0, message)

// Test function declarations
void test_basic_math(void);
void test_string_functions(void);
void test_memory_functions(void);
void test_bit_operations(void);
void test_performance(void);

// Test report
void print_test_summary(void);

#endif
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
// test_framework.c
#include "test_framework.h"
#include "assembly_functions.h"
#include "debug_helpers.h"

int test_count = 0;
int test_passed = 0;
int test_failed = 0;

// Checks asm_add, asm_multiply and asm_divide, including the
// divide-by-zero case, which is expected to return 0.
void test_basic_math(void) {
printf("\n=== Testing Basic Math Functions ===\n");

TEST_ASSERT_EQ(15, asm_add(10, 5), "asm_add(10, 5) == 15");
TEST_ASSERT_EQ(-5, asm_add(10, -15), "asm_add(10, -15) == -5");

TEST_ASSERT_EQ(50, asm_multiply(10, 5), "asm_multiply(10, 5) == 50");
TEST_ASSERT_EQ(-50, asm_multiply(10, -5), "asm_multiply(10, -5) == -50");

TEST_ASSERT_EQ(2, asm_divide(10, 5), "asm_divide(10, 5) == 2");
TEST_ASSERT_EQ(-2, asm_divide(10, -5), "asm_divide(10, -5) == -2");
TEST_ASSERT_EQ(0, asm_divide(10, 0), "asm_divide(10, 0) == 0 (division by zero)");
}

// Checks asm_strlen, asm_strcpy and asm_strcmp.
void test_string_functions(void) {
    char greeting[] = "Hello, World!";
    char copy[50];

    printf("\n=== Testing String Functions ===\n");

    // Length of a normal string and of the empty string
    TEST_ASSERT_EQ(13, asm_strlen(greeting), "asm_strlen('Hello, World!') == 13");
    TEST_ASSERT_EQ(0, asm_strlen(""), "asm_strlen('') == 0");

    // Copy, then compare against the source
    asm_strcpy(copy, greeting);
    TEST_ASSERT_STR_EQ(greeting, copy, "asm_strcpy works correctly");

    // Equal, greater-than and less-than orderings
    TEST_ASSERT_EQ(0, asm_strcmp("test", "test"), "asm_strcmp('test', 'test') == 0");
    TEST_ASSERT(asm_strcmp("test", "TEST") > 0, "asm_strcmp('test', 'TEST') > 0");
    TEST_ASSERT(asm_strcmp("TEST", "test") < 0, "asm_strcmp('TEST', 'test') < 0");
}

// Checks asm_memcpy, asm_memset and asm_memcmp on 16-byte buffers.
void test_memory_functions(void) {
    printf("\n=== Testing Memory Functions ===\n");

    uint8_t src[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    uint8_t dest[16];
    uint8_t pattern[16];

    // memcpy
    asm_memcpy(dest, src, 16);
    TEST_ASSERT_MEM_EQ(src, dest, 16, "asm_memcpy works correctly");

    // memset: scan the whole buffer first, then record exactly one result,
    // so a bad byte can never be followed by an unconditional PASS.
    asm_memset(pattern, 0xAA, 16);
    int memset_ok = 1;
    for (int i = 0; i < 16; i++) {
        if (pattern[i] != 0xAA) {
            memset_ok = 0;
            break;
        }
    }
    TEST_ASSERT(memset_ok, "asm_memset works correctly");

    // memcmp: identical buffers, then buffers that differ in one byte
    TEST_ASSERT_EQ(0, asm_memcmp(src, dest, 16), "asm_memcmp(identical) == 0");
    dest[8] = 0xFF;
    TEST_ASSERT(asm_memcmp(src, dest, 16) != 0, "asm_memcmp(different) != 0");
}

// Checks asm_popcount, asm_ffs (1-based position, 0 for no bit set)
// and asm_reverse_bits.
void test_bit_operations(void) {
printf("\n=== Testing Bit Operations ===\n");

TEST_ASSERT_EQ(3, asm_popcount(0x07), "asm_popcount(0x07) == 3");
TEST_ASSERT_EQ(8, asm_popcount(0xFF), "asm_popcount(0xFF) == 8");
TEST_ASSERT_EQ(0, asm_popcount(0x00), "asm_popcount(0x00) == 0");

TEST_ASSERT_EQ(1, asm_ffs(0x01), "asm_ffs(0x01) == 1");
TEST_ASSERT_EQ(4, asm_ffs(0x08), "asm_ffs(0x08) == 4");
TEST_ASSERT_EQ(0, asm_ffs(0x00), "asm_ffs(0x00) == 0");

TEST_ASSERT_EQ(0x80000000, asm_reverse_bits(0x00000001), "asm_reverse_bits works");
}

// Times 10000 iterations of a C addition and of asm_add, then prints both
// cycle counts and which one was faster.
// The counts are held in `unsigned int` so they match the %u conversions
// exactly (uint32_t is `unsigned long` on some ARM toolchains).
void test_performance(void) {
    printf("\n=== Performance Testing ===\n");

    perf_timer_t timer;
    const int iterations = 10000;

    // C version
    perf_timer_start(&timer);
    for (int i = 0; i < iterations; i++) {
        volatile int result = 10 + 5;   // volatile keeps the store from being optimised away
    }
    perf_timer_end(&timer);
    unsigned int c_cycles = perf_timer_get_cycles(&timer);

    // Assembly version
    perf_timer_start(&timer);
    for (int i = 0; i < iterations; i++) {
        volatile int result = asm_add(10, 5);
    }
    perf_timer_end(&timer);
    unsigned int asm_cycles = perf_timer_get_cycles(&timer);

    printf("C version: %u cycles\n", c_cycles);
    printf("Assembly version: %u cycles\n", asm_cycles);

    if (asm_cycles < c_cycles) {
        printf("Assembly is faster by %u cycles\n", c_cycles - asm_cycles);
    } else {
        printf("C is faster by %u cycles\n", asm_cycles - c_cycles);
    }
}

// Prints the totals gathered by the TEST_ASSERT macros and the pass rate.
void print_test_summary(void) {
    // Guard against dividing by zero when no test has run
    double success_rate = 0.0;
    if (test_count > 0) {
        success_rate = 100.0 * test_passed / test_count;
    }

    printf("\n=== Test Summary ===\n");
    printf("Total tests: %d\n", test_count);
    printf("Passed: %d\n", test_passed);
    printf("Failed: %d\n", test_failed);
    printf("Success rate: %.1f%%\n", success_rate);
}

// 主测试函数
// Test-suite entry point: runs every test group, prints the summary and
// returns 0 only when no assertion failed.
int main(void) {
    printf("ARM Assembly and C Integration Test Suite\n");
    printf("==========================================\n");

    test_basic_math();
    test_string_functions();
    test_memory_functions();
    test_bit_operations();
    test_performance();

    print_test_summary();

    if (test_failed != 0) {
        return 1;
    }
    return 0;
}

总结

ARM汇编与C语言混合编程是嵌入式开发的重要技术:

核心概念

  • 调用约定:AAPCS标准确保函数间正确通信
  • 内联汇编:在C代码中嵌入汇编指令
  • 汇编函数:用汇编实现性能关键函数
  • 接口设计:C语言声明与汇编实现的对接

关键技术

  • 寄存器管理:遵循调用约定保存/恢复寄存器
  • 栈帧操作:正确管理函数调用栈
  • 参数传递:理解寄存器和栈的参数传递机制
  • 性能优化:利用汇编实现高效算法

实际应用

  • 系统编程:中断处理、上下文切换
  • 性能优化:数学运算、图像处理
  • 硬件控制:寄存器操作、外设驱动
  • 安全功能:加密算法、安全启动

最佳实践

  • 遵循标准调用约定确保兼容性
  • 合理使用内联汇编,避免过度优化
  • 建立完善的测试框架验证正确性
  • 平衡性能提升与代码可维护性

掌握ARM汇编与C语言混合编程技术能够帮助开发者构建高性能、可靠的嵌入式系统,在保持代码可读性的同时获得最佳性能。

版权所有,如有侵权请联系我