2016-01-15

音视频基础理论与数字信号处理深入解析

音视频技术是现代多媒体应用的核心，理解其基础理论和数字信号处理原理对于音视频开发至关重要。本文将深入探讨音视频的基本概念、数字化过程、信号处理技术以及实际应用中的关键技术点。

1. 音频基础理论

1.1 声音的物理特性

声音是由物体振动产生的机械波，在空气中以纵波的形式传播。声音具有以下基本特性：

// Basic parameters describing a sound wave
typedef struct {
    double frequency;    // frequency (Hz)
    double amplitude;    // amplitude
    double phase;        // phase
    double wavelength;   // wavelength
} SoundWave;

// Speed of sound in air from the linear model v = 331.3 + 0.606 * T,
// where T is the temperature in degrees Celsius.
double calculate_sound_speed(double temperature) {
    const double speed_at_zero = 331.3;   // model value at T = 0
    const double per_degree = 0.606;      // increase per degree Celsius
    return speed_at_zero + per_degree * temperature;
}

// Wavelength of a wave: propagation speed divided by frequency.
// No guard is applied, so a zero frequency follows IEEE division rules.
double calculate_wavelength(double frequency, double speed) {
    double wavelength = speed / frequency;
    return wavelength;
}

声音的三要素：

音调（Pitch）：由频率决定，人耳可听范围约为20Hz-20kHz
音量（Volume）：由振幅决定，通常用分贝(dB)表示
音色（Timbre）：由波形决定，不同乐器的波形特征不同

1.2 数字音频基础

采样定理（Nyquist定理）

根据采样定理，要从采样值中无失真地重建原始信号，采样频率必须大于信号最高频率的两倍（fs > 2·f_max）；恰好等于两倍时，采样点可能全部落在过零点上而丢失信号（下面示例中的2000Hz采样即属于这种情况）：

# 采样定理示例
import numpy as np
import matplotlib.pyplot as plt

def demonstrate_sampling_theorem():
    """Plot a 1 kHz sine and its samples at four different sampling rates.

    A 10 ms stretch of the sine is drawn from 10000 densely spaced points,
    then overlaid with stem plots of the same sine sampled at 1500, 2000,
    4000 and 8000 Hz, one rate per panel of a 2x2 figure.  The figure is
    shown with ``plt.show()``; nothing is returned.
    """
    tone_hz = 1000   # frequency of the test sine
    span_s = 0.01    # length of the plotted window in seconds

    # Densely sampled curve standing in for the continuous signal
    dense_t = np.linspace(0, span_s, 10000)
    dense_wave = np.sin(2 * np.pi * tone_hz * dense_t)

    plt.figure(figsize=(12, 8))

    # One panel per sampling rate; subplot indices are 1-based
    for panel, fs in enumerate((1500, 2000, 4000, 8000), start=1):
        plt.subplot(2, 2, panel)

        # Sampling instants and the sine evaluated at those instants
        sample_t = np.arange(0, span_s, 1/fs)
        sample_values = np.sin(2 * np.pi * tone_hz * sample_t)

        # Time axis is converted from seconds to milliseconds for display
        plt.plot(dense_t * 1000, dense_wave, 'b-', alpha=0.7, label='原始信号')
        plt.stem(sample_t * 1000, sample_values, 'r-', label=f'采样 {fs}Hz')

        plt.title(f'采样频率: {fs}Hz (Nyquist: {2*tone_hz}Hz)')
        plt.xlabel('时间 (ms)')
        plt.ylabel('幅度')
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

# Run the demonstration
demonstrate_sampling_theorem()

量化过程

量化是将连续的幅度值转换为离散数字值的过程：

#include <stdio.h>
#include <math.h>
#include <stdint.h>

// Parameters of a uniform quantizer
typedef struct {
    int bits;           // number of quantization bits
    double max_value;   // upper end of the input range
    double min_value;   // lower end of the input range
    int levels;         // number of quantization levels
} QuantizationParams;

// 初始化量化参数
QuantizationParams init_quantization(int bits, double min_val, double max_val) {
    QuantizationParams params;
    params.bits = bits;
    params.min_value = min_val;
    params.max_value = max_val;
    params.levels = 1 << bits;  // 2^bits
    return params;
}

// Uniform (linear) quantization of one sample.
//
// The sample is mapped from [min_value, max_value] onto the level indices
// 0 .. levels-1, rounded to the NEAREST level, and then shifted by levels/2
// so the returned code is signed.  Samples outside the range are clamped.
//
// Returns 0 when params is NULL or the range has zero (or NaN) width, because
// no normalization is possible in those cases.
int16_t linear_quantize(double sample, QuantizationParams *params) {
    if (!params) return 0;

    double span = params->max_value - params->min_value;
    // span != span is true only for NaN
    if (span == 0.0 || span != span) return 0;

    // Normalize to [0, 1]
    double normalized = (sample - params->min_value) / span;

    // Clamp; the first test is written negated so that NaN also maps to 0.0
    if (!(normalized > 0.0)) normalized = 0.0;
    if (normalized > 1.0) normalized = 1.0;

    // Round to the nearest level.  Plain truncation would bias every code
    // downwards and allow errors of up to a full quantization step instead
    // of half a step.
    int quantized = (int)floor(normalized * (params->levels - 1) + 0.5);

    // Convert the unsigned level index to a signed code
    return (int16_t)(quantized - (params->levels / 2));
}

// Inverse of the linear quantizer: map a signed code back to an amplitude
// inside [min_value, max_value].
double dequantize(int16_t quantized_sample, QuantizationParams *params) {
    // Undo the signed shift to recover the level index
    const int level_index = quantized_sample + (params->levels / 2);

    // Position of that level as a fraction of the full scale
    const double fraction = (double)level_index / (params->levels - 1);

    const double range = params->max_value - params->min_value;
    return params->min_value + fraction * range;
}

// 计算量化噪声
double calculate_quantization_noise(double original, double quantized) {
    double error = original - quantized;
    return 20 * log10(fabs(error));  // dB
}

// Print a table comparing 8-bit and 16-bit quantization of values stepped
// from -1.0 upwards in increments of 0.3, including the error of each in dB.
void demonstrate_quantization() {
    QuantizationParams coarse = init_quantization(8, -1.0, 1.0);
    QuantizationParams fine = init_quantization(16, -1.0, 1.0);

    printf("量化演示:\n");
    printf("原始值\t8位量化\t16位量化\t8位误差(dB)\t16位误差(dB)\n");

    double sample = -1.0;
    while (sample <= 1.0) {
        // Quantize with both bit depths
        int16_t code8 = linear_quantize(sample, &coarse);
        int16_t code16 = linear_quantize(sample, &fine);

        // Reconstruct and measure the error against the original value
        double err8_db = calculate_quantization_noise(sample, dequantize(code8, &coarse));
        double err16_db = calculate_quantization_noise(sample, dequantize(code16, &fine));

        printf("%.2f\t%d\t%d\t%.2f\t\t%.2f\n",
               sample, code8, code16, err8_db, err16_db);

        sample += 0.3;
    }
}

1.3 音频格式与编码

PCM（脉冲编码调制）

PCM是最基础的数字音频格式，直接存储量化后的采样值：

// PCM audio buffer together with its format description
typedef struct {
    uint32_t sample_rate;    // sampling rate
    uint16_t channels;       // number of channels
    uint16_t bits_per_sample; // bit depth
    uint32_t data_size;      // size of the data buffer in bytes
    int16_t *data;           // audio samples
} PCMAudio;

// Allocate a PCMAudio object able to hold duration_ms milliseconds of audio.
//
// The buffer size is samples_per_channel * channels * (bits_per_sample / 8)
// bytes and the buffer is zero-filled (silence).
//
// Returns NULL when an allocation fails or when the requested size does not
// fit the 32-bit data_size field.
PCMAudio* create_pcm_audio(uint32_t sample_rate, uint16_t channels,
                          uint16_t bits_per_sample, uint32_t duration_ms) {
    // The product sample_rate * duration_ms exceeds 32 bits for anything
    // longer than about 97 s at 44.1 kHz, so it is computed in 64 bits.
    uint64_t samples_per_channel = ((uint64_t)sample_rate * duration_ms) / 1000;
    if (samples_per_channel > UINT32_MAX) return NULL;

    // At most 2^32 * 2^16 * 2^13, which cannot overflow 64 bits
    uint64_t total_bytes = samples_per_channel * channels * (bits_per_sample / 8);
    if (total_bytes > UINT32_MAX) return NULL;

    PCMAudio *audio = (PCMAudio*)malloc(sizeof(PCMAudio));
    if (!audio) return NULL;

    audio->sample_rate = sample_rate;
    audio->channels = channels;
    audio->bits_per_sample = bits_per_sample;
    audio->data_size = (uint32_t)total_bytes;

    // calloc gives a silent buffer; request at least one byte so that a
    // zero-length clip does not depend on what an empty allocation returns.
    audio->data = (int16_t*)calloc(1, total_bytes ? (size_t)total_bytes : 1);
    if (!audio->data) {
        free(audio);
        return NULL;
    }

    return audio;
}

// Fill the audio buffer with a sine wave, writing the same value to every
// channel of a frame.
//
//   frequency  tone frequency in Hz
//   amplitude  linear gain; 1.0 is full scale for 16-bit samples
//
// The buffer is treated as interleaved int16_t samples.  Nothing is written
// when audio or its data is NULL, or when channels or sample_rate is zero.
void generate_sine_wave(PCMAudio *audio, double frequency, double amplitude) {
    if (!audio || !audio->data) return;
    // channels == 0 would never advance the loop; sample_rate == 0 would
    // divide by zero below
    if (audio->channels == 0 || audio->sample_rate == 0) return;

    uint32_t total_samples = audio->data_size / sizeof(int16_t);

    // Phase advance per frame, in radians.  It already contains the division
    // by sample_rate, so it must be multiplied by the frame INDEX; using the
    // time in seconds would divide by sample_rate a second time and produce
    // a tone of frequency / sample_rate Hz.
    double phase_step = 2.0 * M_PI * frequency / audio->sample_rate;

    for (uint32_t i = 0; i < total_samples; i += audio->channels) {
        uint32_t frame = i / audio->channels;
        double scaled = amplitude * 32767 * sin(phase_step * frame);

        // Clamp before the cast: converting an out-of-range double to
        // int16_t is undefined behaviour
        if (scaled > 32767.0) scaled = 32767.0;
        if (scaled < -32768.0) scaled = -32768.0;
        int16_t sample_value = (int16_t)scaled;

        // Same value on every channel; the bound check covers a trailing
        // partial frame
        for (uint32_t ch = 0; ch < audio->channels && i + ch < total_samples; ch++) {
            audio->data[i + ch] = sample_value;
        }
    }
}

// Write a 16-bit value in little-endian byte order.
static void wav_put_le16(FILE *file, uint16_t value) {
    unsigned char bytes[2];
    bytes[0] = (unsigned char)(value & 0xFF);
    bytes[1] = (unsigned char)((value >> 8) & 0xFF);
    fwrite(bytes, 1, sizeof(bytes), file);
}

// Write a 32-bit value in little-endian byte order.
static void wav_put_le32(FILE *file, uint32_t value) {
    unsigned char bytes[4];
    bytes[0] = (unsigned char)(value & 0xFF);
    bytes[1] = (unsigned char)((value >> 8) & 0xFF);
    bytes[2] = (unsigned char)((value >> 16) & 0xFF);
    bytes[3] = (unsigned char)((value >> 24) & 0xFF);
    fwrite(bytes, 1, sizeof(bytes), file);
}

// Write the RIFF/WAVE header for uncompressed PCM: the RIFF chunk, a 16-byte
// "fmt " chunk and the "data" chunk header.  The sample data itself is not
// written here.
//
// All multi-byte fields are emitted explicitly in little-endian order, as the
// WAV format requires, so the output does not depend on the host byte order.
// Does nothing when file or audio is NULL.
void write_wav_header(FILE *file, PCMAudio *audio) {
    if (!file || !audio) return;

    uint16_t bytes_per_sample = audio->bits_per_sample / 8;

    // RIFF chunk: size counts everything after this field (36 header bytes
    // plus the sample data)
    fwrite("RIFF", 1, 4, file);
    wav_put_le32(file, 36 + audio->data_size);
    fwrite("WAVE", 1, 4, file);

    // fmt sub-chunk
    fwrite("fmt ", 1, 4, file);
    wav_put_le32(file, 16);                     // fmt chunk size
    wav_put_le16(file, 1);                      // audio format 1 = PCM
    wav_put_le16(file, audio->channels);
    wav_put_le32(file, audio->sample_rate);

    // bytes per second and bytes per frame
    uint32_t byte_rate = audio->sample_rate * audio->channels * bytes_per_sample;
    wav_put_le32(file, byte_rate);

    uint16_t block_align = audio->channels * bytes_per_sample;
    wav_put_le16(file, block_align);
    wav_put_le16(file, audio->bits_per_sample);

    // data sub-chunk header
    fwrite("data", 1, 4, file);
    wav_put_le32(file, audio->data_size);
}

2. 视频基础理论

2.1 视觉感知原理

人眼的视觉感知特性决定了视频技术的基础参数：

# 视觉感知特性模拟
import numpy as np
import matplotlib.pyplot as plt

class VisualPerception:
    """Small model of visual-perception parameters used in the article.

    Holds a frame-rate threshold, a spatial resolution and per-channel
    luminance weights, and offers helpers that compute luminance, a simple
    blur factor and a printed RGB-to-YUV conversion table.
    """

    def __init__(self):
        # Perception parameters
        self.temporal_resolution = 24  # frame-rate threshold
        self.spatial_resolution = (1920, 1080)  # spatial resolution
        self.color_sensitivity = dict(red=0.299, green=0.587, blue=0.114)

    def calculate_luminance(self, rgb):
        """Return the weighted sum of the red, green and blue components."""
        red, green, blue = rgb
        weights = self.color_sensitivity
        luminance = weights['red'] * red + weights['green'] * green
        luminance = luminance + weights['blue'] * blue
        return luminance

    def simulate_motion_blur(self, frame_rate):
        """Return a blur factor for the given frame rate.

        The factor is ``1 - frame_rate / temporal_resolution`` for frame
        rates below the threshold and 0.0 otherwise.
        """
        threshold = self.temporal_resolution
        if not (frame_rate < threshold):
            return 0.0
        return 1.0 - (frame_rate / threshold)

    def demonstrate_color_space_conversion(self):
        """Print the YUV values of five reference RGB colours."""
        # RGB -> YUV conversion matrix (rows produce Y, U and V)
        rgb_to_yuv = np.array([
            [0.299, 0.587, 0.114],
            [-0.14713, -0.28886, 0.436],
            [0.615, -0.51499, -0.10001],
        ])

        # Red, green, blue, white and mid grey as 8-bit RGB triples
        reference_colors = (
            [255, 0, 0],
            [0, 255, 0],
            [0, 0, 255],
            [255, 255, 255],
            [128, 128, 128],
        )

        print("RGB到YUV颜色空间转换:")
        print("RGB\t\t\tYUV")

        for color in reference_colors:
            # Scale to [0, 1] before applying the matrix
            unit_rgb = np.array(color) / 255.0
            yuv = np.dot(rgb_to_yuv, unit_rgb)
            print(f"{color}\t{yuv}")

# Run the demonstration
perception = VisualPerception()
perception.demonstrate_color_space_conversion()

2.2 数字视频基础

视频帧结构

// A single video frame and its metadata
typedef struct {
    int width;
    int height;
    int channels;        // number of colour channels
    int bits_per_pixel;
    uint8_t *data;       // pixel data
    int64_t timestamp;   // timestamp
    int frame_type;      // frame type (I, P, B)
} VideoFrame;

// Frame type constants
typedef enum {
    FRAME_TYPE_I = 0,    // key frame
    FRAME_TYPE_P = 1,    // predicted frame
    FRAME_TYPE_B = 2     // bidirectionally predicted frame
} FrameType;

// Allocate a video frame with a zero-filled pixel buffer of
// width * height * channels * (bits_per_pixel / 8) bytes.
//
// The new frame has timestamp 0 and type FRAME_TYPE_I.
// Returns NULL for non-positive dimensions or channel counts, for
// bits_per_pixel below 8 (which would give an empty buffer), when the size
// computation would overflow, or when an allocation fails.
VideoFrame* create_video_frame(int width, int height, int channels, int bits_per_pixel) {
    if (width <= 0 || height <= 0 || channels <= 0 || bits_per_pixel < 8) {
        return NULL;
    }

    // Multiply in size_t one factor at a time so that an overflow is
    // detected instead of wrapping the int product
    size_t factors[3];
    factors[0] = (size_t)height;
    factors[1] = (size_t)channels;
    factors[2] = (size_t)(bits_per_pixel / 8);

    size_t data_size = (size_t)width;
    for (int i = 0; i < 3; i++) {
        if (data_size > SIZE_MAX / factors[i]) return NULL;
        data_size *= factors[i];
    }

    VideoFrame *frame = (VideoFrame*)malloc(sizeof(VideoFrame));
    if (!frame) return NULL;

    frame->width = width;
    frame->height = height;
    frame->channels = channels;
    frame->bits_per_pixel = bits_per_pixel;

    // Zero-filled so the frame never exposes uninitialized memory
    frame->data = (uint8_t*)calloc(data_size, 1);
    if (!frame->data) {
        free(frame);
        return NULL;
    }

    frame->timestamp = 0;
    frame->frame_type = FRAME_TYPE_I;

    return frame;
}

// Round a computed component to the nearest 8-bit value, clamping to 0..255.
static uint8_t yuv_round_to_u8(double value) {
    if (value <= 0.0) return 0;
    if (value >= 255.0) return 255;
    return (uint8_t)(value + 0.5);
}

// Convert a packed RGB frame (3 bytes per pixel) into planar YUV 4:2:0.
//
// Output layout in yuv_frame->data: a width*height Y plane, followed by a
// (width/2)*(height/2) U plane and a V plane of the same size.  Chroma is
// taken from the top-left pixel of every 2x2 block.
//
// The function returns without writing anything when a frame or its data is
// NULL, or when the dimensions are not positive and even: with an odd width
// or height the 2x2 walk would run past the end of the chroma planes.
// NOTE(review): the capacity of yuv_frame->data cannot be checked here; the
// caller must provide at least width*height*3/2 bytes.
void rgb_to_yuv420(VideoFrame *rgb_frame, VideoFrame *yuv_frame) {
    if (!rgb_frame || !yuv_frame) return;
    if (!rgb_frame->data || !yuv_frame->data) return;

    int width = rgb_frame->width;
    int height = rgb_frame->height;
    if (width <= 0 || height <= 0 || (width % 2) != 0 || (height % 2) != 0) return;

    uint8_t *rgb = rgb_frame->data;
    uint8_t *y_plane = yuv_frame->data;
    uint8_t *u_plane = y_plane + width * height;
    uint8_t *v_plane = u_plane + (width * height) / 4;

    // Luma for every pixel.  Values are rounded, not truncated: with
    // truncation a floating-point sum just below 255 would turn pure white
    // into Y = 254.
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            int rgb_idx = (i * width + j) * 3;
            int y_idx = i * width + j;

            uint8_t r = rgb[rgb_idx];
            uint8_t g = rgb[rgb_idx + 1];
            uint8_t b = rgb[rgb_idx + 2];

            y_plane[y_idx] = yuv_round_to_u8(0.299 * r + 0.587 * g + 0.114 * b);
        }
    }

    // Chroma, one sample per 2x2 block (4:2:0 subsampling)
    for (int i = 0; i < height; i += 2) {
        for (int j = 0; j < width; j += 2) {
            int rgb_idx = (i * width + j) * 3;
            int uv_idx = (i / 2) * (width / 2) + (j / 2);

            uint8_t r = rgb[rgb_idx];
            uint8_t g = rgb[rgb_idx + 1];
            uint8_t b = rgb[rgb_idx + 2];

            u_plane[uv_idx] = yuv_round_to_u8(128 - 0.168736 * r - 0.331264 * g + 0.5 * b);
            v_plane[uv_idx] = yuv_round_to_u8(128 + 0.5 * r - 0.418688 * g - 0.081312 * b);
        }
    }
}

视频压缩基础

// State for a simple frame-difference compression scheme
typedef struct {
    VideoFrame *reference_frame;
    int threshold;
} FrameDifferenceEncoder;

// Allocate a frame-difference encoder with the given threshold and no
// reference frame yet.  Returns NULL when the allocation fails.
FrameDifferenceEncoder* create_frame_diff_encoder(int threshold) {
    FrameDifferenceEncoder *enc = (FrameDifferenceEncoder*)malloc(sizeof *enc);

    if (enc != NULL) {
        enc->threshold = threshold;
        enc->reference_frame = NULL;
    }

    return enc;
}

// Compute the per-sample absolute difference between two frames.
//
// Every byte of diff->data receives |current - reference| for the matching
// byte.  The return value is the number of samples (one per channel of each
// pixel) whose difference exceeds a fixed threshold of 10.
//
// Returns -1 when a frame or its data is NULL, or when reference or diff does
// not have the same width, height and channel count as current; indexing all
// three buffers with one index would otherwise read or write out of bounds.
int calculate_frame_difference(VideoFrame *current, VideoFrame *reference, VideoFrame *diff) {
    // Fixed change threshold used by this function
    const int change_threshold = 10;

    if (!current || !reference || !diff) return -1;
    if (!current->data || !reference->data || !diff->data) return -1;

    int width = current->width;
    int height = current->height;
    int channels = current->channels;

    if (reference->width != width || reference->height != height ||
        reference->channels != channels) return -1;
    if (diff->width != width || diff->height != height ||
        diff->channels != channels) return -1;

    // An empty frame has nothing to compare
    if (width <= 0 || height <= 0 || channels <= 0) return 0;

    // The buffers are contiguous, so the row/column/channel nest collapses
    // into one pass over all samples
    size_t total = (size_t)width * (size_t)height * (size_t)channels;
    int changed_pixels = 0;

    for (size_t idx = 0; idx < total; idx++) {
        int diff_value = abs(current->data[idx] - reference->data[idx]);
        diff->data[idx] = (uint8_t)diff_value;

        if (diff_value > change_threshold) {
            changed_pixels++;
        }
    }

    return changed_pixels;
}

// Motion estimation (simplified block matching)
typedef struct {
    int x, y;        // motion vector
    int block_size;  // block size
    int search_range; // search range
} MotionVector;

// Exhaustive block-matching motion search.
//
// For the block_size x block_size block of `current` whose top-left corner is
// (block_x, block_y), every displacement (dx, dy) within +/- search_range is
// tried, and the one with the smallest SAD (sum of absolute differences)
// against `reference` is returned.  Candidates that fall partly outside the
// reference frame are skipped; on equal SAD the first candidate in scan order
// (dy outer, dx inner) is kept.
//
// A zero vector is returned when the arguments are unusable: NULL frames or
// data, non-positive block size, negative search range, differing channel
// counts, or a current block that is not fully inside the current frame.
MotionVector estimate_motion(VideoFrame *current, VideoFrame *reference, 
                           int block_x, int block_y, int block_size, int search_range) {
    MotionVector mv = {0, 0, block_size, search_range};

    if (!current || !reference || !current->data || !reference->data) return mv;
    if (block_size <= 0 || search_range < 0) return mv;
    if (current->channels <= 0 || current->channels != reference->channels) return mv;

    // The block read from the current frame must lie inside that frame,
    // otherwise the SAD loop reads out of bounds
    if (block_x < 0 || block_y < 0 ||
        block_x + block_size > current->width ||
        block_y + block_size > current->height) {
        return mv;
    }

    int min_sad = INT_MAX;

    // Search the window for the best match
    for (int dy = -search_range; dy <= search_range; dy++) {
        for (int dx = -search_range; dx <= search_range; dx++) {
            int ref_x = block_x + dx;
            int ref_y = block_y + dy;

            // Skip candidates that leave the reference frame
            if (ref_x < 0 || ref_y < 0 || 
                ref_x + block_size > reference->width || 
                ref_y + block_size > reference->height) {
                continue;
            }

            // SAD over all pixels and channels of the block
            int sad = 0;
            for (int i = 0; i < block_size; i++) {
                for (int j = 0; j < block_size; j++) {
                    int curr_idx = ((block_y + i) * current->width + (block_x + j)) * current->channels;
                    int ref_idx = ((ref_y + i) * reference->width + (ref_x + j)) * reference->channels;

                    for (int c = 0; c < current->channels; c++) {
                        sad += abs(current->data[curr_idx + c] - reference->data[ref_idx + c]);
                    }
                }
            }

            if (sad < min_sad) {
                min_sad = sad;
                mv.x = dx;
                mv.y = dy;
            }
        }
    }

    return mv;
}

3. 数字信号处理技术

3.1 傅里叶变换

傅里叶变换是音视频处理的核心数学工具：

#include <complex.h>
#include <math.h>

// Complex number type used by the transforms
typedef double complex Complex;

// Direct discrete Fourier transform:
//   output[k] = sum over n of input[n] * exp(-2*pi*i*k*n/N)
// evaluated with two nested loops over the N bins and the N input samples.
void dft(Complex *input, Complex *output, int N) {
    int k = 0;
    while (k < N) {
        Complex *bin = &output[k];
        *bin = 0.0 + 0.0*I;

        for (int n = 0; n < N; n++) {
            double theta = -2.0 * M_PI * k * n / N;
            // cos + i*sin is the unit phasor for this sample and bin
            *bin += input[n] * (cos(theta) + sin(theta)*I);
        }

        k++;
    }
}

// Direct inverse discrete Fourier transform:
//   output[n] = (1/N) * sum over k of input[k] * exp(+2*pi*i*k*n/N)
void idft(Complex *input, Complex *output, int N) {
    int n = 0;
    while (n < N) {
        Complex *sample = &output[n];
        *sample = 0.0 + 0.0*I;

        for (int k = 0; k < N; k++) {
            double theta = 2.0 * M_PI * k * n / N;
            *sample += input[k] * (cos(theta) + sin(theta)*I);
        }

        // The 1/N scaling is applied on the inverse transform
        *sample /= N;
        n++;
    }
}

// FFT (fast Fourier transform), recursive radix-2 Cooley-Tukey.
//
// Computes the same result as dft(input, output, N) for any N >= 1.
// An even length is split into its even- and odd-indexed halves, which are
// transformed recursively and merged with twiddle factors.  An odd length
// cannot be split in two, so it is handed to the direct dft(); lengths that
// are powers of two therefore run entirely through the fast path, and every
// other length still produces a correct result.
//
// input and output must not overlap.  If the scratch allocation fails the
// transform falls back to dft(), which needs no extra memory.
void fft(Complex *input, Complex *output, int N) {
    if (N <= 1) {
        if (N == 1) output[0] = input[0];
        return;
    }

    // Halving an odd length would drop the last sample and leave
    // output[N-1] unwritten
    if (N % 2 != 0) {
        dft(input, output, N);
        return;
    }

    int half = N / 2;

    // One scratch block holds the four half-length work arrays
    Complex *scratch = (Complex*)malloc((size_t)4 * half * sizeof(Complex));
    if (!scratch) {
        dft(input, output, N);
        return;
    }

    Complex *even = scratch;
    Complex *odd = scratch + half;
    Complex *even_fft = scratch + 2 * half;
    Complex *odd_fft = scratch + 3 * half;

    // Divide: separate even- and odd-indexed samples
    for (int i = 0; i < half; i++) {
        even[i] = input[2*i];
        odd[i] = input[2*i + 1];
    }

    // Conquer: transform both halves
    fft(even, even_fft, half);
    fft(odd, odd_fft, half);

    // Combine with the butterfly X[k] = E[k] + W^k O[k], X[k+N/2] = E[k] - W^k O[k]
    for (int k = 0; k < half; k++) {
        double angle = -2.0 * M_PI * k / N;
        Complex twiddle = cos(angle) + sin(angle)*I;
        Complex t = twiddle * odd_fft[k];

        output[k] = even_fft[k] + t;
        output[k + half] = even_fft[k] - t;
    }

    free(scratch);
}

// Print the spectrum of a real signal.
//
// The signal is transformed with fft() and, for each of the first N/2 bins,
// the frequency (k * sample_rate / N), the magnitude and the phase in degrees
// are printed when the magnitude exceeds 0.01.  Magnitudes are the raw
// transform values; they are not divided by N.
//
// Does nothing when signal is NULL, N is not positive, or the work buffers
// cannot be allocated.
void analyze_spectrum(double *signal, int N, double sample_rate) {
    if (!signal || N <= 0) return;

    Complex *input = (Complex*)malloc(N * sizeof(Complex));
    Complex *output = (Complex*)malloc(N * sizeof(Complex));
    if (!input || !output) {
        // free(NULL) is a no-op, so both pointers can be released blindly
        free(input);
        free(output);
        return;
    }

    // Promote the real samples to complex values
    for (int i = 0; i < N; i++) {
        input[i] = signal[i] + 0.0*I;
    }

    fft(input, output, N);

    printf("频谱分析结果:\n");
    printf("频率(Hz)\t幅度\t\t相位(度)\n");

    // Only the first half is listed; for real input the second half mirrors it
    for (int k = 0; k < N/2; k++) {
        double frequency = k * sample_rate / N;
        double magnitude = cabs(output[k]);
        double phase = carg(output[k]) * 180.0 / M_PI;

        if (magnitude > 0.01) {  // list significant components only
            printf("%.2f\t\t%.4f\t\t%.2f\n", frequency, magnitude, phase);
        }
    }

    free(input);
    free(output);
}

3.2 数字滤波器

// IIR filter state
typedef struct {
    double *a_coeffs;  // feedback coefficients
    double *b_coeffs;  // feed-forward coefficients
    double *x_history; // input history
    double *y_history; // output history
    int order;         // filter order
    int history_index; // current position in the history buffers
} IIRFilter;

// Create an IIR filter from order+1 feedback (a) and feed-forward (b)
// coefficients.  The coefficients are copied, so the caller keeps ownership
// of the arrays passed in; the history buffers start out zeroed.
//
// The filter evaluates y[n] = sum(b[i] x[n-i]) - sum(a[i] y[n-i], i >= 1),
// which assumes a[0] == 1.  To accept coefficient sets in the general form
// a[0] y[n] = ..., all coefficients are divided by a[0] when it is non-zero.
// Sets that already have a[0] == 1 are stored unchanged; a zero a[0] cannot
// be normalized and is also stored unchanged.
//
// Returns NULL when a coefficient pointer is NULL, order is negative, or an
// allocation fails.
IIRFilter* create_iir_filter(double *a_coeffs, double *b_coeffs, int order) {
    if (!a_coeffs || !b_coeffs || order < 0) return NULL;

    IIRFilter *filter = (IIRFilter*)malloc(sizeof(IIRFilter));
    if (!filter) return NULL;

    filter->order = order;
    filter->history_index = 0;

    filter->a_coeffs = (double*)malloc((order + 1) * sizeof(double));
    filter->b_coeffs = (double*)malloc((order + 1) * sizeof(double));
    filter->x_history = (double*)calloc(order + 1, sizeof(double));
    filter->y_history = (double*)calloc(order + 1, sizeof(double));

    if (!filter->a_coeffs || !filter->b_coeffs || 
        !filter->x_history || !filter->y_history) {
        // free(NULL) is harmless, so whatever was allocated is released
        free(filter->a_coeffs);
        free(filter->b_coeffs);
        free(filter->x_history);
        free(filter->y_history);
        free(filter);
        return NULL;
    }

    // Copy the coefficients, normalizing by a[0] where possible
    double a0 = a_coeffs[0];
    double divisor = (a0 != 0.0) ? a0 : 1.0;
    for (int i = 0; i <= order; i++) {
        filter->a_coeffs[i] = a_coeffs[i] / divisor;
        filter->b_coeffs[i] = b_coeffs[i] / divisor;
    }

    return filter;
}

// Run one input sample through the filter and return the output sample.
// The input and output histories are circular buffers of order+1 entries
// that share the same position index.  Returns 0.0 for a NULL filter.
double iir_filter_process(IIRFilter *filter, double input) {
    if (filter == NULL) return 0.0;

    const int ring = filter->order + 1;          // length of both histories
    const int head = filter->history_index;      // slot of the current sample

    // Store the new input at the current position
    filter->x_history[head] = input;

    double acc = 0.0;

    // Feed-forward part: current and past inputs, newest first
    for (int tap = 0; tap < ring; tap++) {
        acc += filter->b_coeffs[tap] * filter->x_history[(head - tap + ring) % ring];
    }

    // Feedback part: past outputs only, hence starting at tap 1
    for (int tap = 1; tap < ring; tap++) {
        acc -= filter->a_coeffs[tap] * filter->y_history[(head - tap + ring) % ring];
    }

    // Record the output and advance the shared position
    filter->y_history[head] = acc;
    filter->history_index = (head + 1) % ring;

    return acc;
}

// Design a simplified low-pass filter.
//
// Only order 1 is implemented.  The result is the one-pole smoother
//   y[n] = (1 - alpha) * x[n] + alpha * y[n-1],  alpha = exp(-omega_c),
// with omega_c = 2*pi*cutoff_freq/sample_rate.
//
// Returns NULL for any other order (no coefficients exist for it, and the
// filter would otherwise be built from uninitialized memory), for a
// non-positive sample rate, for a negative cutoff frequency (which would put
// the pole outside the unit circle), or when the filter cannot be allocated.
IIRFilter* design_butterworth_lowpass(double cutoff_freq, double sample_rate, int order) {
    if (order != 1) return NULL;
    // Negated comparisons so that NaN arguments are rejected as well
    if (!(sample_rate > 0.0) || !(cutoff_freq >= 0.0)) return NULL;

    double omega_c = 2.0 * M_PI * cutoff_freq / sample_rate;
    double alpha = exp(-omega_c);

    // create_iir_filter copies the coefficients, so automatic storage is
    // sufficient here
    double a_coeffs[2] = {1.0, -alpha};
    double b_coeffs[2] = {1.0 - alpha, 0.0};

    return create_iir_filter(a_coeffs, b_coeffs, order);
}

3.3 音频效果处理

// Echo effect state
typedef struct {
    double *delay_buffer;
    int buffer_size;
    int write_index;
    double delay_time;    // delay time (seconds)
    double feedback;      // feedback coefficient
    double mix;          // mix ratio
    double sample_rate;
} EchoEffect;

// Create an echo effect with a zeroed delay line.
//
//   delay_time   echo delay in seconds (>= 0)
//   feedback     gain applied to the delayed signal fed back into the line
//   mix          gain of the delayed signal in the output
//   sample_rate  samples per second (> 0)
//
// echo_process() reads the slot it is about to overwrite, so the delay in
// samples equals the buffer length.  The buffer is therefore sized to
// delay_time * sample_rate samples, with a minimum of one; adding an extra
// slot would make every echo one sample late.
//
// Returns NULL for a negative or NaN delay, a non-positive sample rate, a
// delay too long for the buffer index type, or an allocation failure.
// NOTE(review): feedback is not range-checked; a magnitude of 1 or more makes
// the echo grow without bound.
EchoEffect* create_echo_effect(double delay_time, double feedback, 
                              double mix, double sample_rate) {
    // Negated comparisons so that NaN arguments are rejected as well
    if (!(delay_time >= 0.0) || !(sample_rate > 0.0)) return NULL;

    double delay_samples = delay_time * sample_rate;
    // Keep the length comfortably inside the range of int
    if (!(delay_samples < 1.0e9)) return NULL;

    EchoEffect *echo = (EchoEffect*)malloc(sizeof(EchoEffect));
    if (!echo) return NULL;

    echo->delay_time = delay_time;
    echo->feedback = feedback;
    echo->mix = mix;
    echo->sample_rate = sample_rate;
    echo->write_index = 0;

    echo->buffer_size = (int)delay_samples;
    if (echo->buffer_size < 1) echo->buffer_size = 1;

    echo->delay_buffer = (double*)calloc(echo->buffer_size, sizeof(double));
    if (!echo->delay_buffer) {
        free(echo);
        return NULL;
    }

    return echo;
}

// Process one sample through the echo: the output is the input plus the
// delayed signal scaled by mix, and the delay line receives the input plus
// the delayed signal scaled by feedback.  A NULL effect passes the input
// through unchanged.
double echo_process(EchoEffect *echo, double input) {
    if (echo == NULL) return input;

    // The slot about to be overwritten holds the oldest (delayed) sample
    double *slot = &echo->delay_buffer[echo->write_index];
    const double delayed = *slot;

    *slot = input + echo->feedback * delayed;

    // Advance the write position around the circular buffer
    echo->write_index = (echo->write_index + 1) % echo->buffer_size;

    return input + echo->mix * delayed;
}

// Reverb effect (simplified Schroeder reverb)
typedef struct {
    IIRFilter **comb_filters;
    IIRFilter **allpass_filters;
    int num_combs;
    int num_allpass;
    double room_size;
    double damping;
} ReverbEffect;

// 创建混响效果
ReverbEffect* create_reverb_effect(double room_size, double damping, double sample_rate) {
    ReverbEffect *reverb = (ReverbEffect*)malloc(sizeof(ReverbEffect));
    if (!reverb) return NULL;
    
    reverb->room_size = room_size;
    reverb->damping = damping;
    reverb->num_combs = 4;
    reverb->num_allpass = 2;
    
    // 创建梳状滤波器
    reverb->comb_filters = (IIRFilter**)malloc(reverb->num_combs * sizeof(IIRFilter*));
    
    // 梳状滤波器延迟时间（毫秒）
    double comb_delays[] = {29.7, 37.1, 41.1, 43.7};
    
    for (int i = 0; i < reverb->num_combs; i++) {
        double delay_samples = (comb_delays[i] / 1000.0) * sample_rate * room_size;
        // 简化的梳状滤波器实现
        double a_coeffs[] = {1.0, 0.0};
        double b_coeffs[] = {1.0, 0.0};
        reverb->comb_filters[i] = create_iir_filter(a_coeffs, b_coeffs, 1);
    }
    
    // 创建全通滤波器
    reverb->allpass_filters = (IIRFilter**)malloc(reverb->num_allpass * sizeof(IIRFilter*));
    
    double allpass_delays[] = {5.0, 1.7};
    
    for (int i = 0; i < reverb->num_allpass; i++) {
        double delay_samples = (allpass_delays[i] / 1000.0) * sample_rate;
        double a_coeffs[] = {1.0, -0.7};
        double b_coeffs[] = {-0.7, 1.0};
        reverb->allpass_filters[i] = create_iir_filter(a_coeffs, b_coeffs, 1);
    }
    
    return reverb;
}

4. 实时处理与优化

4.1 缓冲区管理

// Ring buffer of double samples guarded by a mutex
typedef struct {
    double *buffer;
    int size;
    int read_index;
    int write_index;
    int count;
    pthread_mutex_t mutex;
} RingBuffer;

// Create an empty ring buffer with room for `size` samples.
//
// Returns NULL when size is not positive (the index arithmetic takes values
// modulo size), when an allocation fails, or when the mutex cannot be
// initialized.
RingBuffer* create_ring_buffer(int size) {
    if (size <= 0) return NULL;

    RingBuffer *rb = (RingBuffer*)malloc(sizeof(RingBuffer));
    if (!rb) return NULL;

    rb->buffer = (double*)malloc(size * sizeof(double));
    if (!rb->buffer) {
        free(rb);
        return NULL;
    }

    rb->size = size;
    rb->read_index = 0;
    rb->write_index = 0;
    rb->count = 0;

    // A buffer without a usable mutex must not be handed out
    if (pthread_mutex_init(&rb->mutex, NULL) != 0) {
        free(rb->buffer);
        free(rb);
        return NULL;
    }

    return rb;
}

// Append `length` samples to the ring buffer, all or nothing.
//
// Returns the number of samples written (length), or -1 when rb or data is
// NULL, length is negative, or the free space is smaller than length.  The
// copy runs while holding the buffer's mutex.
int ring_buffer_write(RingBuffer *rb, double *data, int length) {
    if (!rb || !data || length < 0) return -1;

    pthread_mutex_lock(&rb->mutex);

    // Compared against the free space so the test cannot overflow for a
    // very large length
    if (length > rb->size - rb->count) {
        pthread_mutex_unlock(&rb->mutex);
        return -1;  // not enough room
    }

    for (int i = 0; i < length; i++) {
        rb->buffer[rb->write_index] = data[i];
        rb->write_index = (rb->write_index + 1) % rb->size;
        rb->count++;
    }

    pthread_mutex_unlock(&rb->mutex);
    return length;
}

// Remove up to `length` samples from the ring buffer into data.
//
// Returns the number of samples actually read, which is smaller than length
// when fewer are stored (0 when the buffer is empty), or -1 when rb or data
// is NULL or length is negative.  The samples returned are consumed.  The
// copy runs while holding the buffer's mutex.
int ring_buffer_read(RingBuffer *rb, double *data, int length) {
    if (!rb || !data || length < 0) return -1;

    pthread_mutex_lock(&rb->mutex);

    // Partial reads are allowed: take whatever is available
    int read_count = (length < rb->count) ? length : rb->count;

    for (int i = 0; i < read_count; i++) {
        data[i] = rb->buffer[rb->read_index];
        rb->read_index = (rb->read_index + 1) % rb->size;
        rb->count--;
    }

    pthread_mutex_unlock(&rb->mutex);
    return read_count;
}

4.2 实时音频处理框架

// Callback type for user audio processing: receives an input frame, fills
// the output frame, and is given the frame length and an opaque user pointer
typedef void (*AudioCallback)(double *input, double *output, int frames, void *user_data);

// Audio processing engine
typedef struct {
    AudioCallback callback;      // user processing function
    void *user_data;             // passed through to the callback unchanged
    RingBuffer *input_buffer;    // samples waiting to be processed
    RingBuffer *output_buffer;   // processed samples
    int sample_rate;
    int channels;
    int buffer_size;             // samples handed to the callback per call
    // NOTE(review): written by start_audio_engine and polled by the
    // processing thread without synchronization - confirm this is acceptable
    bool is_running;
    pthread_t processing_thread;
} AudioEngine;

// Worker thread of the audio engine.
//
// While engine->is_running is set, the thread collects buffer_size samples
// from the input ring buffer, passes them to the user callback, and writes
// the produced frame to the output ring buffer.
//
// ring_buffer_read() consumes whatever it returns, even when that is less
// than a full frame.  Partial reads are therefore accumulated in the frame
// buffer until it is complete; discarding them would lose samples every time
// the input momentarily runs short.
//
// If the output buffer has no room for a whole frame, that frame is dropped.
// The thread exits immediately when the engine is unusable or the frame
// buffers cannot be allocated.  Always returns NULL.
void* audio_processing_thread(void *arg) {
    AudioEngine *engine = (AudioEngine*)arg;
    if (!engine || engine->buffer_size <= 0) return NULL;

    double *input_frame = (double*)malloc(engine->buffer_size * sizeof(double));
    double *output_frame = (double*)malloc(engine->buffer_size * sizeof(double));
    if (!input_frame || !output_frame) {
        free(input_frame);
        free(output_frame);
        return NULL;
    }

    int filled = 0;  // samples of the current frame collected so far

    while (engine->is_running) {
        // Ask only for the part of the frame that is still missing
        int read_count = ring_buffer_read(engine->input_buffer,
                                          input_frame + filled,
                                          engine->buffer_size - filled);
        if (read_count > 0) {
            filled += read_count;
        }

        if (filled == engine->buffer_size) {
            // Full frame available: run the user processing function
            engine->callback(input_frame, output_frame, engine->buffer_size, engine->user_data);
            filled = 0;

            // Hand the result to the output side
            ring_buffer_write(engine->output_buffer, output_frame, engine->buffer_size);
        } else {
            // Not enough input yet; sleep briefly
            usleep(1000);  // 1ms
        }
    }

    free(input_frame);
    free(output_frame);

    return NULL;
}

// Start the engine's processing thread.
//
// Returns 0 on success.  Returns -1 when engine is NULL, when it is missing
// something the thread dereferences unconditionally (the callback or either
// ring buffer), when buffer_size is not positive, or when the thread cannot
// be created; is_running is left false in all failure cases that reach it.
int start_audio_engine(AudioEngine *engine) {
    if (!engine) return -1;

    // The thread uses these without further checks, so refuse to start
    // without them
    if (!engine->callback || !engine->input_buffer || !engine->output_buffer ||
        engine->buffer_size <= 0) {
        return -1;
    }

    // Must be set before the thread starts: the thread's loop condition
    // reads it immediately
    engine->is_running = true;

    if (pthread_create(&engine->processing_thread, NULL, audio_processing_thread, engine) != 0) {
        engine->is_running = false;
        return -1;
    }

    return 0;
}

5. 应用实例

5.1 简单的音频分析器

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from scipy.signal import spectrogram

class AudioAnalyzer:
    """Basic analysis of a mono audio signal: spectrum, spectrogram, pitch.

    ``audio_data`` holds the samples as a 1-D array and ``sample_rate`` the
    matching rate in Hz.  Both are filled in by :meth:`load_audio`, or may be
    assigned directly before calling the analysis methods.
    """

    def __init__(self, sample_rate=44100):
        self.sample_rate = sample_rate
        # No signal yet; set by load_audio() or assigned by the caller
        self.audio_data = None

    def load_audio(self, filename):
        """Load a WAV file, mix it down to mono and normalize to [-1, 1].

        The file's sample rate replaces ``self.sample_rate``.  Multi-channel
        audio is averaged across channels.  The result is float32, scaled so
        that the largest absolute sample is 1.0; silent or empty audio is
        returned unscaled.

        Returns the normalized samples, which are also stored in
        ``self.audio_data``.
        """
        self.sample_rate, samples = wavfile.read(filename)

        # Mix down to mono
        if samples.ndim > 1:
            samples = np.mean(samples, axis=1)

        # Convert to float BEFORE taking absolute values: for fixed-width
        # integers the absolute value of the most negative sample (e.g.
        # -32768 for int16) overflows and would yield a wrong peak.
        samples = samples.astype(np.float32)

        peak = float(np.max(np.abs(samples))) if samples.size else 0.0
        if peak > 0.0:
            # Guarded so that silence does not divide by zero
            samples = samples / peak

        self.audio_data = samples
        return self.audio_data

    def analyze_spectrum(self, start_time=0, duration=1):
        """Return the positive-frequency magnitude spectrum of a segment.

        The segment starts at ``start_time`` seconds and lasts ``duration``
        seconds.  Returns ``(frequencies, magnitude)``, the frequencies in Hz
        (strictly positive bins only) and the FFT magnitudes at those bins.
        """
        start_sample = int(start_time * self.sample_rate)
        end_sample = int((start_time + duration) * self.sample_rate)

        segment = self.audio_data[start_sample:end_sample]

        fft_result = np.fft.fft(segment)
        frequencies = np.fft.fftfreq(len(segment), 1/self.sample_rate)

        # Keep only the positive-frequency half
        positive_freq_idx = frequencies > 0
        frequencies = frequencies[positive_freq_idx]
        magnitude = np.abs(fft_result[positive_freq_idx])

        return frequencies, magnitude

    def generate_spectrogram(self):
        """Plot the spectrogram of the whole signal in dB and return it.

        Returns ``(frequencies, times, Sxx)`` exactly as produced by
        ``scipy.signal.spectrogram``.
        """
        frequencies, times, Sxx = spectrogram(self.audio_data, self.sample_rate)

        # Floor the power at the smallest positive value of its dtype so that
        # exact zeros become a large negative dB value instead of -inf
        floor = np.finfo(Sxx.dtype).tiny
        power_db = 10 * np.log10(np.maximum(Sxx, floor))

        plt.figure(figsize=(12, 8))
        plt.pcolormesh(times, frequencies, power_db)
        plt.ylabel('频率 (Hz)')
        plt.xlabel('时间 (s)')
        plt.title('音频频谱图')
        plt.colorbar(label='功率谱密度 (dB)')
        plt.show()

        return frequencies, times, Sxx

    def detect_pitch(self, start_time=0, duration=0.1):
        """Estimate the fundamental frequency of a segment by autocorrelation.

        The strongest autocorrelation peak is searched among lags that
        correspond to 80-800 Hz, and ``sample_rate / lag`` is returned.

        Raises:
            ValueError: if the segment is too short to contain any lag in the
                search range.
        """
        start_sample = int(start_time * self.sample_rate)
        end_sample = int((start_time + duration) * self.sample_rate)

        segment = self.audio_data[start_sample:end_sample]

        # Autocorrelation; keep the non-negative lags only
        correlation = np.correlate(segment, segment, mode='full')
        correlation = correlation[len(correlation)//2:]

        # Lag range for 800 Hz (shortest period) down to 80 Hz (longest).
        # The lower bound is at least 1 so that lag 0 is never selected,
        # which would divide by zero below.
        min_period = max(1, int(self.sample_rate / 800))
        max_period = int(self.sample_rate / 80)

        search_window = correlation[min_period:max_period]
        if search_window.size == 0:
            raise ValueError(
                "segment is too short for pitch detection in the 80-800 Hz range"
            )

        peak_idx = np.argmax(search_window) + min_period
        fundamental_freq = self.sample_rate / peak_idx

        return fundamental_freq

# Usage example
if __name__ == "__main__":
    analyzer = AudioAnalyzer()

    # Generate a test signal.  Sample instants are n / sample_rate;
    # np.linspace with its default endpoint=True would space the samples by
    # duration / (N - 1) instead and slightly shift every generated frequency.
    sample_rate = 44100
    duration = 2.0
    t = np.arange(int(sample_rate * duration)) / sample_rate

    # Composite signal: 440 Hz + 880 Hz + 1320 Hz (A4 and two harmonics)
    test_signal = (np.sin(2 * np.pi * 440 * t) + 
                  0.5 * np.sin(2 * np.pi * 880 * t) + 
                  0.25 * np.sin(2 * np.pi * 1320 * t))

    analyzer.audio_data = test_signal
    analyzer.sample_rate = sample_rate

    # Spectrum of the first second
    freqs, mags = analyzer.analyze_spectrum(0, 1)

    # Plot the spectrum; with a one-second segment the bins are 1 Hz apart,
    # so the first 2000 positive bins cover up to 2 kHz
    plt.figure(figsize=(12, 6))
    plt.plot(freqs[:2000], mags[:2000])
    plt.xlabel('频率 (Hz)')
    plt.ylabel('幅度')
    plt.title('音频频谱分析')
    plt.grid(True)
    plt.show()

    # Detect the fundamental frequency
    pitch = analyzer.detect_pitch(0, 0.1)
    print(f"检测到的基音频率: {pitch:.2f} Hz")

总结

本文深入介绍了音视频基础理论与数字信号处理的核心概念：

音频基础：

声音的物理特性和数字化过程
采样定理和量化原理
PCM格式和音频编码基础

视频基础：

视觉感知原理和颜色空间
视频帧结构和压缩技术
运动估计和帧差编码

信号处理：

傅里叶变换和频域分析
数字滤波器设计和实现
音频效果处理算法

实时处理：

缓冲区管理和多线程处理
实时音频处理框架
性能优化技巧

实际应用：

音频分析器实现
频谱分析和基音检测
可视化和调试工具

掌握这些基础理论和技术，为深入学习音视频编解码、流媒体传输、实时通信等高级主题奠定了坚实的基础。在实际开发中，这些概念和算法是构建高质量音视频应用的核心要素。

编外计划 - 日志

To be, or not to be, — that is the question.