音频数据增强:提升音频信号质量的多种技术
在音频处理和机器学习领域,音频数据增强是一种常用的技术,旨在通过对原始音频信号进行各种变换和处理,生成更多样化的训练数据。
这不仅可以提高模型的鲁棒性,还能改善其在真实世界应用中的表现。本文将介绍几种常用的音频数据增强技术,包括时间拉伸、音高变换、带通滤波、均衡器、冲激响应处理、添加回声与延迟、非线性模拟等。
1. 时间拉伸
时间拉伸是一种改变音频信号播放速度而不改变其音高的技术。通过随机选择一个拉伸因子(例如在 0.8 到 1.2 之间),我们可以使音频信号变得更快或更慢。这种处理可以帮助模型适应不同的说话速度或音乐节奏。(为便于演示,下文示例代码使用固定拉伸因子 1.5。)
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
def time_stretch(audio_data, stretch_factor=1.5):
    """Time-stretch an audio signal without changing its pitch.

    Args:
        audio_data: 1-D waveform array.
        stretch_factor: Speed ratio. Values > 1 speed the audio up (shorter
            output); values < 1 slow it down. Defaults to 1.5, matching the
            value previously hard-coded in this demo.

    Returns:
        The time-stretched waveform.
    """
    # librosa's phase-vocoder based stretch keeps pitch constant.
    return librosa.effects.time_stretch(audio_data, rate=stretch_factor)
def plot_signals_and_spectra(original_audio, enhanced_audio, sr):
    """Show waveform and log-frequency spectrogram for the original and stretched signals."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (enhanced_audio, 'Stretched Audio Signal', 'Stretched Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB on a log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spectrum_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spectrum_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: load a sample clip, stretch it, and visualize the effect.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    waveform, sample_rate = librosa.load(audio_path, sr=None)
    stretched = time_stretch(waveform)
    plot_signals_and_spectra(waveform, stretched, sample_rate)
2. 音高变换
音高变换是指在不改变音频信号播放速度的情况下,调整其音高。通过随机选择音高变换的步数(例如 -2、-1、1、2),我们可以模拟不同的音调变化。这对于音乐和语音信号的处理尤为重要。(为便于演示,下文示例代码使用固定步数 6。)
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
def pitch_shift(audio_data, sr, n_steps=6):
    """Shift the pitch of an audio signal without changing its duration.

    Args:
        audio_data: 1-D waveform array.
        sr: Sampling rate of the waveform in Hz.
        n_steps: Number of semitones to shift (negative lowers the pitch).
            Defaults to 6, matching the value previously hard-coded here.

    Returns:
        The pitch-shifted waveform.
    """
    return librosa.effects.pitch_shift(audio_data, sr=sr, n_steps=n_steps)
def plot_signals_and_spectra(original_audio, enhanced_audio, sr):
    """Show waveform and log-frequency spectrogram for the original and pitch-shifted signals."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (enhanced_audio, 'Pitch Shifted Audio Signal', 'Pitch Shifted Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB on a log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spectrum_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spectrum_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: load a sample clip, pitch-shift it, and visualize the effect.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    waveform, sample_rate = librosa.load(audio_path, sr=None)
    plot_signals_and_spectra(waveform, pitch_shift(waveform, sample_rate), sample_rate)
3. 带通滤波
带通滤波是一种常用的信号处理技术,用于保留特定频率范围内的信号,同时抑制其他频率。通过随机选择低频和高频截止频率,我们可以模拟不同的环境和设备特性。
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import random
from scipy.signal import butter, lfilter
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    Args:
        lowcut: Lower cutoff frequency in Hz; must be > 0.
        highcut: Upper cutoff frequency in Hz; must be < fs / 2.
        fs: Sampling rate in Hz.
        order: Filter order (default 5).

    Returns:
        Tuple (b, a) of numerator/denominator filter coefficients.

    Raises:
        ValueError: If the cutoffs fall outside (0, Nyquist). Normalized
            frequencies outside (0, 1) make the filter design unstable.
    """
    nyq = 0.5 * fs
    # Validate the cutoffs (consistent with the equalizer section's version of
    # this helper): butter() needs normalized frequencies strictly in (0, 1).
    if lowcut <= 0 or highcut >= nyq:
        raise ValueError("Lowcut and highcut must be in the range (0, Nyquist frequency).")
    normal_lowcut = lowcut / nyq
    normal_highcut = highcut / nyq
    b, a = butter(order, [normal_lowcut, normal_highcut], btype='band', analog=False)
    return b, a
def bandpass_filter_audio(audio_data, sr, lowcut, highcut):
    """Band-pass filter a signal, keeping content between lowcut and highcut (Hz)."""
    numerator, denominator = butter_bandpass(lowcut, highcut, sr)
    return lfilter(numerator, denominator, audio_data)
def plot_signals_and_spectra(original_audio, filtered_audio, sr):
    """Show waveform and log-frequency spectrogram for the original and filtered signals."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (filtered_audio, 'Filtered Audio Signal', 'Filtered Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB on a log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spectrum_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spectrum_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: band-pass a sample clip (300-4000 Hz) and visualize the effect.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    waveform, sample_rate = librosa.load(audio_path, sr=None)
    result = bandpass_filter_audio(waveform, sample_rate, lowcut=300, highcut=4000)
    plot_signals_and_spectra(waveform, result, sample_rate)
4. 均衡器
均衡器用于调整音频信号中不同频段的增益。通过定义多个频段及其增益,我们可以增强或削弱特定频率范围的音频信号,从而改善音质。
定义了多个频段,每个频段都有一个低截止频率(lowcut)、高截止频率(highcut)和增益(gain)。
频段的划分应确保下界(lowcut)不太靠近 20 Hz,上界(highcut)不太靠近采样频率的一半(fs/2),以避免滤波器设计中的不稳定性。
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from scipy.signal import butter, lfilter
import random
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Build Butterworth band-pass coefficients (b, a) for the given cutoffs in Hz.

    Raises ValueError when either cutoff falls outside (0, Nyquist), since
    normalized frequencies outside (0, 1) make the design unstable.
    """
    nyquist = 0.5 * fs
    if lowcut <= 0 or highcut >= nyquist:
        raise ValueError("Lowcut and highcut must be in the range (0, Nyquist frequency).")
    band_edges = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band_edges, btype='band', analog=False)
def equalizer(audio_data, sr):
    """Apply a fixed multi-band equalizer by summing gain-weighted band-pass outputs.

    Args:
        audio_data: 1-D waveform array with values expected in [-1, 1].
        sr: Sampling rate in Hz.

    Returns:
        The equalized waveform, clipped to [-1, 1].

    Raises:
        ValueError: If the input contains NaN or Inf values.

    Each band is (lowcut_hz, highcut_hz, gain). The table was tuned for 16 kHz
    audio (top band ends at 7995 Hz, just under that Nyquist); bands are
    clamped or skipped against the actual Nyquist so other sample rates work
    instead of raising. Behavior at 16 kHz is unchanged.
    """
    bands = [
        (60, 200, 0.8),
        (200, 500, 1.5),
        (500, 1000, 1.2),
        (1000, 2000, 1),
        (2000, 4000, 1.5),
        (4000, 7995, 0.5),
    ]
    if np.any(np.isnan(audio_data)) or np.any(np.isinf(audio_data)):
        raise ValueError("Input audio data contains NaN or Inf values.")
    nyq = 0.5 * sr
    output = np.zeros_like(audio_data)
    for lowcut, highcut, gain in bands:
        # Skip bands entirely above the representable range for this rate.
        if lowcut >= nyq:
            continue
        # Pull the upper edge just below Nyquist when it would be invalid.
        if highcut >= nyq:
            highcut = nyq * 0.999
        b, a = butter_bandpass(lowcut, highcut, sr)
        output += lfilter(b, a, audio_data) * gain
    # Limit the summed output to the valid amplitude range.
    return np.clip(output, -1.0, 1.0)
def plot_signals_and_spectra(original_audio, equalized_audio, sr):
    """Show waveform and log-frequency spectrogram for the original and equalized signals."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (equalized_audio, 'Equalized Audio Signal', 'Equalized Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB on a log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spectrum_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spectrum_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: equalize a sample clip and visualize the effect.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    waveform, sample_rate = librosa.load(audio_path, sr=None)
    plot_signals_and_spectra(waveform, equalizer(waveform, sample_rate), sample_rate)
5. 冲激响应处理与添加回声、延迟
通过冲激响应处理,我们可以模拟房间或设备的声学特性。结合归一化、填充和延迟处理,我们可以生成具有特定声学特性的音频信号。以下是相关函数的实现:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from scipy.signal import fftconvolve, correlate
def apply_impulse_response(audio, ir):
    """Convolve a signal with an impulse response, truncated to the input length.

    Multi-channel inputs (for either argument) are reduced to their first
    channel before convolution.
    """
    mono_audio = audio[0] if audio.ndim >= 2 else audio
    mono_ir = ir[0] if ir.ndim >= 2 else ir
    # FFT-based convolution; keep only the first len(audio) samples so the
    # output length matches the dry signal.
    convolved = fftconvolve(mono_audio, mono_ir, mode='full')
    return convolved[:len(mono_audio)]
def normalize_audio(audio):
    """Peak-normalize a signal to [-1, 1].

    Args:
        audio: Waveform array.

    Returns:
        The signal scaled so its maximum absolute value is 1. A silent
        (all-zero) input is returned unchanged instead of dividing by zero,
        which previously produced NaN/Inf.
    """
    peak = np.max(np.abs(audio))
    if peak == 0:
        return audio
    return audio / peak
def pad_audio(original_audio, processed_audio, pad_samples):
    """Zero-pad both signals so the processed one is shifted by pad_samples.

    A positive pad_samples delays the processed signal (zeros prepended to it,
    appended to the original); a non-positive value advances it instead.
    Returns the padded (original, processed) pair.
    """
    shift = abs(pad_samples)
    if pad_samples > 0:
        padded_original = np.pad(original_audio, (0, shift), mode='constant')
        padded_processed = np.pad(processed_audio, (shift, 0), mode='constant')
    else:
        padded_original = np.pad(original_audio, (shift, 0), mode='constant')
        padded_processed = np.pad(processed_audio, (0, shift), mode='constant')
    return padded_original, padded_processed
def calculate_delay(original_audio, processed_audio):
    """Estimate the lag (in samples) of processed_audio relative to original_audio.

    Args:
        original_audio: Reference waveform.
        processed_audio: Waveform whose delay against the reference is sought.

    Returns:
        The lag at the cross-correlation peak; positive means processed_audio
        trails original_audio.
    """
    correlation = correlate(processed_audio, original_audio)
    # For correlate(x, y, 'full') the lag of x relative to y is
    # argmax - (len(y) - 1). The previous code subtracted len(x) - 1, which is
    # only correct when both signals happen to have equal length.
    delay_samples = int(np.argmax(correlation)) - (len(original_audio) - 1)
    return delay_samples
def add_reverb_with_delay(audio_data, rir_data, delay_samples):
    """Convolve audio with a room impulse response and align it to a target delay.

    Args:
        audio_data: Dry input waveform.
        rir_data: Room impulse response; peak-normalized before convolution.
        delay_samples: Desired lag (in samples) of the processed signal
            relative to the original.

    Returns:
        Tuple (original_padded, processed_padded): equal-length signals padded
        so the measured cross-correlation delay matches delay_samples.
    """
    rir_data = normalize_audio(rir_data)
    processed_audio = apply_impulse_response(audio_data, rir_data)
    processed_audio = normalize_audio(processed_audio)
    # Delay already introduced by the impulse response's onset.
    ori_delay = calculate_delay(audio_data, processed_audio)
    # Remaining padding needed to reach the requested total delay (may be
    # negative, in which case pad_audio shifts the other way).
    pad_samples = delay_samples - ori_delay
    original_audio_padded, processed_audio_padded = pad_audio(audio_data, processed_audio, pad_samples)
    # Re-measure after padding as a sanity check.
    final_delay = calculate_delay(original_audio_padded, processed_audio_padded)
    # NOTE(review): dividing by 16 assumes a 16 kHz sample rate
    # (16 samples per ms) -- confirm against the loaded audio's actual rate.
    print(f"Final delay: {final_delay / 16:.2f} ms")
    return original_audio_padded, processed_audio_padded
def plot_signals_and_spectra(original_audio, processed_audio, sr):
    """Show waveform and log-frequency spectrogram for the original and reverberated signals."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (processed_audio, 'Processed Audio Signal with Reverb', 'Processed Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB on a log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spectrum_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spectrum_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: convolve a clip with a room impulse response and add a fixed delay.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    rir_path = '/Volumes/T9/DATA/构建数据集/SELE/rir/large/SLR28_large_Room001-00001.wav'
    waveform, sample_rate = librosa.load(audio_path, sr=None)
    impulse_response, _ = librosa.load(rir_path, sr=None)
    target_delay = 8000  # target delay in samples
    waveform, wet_signal = add_reverb_with_delay(waveform, impulse_response, target_delay)
    plot_signals_and_spectra(waveform, wet_signal, sample_rate)
6. 非线性模拟
非线性模拟用于模拟扬声器或其他设备的非线性特性。通过应用硬剪辑和非线性变换,我们可以生成更真实的音频信号。
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
def simulate_nonlinearity(audio_data):
    """Simulate loudspeaker-style nonlinear distortion on a waveform.

    The signal is hard-clipped, passed through a quadratic pre-shaping term,
    then modulated by a randomly parameterized logistic gain curve, so
    repeated calls produce slightly different distortion.
    """
    # Hard clip to +/-0.3 to emulate amplitude saturation.
    clip_max = 0.3
    clipped_signal = np.clip(audio_data, -clip_max, clip_max)
    # Quadratic pre-shaping of the clipped signal.
    b = 1.5 * clipped_signal - 0.3 * clipped_signal ** 2
    # Overall strength of the nonlinear gain, drawn per call.
    gamma = np.random.uniform(0.15, 0.3)
    # NOTE(review): np.any(b > 0) branches on the whole array, not per sample;
    # for typical audio (containing both polarities) the first branch is taken
    # almost always -- confirm this is the intended behavior.
    if np.any(b > 0):
        a = np.random.uniform(0.05, 0.45)
    else:
        a = np.random.uniform(0.1, 0.4)
    # Logistic (sigmoid) gain curve scaled by gamma.
    nonlinear_signal = gamma * (2 / (1 + np.exp(-a * b)))
    return nonlinear_signal * clipped_signal
def plot_signals_and_spectra(original_audio, processed_audio, sr):
    """Show waveform and log-frequency spectrogram for the original and distorted signals."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (processed_audio, 'Processed Audio Signal with Nonlinearity', 'Processed Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB on a log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spectrum_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spectrum_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: apply the nonlinear distortion model and visualize the effect.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    waveform, sample_rate = librosa.load(audio_path, sr=None)
    plot_signals_and_spectra(waveform, simulate_nonlinearity(waveform), sample_rate)