"""Estimate a per-frame pitch (F0) track from a WAV file.

The signal is cut into 10 ms frames, each frame is autocorrelated, and a
Viterbi-style dynamic programme picks one lag (pitch period, in samples) per
frame.  The local cost of a lag is the negated autocorrelation at that lag and
the transition cost between consecutive frames is the absolute F0 difference
in Hz.

Module-level results:
    autocorrelation                 (num_frames, frame_length) autocorrelation
    autocorrelation_of_candidates   same, with non-candidate lags set to -30.0
    dist, cost, path                local cost, accumulated cost, back-pointers
    l_hat                           chosen lag per frame (samples)
    f0                              sample_rate / l_hat (Hz)
"""
import scipy.io.wavfile as wav
from scipy import signal
import numpy as np
# NOTE(review): the three imports below are not used in this section; they are
# kept in case code outside this view relies on them.
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import ipdb

# Read the audio file.
filename = "./tang1.wav"
sample_rate, sound_array = wav.read(filename)

# Keep only the first channel of multi-channel audio.
sound_array = sound_array.T[0, :] if sound_array.ndim != 1 else sound_array

# Normalise to [-1, 1].  Convert to float first: np.abs on integer PCM wraps
# around for the most negative value (e.g. int16 -32768), which would corrupt
# the peak estimate.  Skip the division for silent/empty audio to avoid NaNs.
sound_array = sound_array.astype(np.float64)
peak = np.max(np.abs(sound_array)) if sound_array.size else 0.0
if peak > 0:
    sound_array = sound_array / peak

# 10 ms analysis frames.
frame_length = int(sample_rate * 0.01)

# The nominal F0 range is 80-400 Hz, so the pitch period (the lag) lies between
# sample_rate / 400 and sample_rate / 80 samples.  Both bounds are clipped to
# the frame length because a frame only provides lags 0 .. frame_length - 1;
# with 10 ms frames this means the lowest F0 that can actually be selected is
# roughly 100 Hz rather than 80 Hz.
min_peak_threshold = min(sample_rate // 400, frame_length)
max_peak_threshold = min(sample_rate // 80, frame_length)
if not 1 <= min_peak_threshold < max_peak_threshold:
    raise ValueError(
        f"sample rate {sample_rate} Hz is too low for 10 ms frames and an "
        "80-400 Hz pitch range"
    )

# Trailing samples that do not fill a whole frame are dropped.
num_frames = len(sound_array) // frame_length
frames = sound_array[: num_frames * frame_length].reshape(num_frames, frame_length)

# Row n holds the autocorrelation of frame n at lags 0 .. frame_length - 1
# (the non-negative-lag half of the full correlation).
autocorrelation = np.zeros((num_frames, frame_length))
for n, frame in enumerate(frames):
    autocorrelation[n, :] = signal.correlate(frame, frame, mode='full')[frame_length - 1:]

# Ideally only the lags of autocorrelation peaks would be pitch-period
# candidates, but peaks (local maxima) are hard to identify reliably and a
# frame contains few points, so every lag inside the allowed range is treated
# as a candidate.  Lags outside the range are set to a very small value so they
# look unattractive.
autocorrelation_of_candidates = np.full((num_frames, frame_length), -30.0)
autocorrelation_of_candidates[:, min_peak_threshold:max_peak_threshold] = (
    autocorrelation[:, min_peak_threshold:max_peak_threshold]
)

# Local cost: the masked autocorrelation, negated (strong correlation = cheap).
dist = -autocorrelation_of_candidates

# Only candidate lags take part in the search.  This keeps lag 0 (a division by
# zero when converted to a frequency) and every other out-of-range lag from
# ever being selected, independent of how the -30.0 fill value compares with
# the actual autocorrelation values.
candidate_lags = np.arange(min_peak_threshold, max_peak_threshold)
candidate_f0 = sample_rate / candidate_lags
# transition_cost[i, j] = |F0 of previous lag i - F0 of current lag j|.
# It does not depend on the frame, so it is computed once.
transition_cost = np.abs(candidate_f0[:, np.newaxis] - candidate_f0[np.newaxis, :])
column = np.arange(candidate_lags.size)

# cost[n, l]: cheapest accumulated cost of any path ending at lag l in frame n
# (inf for non-candidate lags).  path[n, l]: the lag chosen in frame n - 1 on
# that cheapest path.
cost = np.full((num_frames, frame_length), np.inf)
path = np.zeros((num_frames, frame_length), dtype=np.intp)
if num_frames > 0:
    # The first frame has no predecessor: its cost is its own local cost.
    cost[0, candidate_lags] = dist[0, candidate_lags]
for n in range(1, num_frames):
    # accumulated[i, j]: cost of reaching current lag j via previous lag i.
    accumulated = cost[n - 1, candidate_lags][:, np.newaxis] + transition_cost
    best_previous = np.argmin(accumulated, axis=0)
    cost[n, candidate_lags] = dist[n, candidate_lags] + accumulated[best_previous, column]
    path[n, candidate_lags] = candidate_lags[best_previous]

# Backtrack from the cheapest final lag to recover the lag of every frame.
l_hat = np.zeros(num_frames, dtype=np.int32)
if num_frames > 0:
    l_hat[num_frames - 1] = np.argmin(cost[num_frames - 1, :])
    for n in range(num_frames - 2, -1, -1):
        l_hat[n] = path[n + 1, l_hat[n + 1]]

# f0 = sample_rate / lag; every selected lag is a candidate, hence non-zero.
f0 = sample_rate / l_hat