2024-09-05 12:45:40 +08:00

55 lines
2.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import scipy.io.wavfile as wav
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import ipdb;
# Load the audio file.
filename = "./tang1.wav"
sample_rate, sound_array = wav.read(filename)
# Multi-channel recordings: analyse the first channel only.
if sound_array.ndim != 1:
    sound_array = sound_array[:, 0]
# Convert to float BEFORE taking magnitudes: on integer PCM data
# np.abs(-32768) overflows for int16 and would corrupt the peak estimate.
sound_array = sound_array.astype(np.float64)
# Peak-normalise to [-1, 1]. Empty or completely silent input has no peak to
# normalise by, so it is left unchanged instead of producing NaNs or raising.
peak = np.max(np.abs(sound_array)) if sound_array.size else 0.0
if peak > 0:
    sound_array = sound_array / peak
# Analysis uses consecutive, non-overlapping 10 ms frames.
frame_length = int(sample_rate * 0.01)
num_frames = len(sound_array) // frame_length
# Per-frame autocorrelation for lags 0 .. frame_length - 1, plus a copy in
# which lags outside the pitch search band are masked out.
autocorrelation = np.zeros((num_frames, frame_length))
autocorrelation_of_candidates = np.zeros((num_frames, frame_length))
# Pitch search band is 80-400 Hz, i.e. lags from sample_rate / 400 up to
# sample_rate / 80 samples, clipped to the lags available in one frame.
min_peak_threshold = min(sample_rate // 400, frame_length)
max_peak_threshold = min(sample_rate // 80, frame_length)
# View the signal as a (num_frames, frame_length) matrix of consecutive,
# non-overlapping frames; trailing samples that do not fill a frame are dropped.
frames = sound_array[: num_frames * frame_length].reshape(num_frames, frame_length)
zero_lag_index = frame_length - 1  # position of lag 0 in the 'full' output
for n, frame in enumerate(frames):
    # Keep only the non-negative lags of the frame's autocorrelation.
    full_correlation = signal.correlate(frame, frame, mode='full')
    autocorrelation[n, :] = full_correlation[zero_lag_index:]
# The pitch band is 80-400 Hz, so the pitch period (lag) ranges from
# sample_rate / 400 to sample_rate / 80 samples.
# Ideally only autocorrelation peaks would be pitch-period candidates, but
# local maxima are hard to detect reliably and a frame holds few samples, so
# every lag inside the band is treated as a candidate.
# Lags outside the band get a very small autocorrelation value so that the
# search is discouraged from selecting them.
autocorrelation_of_candidates[:, :] = -30.0
autocorrelation_of_candidates[:, min_peak_threshold:max_peak_threshold] = (
    autocorrelation[:, min_peak_threshold:max_peak_threshold]
)
# Viterbi-style dynamic programming over pitch-period (lag) candidates.
# The local cost of a lag is its negated autocorrelation, so strongly periodic
# lags are cheap. The masked array is used so that lags outside the search
# band carry a penalty instead of their raw autocorrelation.
dist = -autocorrelation_of_candidates
# Only lags inside the search band are legal states. Lag 0 is excluded
# explicitly because it has no finite frequency (sample_rate / 0).
candidate_lags = np.arange(max(min_peak_threshold, 1), max_peak_threshold)
candidate_f0 = sample_rate / candidate_lags
# cost[n, j]: cheapest accumulated cost of any path ending at lag j in frame n.
# Non-candidate lags stay at +inf so that neither the recursion nor a later
# argmin over a cost row can select them.
cost = np.full((num_frames, frame_length), np.inf)
# path[n, j]: lag chosen in frame n - 1 on the cheapest path reaching (n, j).
path = np.zeros((num_frames, frame_length), dtype=np.intp)
if num_frames > 0:
    # Seed the recursion with the first frame's own evidence; otherwise the
    # first frame would contribute nothing to the chosen path.
    cost[0, candidate_lags] = dist[0, candidate_lags]
for n in range(num_frames - 1):
    previous_cost = cost[n, candidate_lags]
    for j in candidate_lags:
        # Transition penalty: absolute jump in f0 (Hz) between the previous
        # frame's candidate lags and lag j (f0 = sample_rate / lag).
        accumulated = previous_cost + np.abs(candidate_f0 - sample_rate / j)
        best = np.argmin(accumulated)
        cost[n + 1, j] = dist[n + 1, j] + accumulated[best]
        path[n + 1, j] = candidate_lags[best]
# Backtrack the cheapest path to recover one pitch period (lag) per frame.
l_hat = np.zeros(num_frames, dtype=np.int32)
if num_frames > 0:
    # Audio shorter than one frame yields no frames, so there is nothing to
    # backtrack; indexing the last frame would raise IndexError in that case.
    l_hat[-1] = np.argmin(cost[-1, :])
    for n in range(num_frames - 2, -1, -1):
        # path[n + 1, j] holds the lag of frame n that leads to lag j in
        # frame n + 1 on the cheapest path.
        l_hat[n] = path[n + 1, l_hat[n + 1]]
# Convert lags (samples) to fundamental frequency (Hz). A lag of 0 has no
# finite frequency; such frames are reported as 0 Hz instead of dividing by
# zero.
f0 = np.divide(sample_rate, l_hat, out=np.zeros(num_frames), where=l_hat > 0)