import scipy.io.wavfile as wav
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import ipdb;

# Read the audio file: wav.read returns the sampling rate and the sample array.
filename = "./tang1.wav"
sample_rate, sound_array = wav.read(filename)
# Multi-dimensional data: keep only index 0 along the second axis
# (the first channel for SciPy's (num_samples, num_channels) layout).
sound_array = sound_array.T[0, :] if sound_array.ndim != 1 else sound_array
# Convert to float BEFORE taking the magnitude: on fixed-width integer PCM,
# np.abs of the most negative value (e.g. int16 -32768) overflows and stays
# negative, which would under-estimate the peak.
sound_array = sound_array.astype(np.float64)
peak = np.max(np.abs(sound_array)) if sound_array.size else 0.0
# Normalize to a peak magnitude of 1; an all-zero (or empty) recording is left
# untouched instead of being divided by zero.
if peak > 0:
    sound_array = sound_array / peak

# Cut the signal into consecutive, non-overlapping frames of 0.01 s each;
# trailing samples that do not fill a whole frame are ignored.
frame_length = int(sample_rate * 0.01)
num_frames = len(sound_array) // frame_length

# The fundamental-frequency search range is 80-400 Hz, so the pitch period
# (i.e. the lag) is at least sample_rate/400 and at most sample_rate/80
# samples. Both bounds are clipped to the frame length.
min_peak_threshold = min(sample_rate // 400, frame_length)
max_peak_threshold = min(sample_rate // 80, frame_length)

# Row n holds the autocorrelation of frame n for lags 0 .. frame_length - 1.
autocorrelation = np.zeros((num_frames, frame_length))

# Ideally only autocorrelation peaks would be pitch-period candidates, but
# local maxima are hard to identify reliably and a frame has few points, so
# every lag inside the threshold range is treated as a candidate. Lags outside
# the range keep a very small value (-30.0) so that they are not selected.
autocorrelation_of_candidates = np.full((num_frames, frame_length), -30.0)

for n in range(num_frames):
    start = n * frame_length
    frame = sound_array[start:start + frame_length]
    # 'full' mode yields lags -(frame_length-1) .. frame_length-1; keep lags >= 0.
    full_corr = signal.correlate(frame, frame, mode='full')
    autocorrelation[n, :] = full_corr[frame_length - 1:]
    autocorrelation_of_candidates[n, min_peak_threshold:max_peak_threshold] = (
        autocorrelation[n, min_peak_threshold:max_peak_threshold]
    )

# Forward pass of the dynamic-programming pitch tracker.
#
# States are the candidate lags in [min_peak_threshold, max_peak_threshold).
# Lag 0 is excluded explicitly because sample_rate / 0 is undefined.
candidate_lags = np.arange(max(min_peak_threshold, 1), max_peak_threshold)
# f0 = sample_rate / lag for every candidate lag.
candidate_f0 = sample_rate / candidate_lags

# Local cost: negated autocorrelation of the masked candidate array, so that a
# strong correlation means a low cost and non-candidate lags carry a penalty.
dist = -autocorrelation_of_candidates

# cost[n, j]: cheapest accumulated cost of any path ending at lag j in frame n.
# Non-candidate lags stay at +inf so they can never be chosen, neither as a
# predecessor nor as the end point of the best path.
cost = np.full((num_frames, frame_length), np.inf)
# path[n, j]: lag chosen in frame n - 1 on the cheapest path reaching (n, j).
path = np.zeros((num_frames, frame_length))

# Transition cost |f0_prev - f0_cur| between every pair of candidate lags
# (rows: previous lag, columns: current lag). It does not depend on the frame,
# so it is computed once.
transition = np.abs(candidate_f0[:, np.newaxis] - candidate_f0[np.newaxis, :])

if num_frames > 0 and candidate_lags.size > 0:
    # Seed the first frame with its own local cost.
    cost[0, candidate_lags] = dist[0, candidate_lags]

    for n in range(num_frames - 1):
        # total[i, j]: cost of being at candidate i in frame n, then moving to
        # candidate j in frame n + 1 (local cost of j not yet included).
        total = cost[n, candidate_lags][:, np.newaxis] + transition
        best_prev = np.argmin(total, axis=0)
        cost[n + 1, candidate_lags] = dist[n + 1, candidate_lags] + total.min(axis=0)
        path[n + 1, candidate_lags] = candidate_lags[best_prev]

# Backtracking: recover the cheapest lag sequence from the DP tables.
# l_hat[n] is the selected lag (pitch period in samples) for frame n.
l_hat = np.zeros(num_frames, dtype=np.int32)

# With no complete frame there is nothing to backtrack; indexing row
# num_frames - 1 of an empty table would raise IndexError.
if num_frames > 0:
    # The path ends at the lag with the lowest accumulated cost in the last frame.
    l_hat[num_frames - 1] = np.argmin(cost[num_frames - 1, :])

    # Follow the stored predecessors from the last frame back to the first.
    for n in range(num_frames - 2, -1, -1):
        l_hat[n] = path[n + 1, l_hat[n + 1]]

# Convert each lag (samples per period) into a frequency in Hz.
# A lag of 0 would give an infinite frequency here (NumPy emits a
# divide-by-zero RuntimeWarning).
f0 = sample_rate / l_hat