"""Short-time energy / zero-crossing-rate analysis of a speech recording.

The module frames a waveform, computes the per-frame energy and
zero-crossing rate, detects speech boundaries with a small state machine
and plots the result.
"""

from typing import Optional  # noqa: F401  (retained; no longer used below)

import numpy as np
import scipy.io.wavfile as wav

try:  # Debugging aid only: the module must stay importable without it.
    import ipdb  # noqa: F401
except ImportError:
    ipdb = None

# States of the boundary detector used by ``_detect_boundaries``.
_SILENCE = 1
_ONSET = 2
_VOICED = 3


def hamming(frame_length: int) -> np.ndarray:
    """Return a symmetric Hamming window with ``frame_length`` samples.

    ``w[n] = 0.54 - 0.46 * cos(2 * pi * n / (frame_length - 1))``

    A window of length 1 is ``[1.0]`` (the formula would divide by zero),
    and a non-positive length yields an empty array.
    """
    if frame_length < 1:
        return np.zeros(0)
    if frame_length == 1:
        return np.ones(1)
    n = np.arange(frame_length)
    return 0.54 - 0.46 * np.cos(2 * np.pi * n / (frame_length - 1))


def delta_sgn(x: np.ndarray) -> np.ndarray:
    """Return the sign of ``x`` with a dead zone around zero.

    The dead-zone half-width is ``max(|x|) / 20``: samples whose magnitude
    does not exceed it map to 0, all others map to +1 / -1. An empty input
    yields an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        return np.zeros(0)
    threshold = np.max(np.abs(x)) / 20
    return np.where(np.abs(x) > threshold, np.sign(x), 0.0)


def _frame_signal(x: np.ndarray, frame_len: int, inc: int) -> np.ndarray:
    """Split ``x`` into complete frames; result has shape (n_frames, frame_len).

    Frames start every ``inc`` samples. The last complete frame (the one
    ending exactly at the end of ``x``) is included, and a signal shorter
    than ``frame_len`` yields an array of shape ``(0, frame_len)``.
    """
    if frame_len <= 0 or inc <= 0:
        raise ValueError("FrameLen and inc must be positive integers")
    x = np.asarray(x)
    starts = np.arange(0, len(x) - frame_len + 1, inc)
    # Broadcasting builds the (n_frames, frame_len) index matrix in one go.
    return x[starts[:, None] + np.arange(frame_len)]


def _zero_crossing_rate(signs: np.ndarray, frame_len: int, inc: int) -> np.ndarray:
    """Per-frame mean of ``|signs[k + 1] - signs[k]| / 2``."""
    signs = np.asarray(signs)
    crossings = np.abs(signs[1:] - signs[:-1])
    frames = _frame_signal(crossings, frame_len, inc)
    return frames.sum(axis=1) / (2 * frame_len)


def ampf(x: np.ndarray, FrameLen: int = 128, inc: int = 90) -> np.ndarray:
    """Short-time energy of ``x``.

    Args:
        x: time-domain speech signal.
        FrameLen: length of each frame in samples.
        inc: hop size between consecutive frames in samples.

    Returns:
        One value per frame: ``sum((frame * hamming) ** 2) / FrameLen``.
        Empty when ``x`` holds no complete frame.
    """
    # Work in float so that squaring integer PCM samples cannot overflow.
    frames = _frame_signal(np.asarray(x, dtype=float), FrameLen, inc)
    window = hamming(frame_length=FrameLen)
    return np.dot(frames**2, window**2) / FrameLen


def zcrf(x: np.ndarray, FrameLen: int = 128, inc: int = 90) -> np.ndarray:
    """Short-time zero-crossing rate of ``x`` based on the plain sign.

    Args:
        x: time-domain speech signal.
        FrameLen: length of each frame in samples.
        inc: hop size between consecutive frames in samples.

    Returns:
        One value per frame; empty when there is no complete frame.
    """
    return _zero_crossing_rate(np.sign(x), FrameLen, inc)


def zcrf_delta(x: np.ndarray, FrameLen: int = 128, inc: int = 90) -> np.ndarray:
    """Short-time zero-crossing rate of ``x`` based on ``delta_sgn``.

    Same as ``zcrf`` except that samples inside the dead zone of
    ``delta_sgn`` count as zero, so low-level noise around zero does not
    register as crossings.

    Args:
        x: time-domain speech signal.
        FrameLen: length of each frame in samples.
        inc: hop size between consecutive frames in samples.

    Returns:
        One value per frame; empty when there is no complete frame.
    """
    return _zero_crossing_rate(delta_sgn(x), FrameLen, inc)


def _detect_boundaries(amp, zcr, amp_threshold, zcr_threshold, min_run=6):
    """Run the boundary state machine over the per-frame features.

    A transition requires the condition to hold for ``min_run`` consecutive
    frames starting at the current frame.

    Returns:
        ``(starts, ends, voiced)`` lists of frame indices: where the ZCR
        rises above its threshold, where both features fall below their
        thresholds again, and where the energy rises above its threshold
        after a start.
    """
    starts, ends, voiced = [], [], []
    state = _SILENCE
    for i in range(min_run, len(amp) - min_run):
        amp_run = amp[i : i + min_run]
        zcr_run = zcr[i : i + min_run]
        if state == _SILENCE:
            if np.all(zcr_run > zcr_threshold):
                starts.append(i)
                state = _ONSET
        elif state == _ONSET:
            if np.all(amp_run > amp_threshold):
                voiced.append(i)
                state = _VOICED
        # Checked on every frame, including the one that just left silence.
        if (
            state != _SILENCE
            and np.all(amp_run < amp_threshold)
            and np.all(zcr_run < zcr_threshold)
        ):
            ends.append(i)
            state = _SILENCE
    return starts, ends, voiced


def _draw_boundaries(ax, boundary_groups):
    """Draw thin dashed vertical lines on ``ax`` for each (positions, color)."""
    for positions, color in boundary_groups:
        for position in positions:
            ax.axvline(x=position, color=color, linestyle="--", linewidth=0.5)


def analyze_sound(filename: str, FrameLen: int = 128, inc: int = 90) -> None:
    """Plot waveform, short-time energy and ZCR of a WAV file with boundaries.

    Args:
        filename: path of the WAV file; only the first channel is analysed.
        FrameLen: length of each frame in samples.
        inc: hop size between consecutive frames in samples.

    Raises:
        ValueError: if the recording is too short to hold a single frame.
    """
    # Imported lazily so the numeric helpers work without a plotting backend.
    import matplotlib.pyplot as plt

    _sample_rate, sound_array = wav.read(filename)
    if sound_array.ndim != 1:
        sound_array = sound_array[:, 0]

    # Convert before taking |x|: abs() of the most negative integer PCM value
    # overflows in its own dtype and would corrupt the peak estimate.
    sound_array = sound_array.astype(float)
    peak = np.max(np.abs(sound_array)) if sound_array.size else 0.0
    if peak > 0:  # leave an all-zero recording untouched instead of 0 / 0
        sound_array = sound_array / peak

    amp = ampf(sound_array, FrameLen, inc)
    zcr = zcrf_delta(sound_array, FrameLen, inc)

    # The ZCR is computed on the differenced signal (one sample shorter), so
    # it may have one frame fewer than the energy; align both for plotting.
    num_frames = min(len(amp), len(zcr))
    if num_frames == 0:
        raise ValueError(
            f"{filename!r} is too short for a frame of {FrameLen} samples"
        )
    amp, zcr = amp[:num_frames], zcr[:num_frames]

    rescale_rate = len(sound_array) / num_frames
    frame_time = np.arange(num_frames) * rescale_rate

    # Boundary detection.
    amp2 = np.min(amp) + (np.max(amp) - np.min(amp)) / 20
    zcr2 = np.min(zcr) + (np.max(zcr) - np.min(zcr)) / 18
    starts, ends, voiced = _detect_boundaries(amp, zcr, amp2, zcr2, min_run=6)
    boundary_groups = [
        ([i * rescale_rate for i in starts], "r"),
        ([i * rescale_rate for i in ends], "b"),
        ([i * rescale_rate for i in voiced], "g"),
    ]

    # Waveform, short-time energy and short-time zero-crossing rate.
    _fig, (ax_wave, ax_energy, ax_zcr) = plt.subplots(3, 1, figsize=(12, 8))

    ax_wave.plot(sound_array)
    ax_wave.set_title("Waveform")
    _draw_boundaries(ax_wave, boundary_groups)

    ax_energy.plot(frame_time, amp, label="Energy")
    ax_energy.axhline(y=amp2, color="r", linestyle="--", label="Energy Threshold")
    ax_energy.legend()
    ax_energy.set_title("Short-time Energy")
    _draw_boundaries(ax_energy, boundary_groups)

    ax_zcr.plot(frame_time, zcr, label="Zero Crossing Rate")
    ax_zcr.axhline(y=zcr2, color="r", linestyle="--", label="ZCR Threshold")
    ax_zcr.legend()
    ax_zcr.set_title("Short-time Zero Crossing Rate")
    # Speech endpoints and unvoiced/voiced boundaries.
    _draw_boundaries(ax_zcr, boundary_groups)

    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    analyze_sound("tang1.wav", FrameLen=128, inc=90)