python 音频处理（1）——重采样、音高提取

link管理

链接快照平台

输入网页链接，自动生成快照
标签化管理网页链接

使用 `torchaudio` 进行重采样（gpu版）：

有了上面cpu的基础，其实调用gpu也就更换一下设备，和放入gpu的操作就好了，因此不过多赘述

def resample_use_cuda():
    """Resample an audio file to 32 kHz with torchaudio, on the GPU if available.

    Prompts for the input path on stdin, writes the resampled audio to
    ``test.mp3`` in the current directory and prints the elapsed processing
    time. Falls back to the CPU when CUDA is not available.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Ask for the path BEFORE starting the timer, so the reported cost does
    # not include the time the user spends typing (same order as the CPU
    # and librosa variants).
    file_path = input("please input your file path:")
    start_time = time()
    y, sr = torchaudio.load(file_path)
    y = y.to(device)
    target_sample = 32000
    # The transform is moved to the same device as the waveform.
    resampler = Resample(orig_freq=sr, new_freq=target_sample).to(device)
    resampled_music = resampler(y)
    # Move the result back to the CPU before saving, otherwise saving fails.
    torchaudio.save("test.mp3", resampled_music.to('cpu'), target_sample)
    print(f"cost :{time() - start_time}s")

时间方面，单个音频多了把数据放入 gpu、再从 gpu 取出的步骤，肯定会稍慢一些；但跑过 cuda 的都知道它的强大，它的优势更多体现在后续的操作上。

使用librosa库进行重采样

具体步骤：

导入两个库文件， librosa 和音频文件读写库 soundfile

import librosa
import soundfile as sf
from time import time#仅计算时间，不影响主体

导入音频文件

设定目标采样率

综合封装成函数：

def resample_by_lisa():
    """Resample an audio file to 32 kHz with librosa and save it with soundfile.

    Prompts for the input path on stdin, writes the resampled audio to
    ``test_lisa.mp3`` in the current directory and prints the elapsed
    processing time.
    """
    file_path = input("please input your file path:")
    start_time = time()
    # sr=None keeps the file's native sampling rate. With the default
    # (sr=22050) librosa already resamples on load, so the audio would be
    # resampled twice and everything above ~11 kHz would be lost before the
    # 32 kHz conversion.
    y, sr = librosa.load(file_path, sr=None)
    target_sample_rate = 32000
    # Resample from the native rate to the target rate.
    y_32k = librosa.resample(y=y, orig_sr=sr, target_sr=target_sample_rate)
    # librosa has no writer of its own; soundfile does the file output.
    sf.write("test_lisa.mp3", data=y_32k, samplerate=target_sample_rate)
    print(f"cost :{time() - start_time}s")

优点：简单小巧， librosa 有很多能处理音频的功能

缺点：无法调用cuda，保存的时候需要依赖 soundfile 库。

时间：也是几秒左右，和 torchaudio cpu版差不多

小声bb：提取32k的效果好像没有torchaudio好【嘛，毕竟librosa历史有点久了，没有专注深度学习的torch好很正常啦】，你们也可以自己测一下

# Complete resampling script. The original listing started at line 8, so the
# import header is restored here from the names the functions use.
from time import time

import librosa
import soundfile as sf
import torch
import torchaudio
from torchaudio.transforms import Resample


def resample_by_cpu():
    """Resample an audio file to 32 kHz on the CPU with torchaudio.

    Prompts for the input path on stdin, writes the result to ``test.mp3``
    and prints the elapsed processing time.
    """
    file_path = input("please input your file path:")
    start_time = time()
    y, sr = torchaudio.load(file_path)  # load the audio with torchaudio.load

    target_sample = 32000  # target sampling rate
    # Build the resampling transform from the original and target rates.
    resampler = Resample(orig_freq=sr, new_freq=target_sample)
    resampled_music = resampler(y)

    torchaudio.save("test.mp3", resampled_music, target_sample)
    print(f"cost :{time() - start_time}s")


def resample_use_cuda():
    """Resample an audio file to 32 kHz with torchaudio, on the GPU if available.

    Prompts for the input path on stdin, writes the result to ``test.mp3``
    and prints the elapsed processing time. Falls back to the CPU when CUDA
    is not available.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Prompt before starting the timer so typing time is not measured.
    file_path = input("please input your file path:")
    start_time = time()
    y, sr = torchaudio.load(file_path)

    y = y.to(device)
    target_sample = 32000
    resampler = Resample(orig_freq=sr, new_freq=target_sample).to(device)
    resampled_music = resampler(y)
    # Move the result back to the CPU before saving.
    torchaudio.save("test.mp3", resampled_music.to('cpu'), target_sample)
    print(f"cost :{time() - start_time}s")


def resample_by_lisa():
    """Resample an audio file to 32 kHz with librosa and save it with soundfile.

    Prompts for the input path on stdin, writes the result to
    ``test_lisa.mp3`` and prints the elapsed processing time.
    """
    file_path = input("please input your file path:")
    start_time = time()
    # sr=None keeps the native sampling rate; the default (22050) would
    # resample on load and the audio would be resampled twice.
    y, sr = librosa.load(file_path, sr=None)
    target_sample_rate = 32000
    # Resample to the target rate with librosa.
    y_32k = librosa.resample(y=y, orig_sr=sr, target_sr=target_sample_rate)
    # Write the file with soundfile.
    sf.write("test_lisa.mp3", data=y_32k, samplerate=target_sample_rate)
    print(f"cost :{time() - start_time}s")


if __name__ == '__main__':
    resample_use_cuda()
    resample_by_cpu()
    resample_by_lisa()

2.2 提取pitch基频特征【音高提取】

使用 `torchaudio` 进行基频特征提取

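其实主要使用的这个函数： torchaudio.transforms.PitchShift 。注意： PitchShift 的作用是变调（把音高整体升/降 n_steps 个半音），它输出的仍然是波形，并不会估计基频；如果要得到真正的基频（Hz）曲线，应使用 torchaudio.functional.detect_pitch_frequency 。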

让我们来看看它官方的 example ，仿照着来写就好啦

>>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
>>> transform = transforms.PitchShift(sample_rate, 4)
>>> waveform_shift = transform(waveform)  # (channel, time)

import torchaudio
import torchaudio.transforms as Tf
import matplotlib.pyplot as plt     #画图依赖

def get_pitch_by_torch():
    """Estimate and plot the pitch (fundamental frequency) of an audio file.

    Prompts for the file path on stdin, loads the audio with torchaudio and
    plots the per-frame pitch estimate of the first channel.

    Note: ``torchaudio.transforms.PitchShift`` transposes a waveform by
    ``n_steps`` semitones and returns another waveform; it does not
    estimate pitch. ``torchaudio.functional.detect_pitch_frequency`` is the
    function that returns one frequency value per frame.
    """
    file_path = input("file path:")
    y, sr = torchaudio.load(file_path)
    # Result has shape (channel, frame): one frequency estimate per frame.
    feature = torchaudio.functional.detect_pitch_frequency(y, sample_rate=sr)
    # Plot the pitch contour of the first channel.
    plt.figure(figsize=(16, 5))
    plt.plot(feature[0].numpy(), label='Pitch')
    plt.xlabel('Frame')
    plt.ylabel('Frequency (Hz)')
    plt.title('Pitch Estimation')
    plt.legend()
    plt.show()

输出图片【总歌曲】效果：

将输出的范围稍微改一下，切分特征的一部分，就是歌曲部分的音高特征啦，效果就很明显了

改为： plt.plot(feature[0][5000:10000].numpy(), label='Pitch')

使用librosa提取基频特征

提取基频特征

（可选）绘制基频特征

#Computing a fundamental frequency (F0) curve from an audio input
>>> y, sr = librosa.load(librosa.ex('trumpet'))
>>> f0, voiced_flag, voiced_probs = librosa.pyin(y,
...                                              sr=sr,
...                                              fmin=librosa.note_to_hz('C2'),
...                                              fmax=librosa.note_to_hz('C7'))
>>> times = librosa.times_like(f0, sr=sr)

code：

 def get_pitch_by_librosa():
     """Estimate the fundamental frequency (f0) with librosa's pYIN and plot it.

     Prompts for the file path on stdin, loads the audio with librosa,
     runs ``librosa.pyin`` over the note range C2-C7 and draws the f0 curve
     on top of the waveform.
     """
     file_path = input("请输入音频文件路径：")
     y, sr = librosa.load(file_path)
     # Extract f0 with pYIN; voiced_flag / voiced_probs are returned by
     # pyin as well but are not used in the plot.
     f0, voiced_flag, voiced_probs = librosa.pyin(
         y,
         sr=sr,
         fmin=librosa.note_to_hz('C2'),
         fmax=librosa.note_to_hz('C7'),
     )
     # Plot the result (optional).
     plt.figure(figsize=(14, 5))
     librosa.display.waveshow(y, sr=sr, alpha=0.5)
     # Pass sr explicitly so the time axis stays correct even if the audio
     # is loaded at a rate other than librosa's default.
     plt.plot(librosa.times_like(f0, sr=sr), f0, label='f0 (fundamental frequency)', color='r')
     plt.xlabel('Time (s)')
     plt.ylabel('Frequency (Hz)')
     plt.title('Pitch (fundamental frequency) Estimation')
     plt.legend()
     plt.show()

比torchaudio略微麻烦一点，不过多了两个参数 voiced_flag, voiced_probs ，看起来的视觉图好像也有些不一样，不过都是按照官方的这个来了，这也不对的话我也不会了

# Complete pitch-extraction script.
import librosa
import librosa.display
import matplotlib.pyplot as plt
import torchaudio


def get_pitch_by_torch():
    """Estimate and plot the pitch (fundamental frequency) of an audio file.

    Prompts for the file path on stdin, loads the audio with torchaudio and
    plots frames 5000-10000 of the first channel's pitch estimate.

    Note: ``torchaudio.transforms.PitchShift`` transposes a waveform and
    returns another waveform; it does not estimate pitch, so
    ``torchaudio.functional.detect_pitch_frequency`` is used instead.
    """
    file_path = input("file path:")
    y, sr = torchaudio.load(file_path)
    # Result has shape (channel, frame): one frequency estimate per frame.
    feature = torchaudio.functional.detect_pitch_frequency(y, sample_rate=sr)
    # Plot a slice of the pitch contour.
    plt.figure(figsize=(16, 5))
    plt.plot(feature[0][5000:10000].numpy(), label='Pitch')
    plt.xlabel('Frame')
    plt.ylabel('Frequency (Hz)')
    plt.title('Pitch Estimation')
    plt.legend()
    plt.show()


def get_pitch_by_librosa():
    """Estimate the fundamental frequency (f0) with librosa's pYIN and plot it.

    Prompts for the file path on stdin, loads the audio with librosa,
    runs ``librosa.pyin`` over the note range C2-C7 and draws the f0 curve
    on top of the waveform.
    """
    file_path = input("请输入音频文件路径：")
    y, sr = librosa.load(file_path)
    # Extract f0 with pYIN.
    f0, voiced_flag, voiced_probs = librosa.pyin(
        y,
        sr=sr,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C7'),
    )
    # Plot the result (optional).
    plt.figure(figsize=(14, 5))
    librosa.display.waveshow(y, sr=sr, alpha=0.5)
    # Pass sr explicitly so the time axis matches the loaded audio.
    plt.plot(librosa.times_like(f0, sr=sr), f0, label='f0 (fundamental frequency)', color='r')
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.title('Pitch (fundamental frequency) Estimation')
    plt.legend()
    plt.show()


if __name__ == '__main__':
    # Uncomment the extractor you want to run.
    # get_pitch_by_torch()
    # get_pitch_by_librosa()
    pass

使用 torchaudio 进行重采样（gpu版）：

2.2 提取pitch基频特征【音高提取】

使用 torchaudio 进行基频特征提取

使用librosa提取基频特征

后续PPG特征、 vec特征见下一章

使用 `torchaudio` 进行重采样（gpu版）：

使用 `torchaudio` 进行基频特征提取