Remove silence using VAD #

Removing silence is actually pretty hard. The traditional approach applies a fixed dB threshold over a sliding window: if the energy of a window falls below the threshold, we assume it is silence. But a threshold that works for one audio sample, say -20 dB, does not necessarily work for another.
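As a concrete illustration of that traditional approach, here is a minimal sketch (plain NumPy; the function name and defaults are our own, not part of malaya-speech) that drops every window whose RMS energy falls below a fixed dB threshold:

import numpy as np

def remove_silence_db(y, threshold_db = -20.0, window = 1024):
    # Keep only windows whose RMS energy (in dB, relative to full scale)
    # exceeds the fixed threshold.
    kept = []
    for i in range(0, len(y), window):
        frame = y[i: i + window]
        db = 20 * np.log10(np.sqrt(np.mean(frame ** 2)) + 1e-10)
        if db > threshold_db:
            kept.append(frame)
    return np.concatenate(kept) if kept else np.zeros(0)

The hard part is exactly threshold_db: a value tuned on one recording can easily cut real speech, or keep noise, on another.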

This tutorial is available as an IPython notebook at malaya-speech/example/remove-silents-vad.

This module is language independent, so it is safe to use on different languages.

This is an application of the malaya-speech Pipeline; read more at malaya-speech/example/pipeline.

import malaya_speech
import numpy as np
import librosa
import matplotlib.pyplot as plt
import IPython.display as ipd
from malaya_speech import Pipeline


def norm_mel(y, sr):
    # Log-scaled mel spectrogram, clipped to avoid log(0).
    mel = librosa.feature.melspectrogram(y, sr = sr, n_mels = 80)
    return np.log10(np.maximum(mel, 1e-10)).T


def plot(y, sr):
    # Plot the waveform on top and the mel spectrogram below it.
    mel = norm_mel(y, sr)
    fig, axs = plt.subplots(2, figsize = (10, 8))
    axs[0].plot(y)
    im = axs[1].imshow(np.rot90(mel), aspect = 'auto', interpolation = 'none')
    fig.colorbar(mappable = im, shrink = 0.65, orientation = 'horizontal', ax = axs[1])
    plt.show()

y, sr = malaya_speech.load('speech/khutbah/wadi-annuar.wav')
y = y[: sr * 15]
plot(y, sr)

If you look at the waveform or the mel spectrogram, you can see silent periods at the start, the middle and the end.

Use librosa.effects.trim #

y_ = librosa.effects.trim(y, top_db = 20)[0]

Use pydub split_on_silence #

pydub offers a similar threshold-based approach via split_on_silence:

from pydub import AudioSegment
from pydub.silence import split_on_silence

y_int = malaya_speech.astype.float_to_int(y)
audio = AudioSegment(
    y_int.tobytes(),
    frame_rate = sr,
    sample_width = y_int.dtype.itemsize,
    channels = 1,
)
audio_chunks = split_on_silence(
    audio,
    min_silence_len = 200,
    silence_thresh = -30,
    keep_silence = 100,
)
audio_chunks
[<pydub.audio_segment.AudioSegment at 0x14fb01810>,
 <pydub.audio_segment.AudioSegment at 0x14fb01950>,
 <pydub.audio_segment.AudioSegment at 0x14fb01990>,
 <pydub.audio_segment.AudioSegment at 0x14fb01dd0>,
 <pydub.audio_segment.AudioSegment at 0x14fb07490>]

y_ = sum(audio_chunks)
y_ = np.array(y_.get_array_of_samples())
y_ = malaya_speech.astype.int_to_float(y_)
ipd.Audio(y_, rate = sr)

Use WebRTC VAD #

We can also split using VAD. The nice thing about VAD is that we do not need to pick a dB threshold ourselves; the quality of the split depends only on how good the VAD model is.

vad = malaya_speech.vad.webrtc()

# WebRTC VAD operates on int PCM at specific sample rates (16 kHz here),
# so resample and convert a copy, but keep original-rate frames for reassembly.
y_ = malaya_speech.resample(y, sr, 16000)
y_ = malaya_speech.astype.float_to_int(y_)
frames = malaya_speech.generator.frames(y, 30, sr)
frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
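frames_webrtc is now a list of (frame, is-speech) pairs at the original sample rate. To stitch the voiced frames back together we can reuse the same combiner the pipeline below relies on; this is a sketch assuming malaya_speech.combine.without_silent accepts such a list of pairs:

y_ = malaya_speech.combine.without_silent(frames_webrtc)
ipd.Audio(y_, rate = sr)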

Or we can wire the same steps into a Pipeline,

p = Pipeline()
pipeline_left = (
    p.map(malaya_speech.generator.frames, frame_duration_ms = 30, sample_rate = 16000)
)
pipeline_right = (
    p.map(malaya_speech.resample, old_samplerate = sr, new_samplerate = 16000)
    .map(malaya_speech.astype.float_to_int)
    .map(malaya_speech.generator.frames, frame_duration_ms = 30, sample_rate = 16000,
         append_ending_trail = False)
    .foreach_map(vad)
)
pipeline_left.foreach_zip(pipeline_right).map(malaya_speech.combine.without_silent)
p.visualize()
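p.visualize() only draws the graph; to execute it we feed in the float waveform. Assuming the Pipeline is callable and keys its outputs by step name, as in the malaya-speech pipeline example, the cleaned audio comes out under 'without_silent':

results = p(y)
y_ = results['without_silent']
ipd.Audio(y_, rate = sr)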
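Use deep model #

WebRTC VAD is a lightweight heuristic; malaya-speech also provides neural VAD models that plug into the exact same pipeline. The model name and quantized flag below follow the malaya-speech VAD example and should be treated as an assumption if your version differs:

# assumption: arguments per the malaya-speech VAD example
quantized_model = malaya_speech.vad.deep_model(model = 'vggvox-v2', quantized = True)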
p = Pipeline()
pipeline_left = (
    p.map(malaya_speech.generator.frames, frame_duration_ms = 30, sample_rate = 16000)
)
pipeline_right = (
    pipeline_left.batching(5)
    .foreach_map(quantized_model.predict)
    .flatten()
)
pipeline_left.foreach_zip(pipeline_right).map(malaya_speech.combine.without_silent,
                                              threshold_to_stop = 0.05)
p.visualize()
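Executing this graph featurizes every 30 ms frame for the deep model (again assuming the Pipeline is callable as above). Frames that short trigger librosa's n_fft warnings during feature extraction, which are harmless here:

results = p(y)
y_ = results['without_silent']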
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/librosa/core/spectrum.py:224: UserWarning: n_fft=512 is too small for input signal of length=480
  n_fft, y.shape[-1]
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/librosa/core/spectrum.py:224: UserWarning: n_fft=512 is too small for input signal of length=160
  n_fft, y.shape[-1]