[13]:
y_ = sum(audio_chunks)  # concatenate the pydub chunks back into one AudioSegment
y_ = np.array(y_.get_array_of_samples())  # AudioSegment -> numpy int samples
y_ = malaya_speech.astype.int_to_float(y_)  # back to float for malaya-speech
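We can listen to the stitched result, the same way the notebook does later for the noisier sample (ipd is IPython.display):

ipd.Audio(y_, rate = sr)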
Use WebRTC VAD
We can also split using VAD. The nice thing about VAD is that we do not need to define a silence threshold; the quality of the split depends only on how good the VAD model is.
[16]:
vad = malaya_speech.vad.webrtc()
# WebRTC VAD expects 16-bit int samples at 8/16/32/48 kHz,
# so resample to 16 kHz and cast to int before feeding it frames
y_ = malaya_speech.resample(y, sr, 16000)
y_ = malaya_speech.astype.float_to_int(y_)
frames = malaya_speech.generator.frames(y, 30, sr)
frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
# pair each original frame with the VAD decision from its 16 kHz int counterpart
frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
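With every frame labelled, we can keep only the voiced frames. A minimal sketch, reusing the same malaya_speech.combine.without_silent combiner the pipeline below uses:

# frames_webrtc is a list of (frame, is_speech) pairs,
# the same shape the pipeline below feeds to without_silent
y_voiced = malaya_speech.combine.without_silent(frames_webrtc)
ipd.Audio(y_voiced, rate = sr)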
Or we can use a Pipeline,
[18]:
p = Pipeline()
pipeline_left = (
    p.map(malaya_speech.generator.frames, frame_duration_ms = 30, sample_rate = 16000)
)
pipeline_right = (
    p.map(malaya_speech.resample, old_samplerate = sr, new_samplerate = 16000)
    .map(malaya_speech.astype.float_to_int)
    .map(malaya_speech.generator.frames, frame_duration_ms = 30, sample_rate = 16000,
         append_ending_trail = False)
    .foreach_map(vad)
)
pipeline_left.foreach_zip(pipeline_right).map(malaya_speech.combine.without_silent)
p.visualize()
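To actually run the assembled pipeline, we feed it the audio. A minimal sketch, assuming the Pipeline returns a dict of per-step outputs keyed by function name, as in other malaya-speech examples:

results = p(y)
# 'without_silent' is the final step of the pipeline above
ipd.Audio(results['without_silent'], rate = sr)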
Now let's try a harder, noisier sample, a khutbah recording; we take the first 15 seconds,

y, sr = malaya_speech.load('speech/khutbah/wadi-annuar.wav')
y = y[: sr * 15]
len(y), sr
y_int = malaya_speech.astype.float_to_int(y)
audio = AudioSegment(
    y_int.tobytes(),
    frame_rate = sr,
    sample_width = y_int.dtype.itemsize,
    channels = 1,
)
audio_chunks = split_on_silence(
    audio,
    min_silence_len = 200,
    silence_thresh = -30,
    keep_silence = 100,
)
audio_chunks
y_ = sum(audio_chunks)
y_ = np.array(y_.get_array_of_samples())
y_ = malaya_speech.astype.int_to_float(y_)
ipd.Audio(y_, rate = sr)
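The pipeline below scores frames with a quantized deep VAD model. Loading it is not shown in this section; a minimal sketch using malaya_speech.vad.deep_model (the 'vggvox-v2' choice here is an assumption, any available deep VAD model works):

# assumption: 'vggvox-v2' is one available model name
quantized_model = malaya_speech.vad.deep_model(model = 'vggvox-v2', quantized = True)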
p = Pipeline()
pipeline_left = (
    p.map(malaya_speech.generator.frames, frame_duration_ms = 30, sample_rate = 16000)
)
pipeline_right = (
    pipeline_left.batching(5)
    .foreach_map(quantized_model.predict)
    .flatten()
)
pipeline_left.foreach_zip(pipeline_right).map(malaya_speech.combine.without_silent,
                                              threshold_to_stop = 0.05)
p.visualize()
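Executing this pipeline emits the librosa warnings below: a 30 ms frame at 16 kHz is only 480 samples (and the trailing frame only 160), shorter than the featurizer's n_fft of 512. A minimal sketch of the run:

results = p(y)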
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/librosa/core/spectrum.py:224: UserWarning: n_fft=512 is too small for input signal of length=480
n_fft, y.shape[-1]
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/librosa/core/spectrum.py:224: UserWarning: n_fft=512 is too small for input signal of length=160
n_fft, y.shape[-1]
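The warnings do not stop the run; we can then listen to the output, again assuming the per-step results convention:

ipd.Audio(results['without_silent'], rate = sr)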