forked from ufal/whisper_streaming
-
Notifications
You must be signed in to change notification settings - Fork 1
/
mic_test_whisper_simple.py
95 lines (69 loc) · 2.84 KB
/
mic_test_whisper_simple.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from microphone_stream import MicrophoneStream
from voice_activity_controller import VoiceActivityController
from whisper_online import *
import numpy as np
import librosa
import io
import soundfile
import sys
class SimpleASRProcessor:
    """Minimal streaming ASR wrapper: accumulates VAD-emitted audio chunks
    in a buffer and transcribes them with a whisper_online ASR backend."""

    def __init__(self, asr, sampling_rate=16000):
        """Run this when starting or restarting processing.

        asr           -- ASR backend exposing .transcribe(audio, init_prompt=...)
        sampling_rate -- audio sample rate in Hz (default 16000)
        """
        self.audio_buffer = np.array([], dtype=np.float32)
        self.prompt_buffer = ""
        self.asr = asr
        self.sampling_rate = sampling_rate
        self.init_prompt = ''

    def ts_words(self, segments):
        """Concatenate the words of all segments into one string, skipping
        segments that are most likely non-speech (no_speech_prob > 0.9)."""
        words = []
        for segment in segments:
            if segment.no_speech_prob > 0.9:
                continue
            # word.word already carries its leading whitespace, so a plain
            # join reproduces the transcript text.
            words.extend(word.word for word in segment.words)
        return "".join(words)

    def stream_process(self, vad_result):
        """Consume (chunk, is_final) pairs from a VAD iterator and yield
        (is_final, text) transcription updates.

        chunk    -- raw little-endian 16-bit PCM mono bytes, or None
        is_final -- True when the VAD decided the phrase ended
        Yields (True, full_phrase_text) at phrase end and
        (False, partial_text) as progress roughly every 50 chunks.
        """
        iter_in_phrase = 0
        for chunk, is_final in vad_result:
            iter_in_phrase += 1
            if chunk is not None:
                # Decode the raw PCM16 bytes into float32 samples.
                # Fix: use self.sampling_rate (the constructor parameter)
                # instead of the module-global SAMPLING_RATE.
                sf = soundfile.SoundFile(
                    io.BytesIO(chunk), channels=1, endian="LITTLE",
                    samplerate=self.sampling_rate, subtype="PCM_16", format="RAW",
                )
                audio, _ = librosa.load(sf, sr=self.sampling_rate)
                self.audio_buffer = np.append(self.audio_buffer, audio)
            if is_final and len(self.audio_buffer) > 0:
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                tsw = self.ts_words(res)
                # Keep only a bounded tail of the transcript as the
                # conditioning prompt for the next phrase.
                self.init_prompt = (self.init_prompt + tsw)[-100:]
                # Fix: rebind a fresh empty array; ndarray.resize(0) raises
                # when the array does not own its data.
                self.audio_buffer = np.array([], dtype=np.float32)
                iter_in_phrase = 0
                yield True, tsw
            # show progress every 50 chunks
            elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                tsw = self.ts_words(res)
                yield False, tsw
# --- configuration ---------------------------------------------------------
SAMPLING_RATE = 16000
model = "large-v2"
src_lan = "en"  # source language
tgt_lan = "en"  # target language -- same as source for ASR, "en" if translate task is used
use_vad = False
min_sample_length = 1 * SAMPLING_RATE

# --- build the pipeline: microphone -> VAD -> incremental transcription ----
vac = VoiceActivityController(use_vad_result=use_vad)
# Fix: pass the `model` constant instead of repeating the hard-coded string.
asr = FasterWhisperASR(src_lan, model)  # loads and wraps Whisper model
tokenizer = create_tokenizer(tgt_lan)
online = SimpleASRProcessor(asr, sampling_rate=SAMPLING_RATE)

stream = MicrophoneStream()
stream = vac.detect_user_speech(stream, audio_in_int16=False)
stream = online.stream_process(stream)

# Final phrases advance to a new line; partial results overwrite the
# current line with a bare carriage return.
for is_final, text in stream:
    if is_final:
        print(text, end="\r\n")
    else:
        print(text, end="\r")