Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Misc. fixes ahead of py2app packaging #2

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
PySide6
PySide6==6.7.0
pyaudio
shortuuid
sounddevice
webrtcvad
sounddevice
3 changes: 2 additions & 1 deletion scripts/start_macos_arm64.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
# setup dirs
mkdir -p ~/Desktop/audio-data

env DYLD_LIBRARY_PATH="/opt/homebrew/lib:$DYLD_LIBRARY_PATH" python3 -m yoruba_voice_speech_recorder -p src/yoruba_voice_speech_recorder/prompts/yovo_3501.txt -d ~/Desktop/audio-data
# launch app with specific homebrew environment
env DYLD_LIBRARY_PATH="/opt/homebrew/lib:$DYLD_LIBRARY_PATH" python3 -m yoruba_voice_speech_recorder
2 changes: 1 addition & 1 deletion scripts/start_macos_x86_64.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
mkdir -p ~/Desktop/audio-data

# launch app
python3 -m yoruba_voice_speech_recorder -p src/yoruba_voice_speech_recorder/prompts/yovo_3501.txt -d ~/Desktop/audio-data
python3 -m yoruba_voice_speech_recorder
250 changes: 2 additions & 248 deletions src/yoruba_voice_speech_recorder/__main__.py
Original file line number Diff line number Diff line change
@@ -1,253 +1,7 @@
#!/usr/bin/env python3

import argparse
import datetime
import logging
import math
import os
import os.path
import random
import re
import sys
import threading

from PySide6.QtCore import QObject, Slot, QUrl
from PySide6.QtWidgets import QApplication
from PySide6.QtQml import QQmlApplicationEngine
from PySide6.QtWidgets import QMessageBox

local_src_module_path = os.path.join(os.path.dirname(__file__), "../../")
sys.path.append(local_src_module_path)

import src.yoruba_voice_speech_recorder.audio as audio
import shortuuid

event = threading.Event()
current_frame = 0


class Recorder(QObject):
    """Back end exposed to the QML UI: loads prompts and records one take per prompt.

    The QML side drives this object through its ``@Slot`` methods. ``window``
    (the QML root object) must be assigned by the caller once the QML file has
    been loaded, because most slots read and write properties on it.
    """

    # Corpus-specific wrappers removed from a prompt line before it is shown.
    _CORPUS_PATTERNS = (
        re.compile(r'^\w+ "(.*)"$'),  # arctic
        re.compile(r'^(.*) \(s.\d+\)$'),  # timit
    )
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, save_dir, prompts_filename, ordered=True, prompts_count=250, prompt_len_soft_max=None):
        """Create a recorder.

        Args:
            save_dir: existing directory that receives the .wav files and recorder.tsv.
            prompts_filename: text file with one prompt per line.
            ordered: present prompts in file order instead of sampling at random.
            prompts_count: initial number of prompts; overwritten with the number
                of prompts found in the file when the list view is populated.
            prompt_len_soft_max: if set, prompts are split at the first whitespace
                found at or after this many characters.

        Raises:
            NotADirectoryError: if ``save_dir`` is not a directory.
        """
        super().__init__()
        self.window = None  # assigned by the caller after the QML has loaded
        self.scriptModel = None
        self.speaker_id = None
        self.speaker_name = None
        if not os.path.isdir(save_dir):
            raise NotADirectoryError("save_dir '%s' is not a directory" % save_dir)
        self.save_dir = save_dir
        if not os.path.isfile(prompts_filename):
            # Not fatal: the user can load a prompts file later via reinit_with_url().
            self.msgWarning = QMessageBox()
            self.msgWarning.setIcon(QMessageBox.Warning)
            self.msgWarning.setText(" Prompts file ńkọ́ ?\n Please load a prompts file")
            self.msgWarning.setStandardButtons(QMessageBox.Ok)
            self.msgWarning.setWindowTitle("Prompt file needed Message")
            self.msgWarning.show()

        self.prompts_filename = prompts_filename
        logging.debug("prompts in file: %s", self.count_prompts_file_prompts_count())
        self.prompts_count = prompts_count
        self.prompt_len_soft_max = prompt_len_soft_max
        self.ordered = ordered
        self.audio = audio.Audio()

    @staticmethod
    def _is_prompt_line(line):
        """Return True if ``line`` holds a prompt (not blank, not a ``;``/``#`` comment)."""
        stripped = line.strip()
        return bool(stripped) and not stripped.startswith((";", "#"))

    @classmethod
    def _strip_corpus_markup(cls, script):
        """Remove the arctic / timit wrappers from a prompt line, if present."""
        for pattern in cls._CORPUS_PATTERNS:
            script = pattern.sub(r'\1', script, count=1)
        return script

    def count_prompts_file_prompts_count(self):
        """Return the number of prompt lines in the prompts file, or None if it is missing."""
        try:
            # utf-8-sig: prompts contain Yoruba diacritics and may carry a BOM.
            with open(self.prompts_filename, 'r', encoding='utf-8-sig') as fp:
                return sum(1 for line in fp if self._is_prompt_line(line))
        except FileNotFoundError as not_found:
            logging.warning("prompts file not found: %s", not_found.filename)
            return None

    @Slot(QUrl)
    def reinit_with_url(self, url):
        """Switch to the prompts file at ``url`` and rebuild the list view."""
        filename = url.toLocalFile()
        logging.debug('reinit_with_url: new prompt filename: %s', filename)
        self.prompts_filename = filename  # set new prompt filename
        self.scriptModel.clear()  # empty out list view
        self.populate_listview()  # re-init

    @Slot(QObject)
    def init(self, scriptModel):
        """Remember the QML list model, publish the save directory and load the prompts."""
        logging.debug("init: %s", scriptModel)
        self.window.setProperty('saveDir', self.save_dir)
        self.scriptModel = scriptModel
        self.populate_listview()

    def populate_listview(self):
        """Append every prompt of the current prompts file to the QML window."""
        if not os.path.isfile(self.prompts_filename):
            # __init__ already warned the user; stay alive until a file is loaded.
            logging.warning("populate_listview: no prompts file at %s", self.prompts_filename)
            return
        self.prompts_count = self.count_prompts_file_prompts_count()
        logging.info("prompts_count >>>>> %s", self.prompts_count)
        self.window.setProperty('promptsName', os.path.splitext(os.path.basename(self.prompts_filename))[0])
        for script in self.get_scripts_from_file(self.prompts_count, self.prompts_filename, self.ordered,
                                                 split_len=self.prompt_len_soft_max):
            self.window.appendScript({'script': script, 'filename': ''})

    @Slot(bool)
    def toggleRecording(self, recording):
        """Log the new recording state reported by the UI."""
        logging.debug('toggleRecording: recording is now %s', recording)

    @Slot()
    def startRecording(self):
        """Discard any queued audio and start the input stream."""
        size = self.flush()
        logging.debug('flushed %s', size)
        self.audio.stream.start_stream()

    @Slot()
    def finishRecording(self):
        """Stop the stream, save the take as a .wav and append its row to recorder.tsv.

        If the current prompt already has a recording, that file and its row
        are removed first so each prompt keeps a single take.
        """
        self.audio.stream.stop_stream()
        data = self.read_audio(drop_last=3)
        previous = self.window.property('scriptFilename')
        if previous:
            self.deleteFile(previous)
        save_dir = self.window.property('saveDir')
        stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S_%f")
        filename = os.path.normpath(os.path.join(save_dir, "recorder_" + stamp + ".wav"))
        self.window.setProperty('scriptFilename', filename)
        self.audio.write_wav(filename, data)
        scriptText = self.window.property('scriptText')

        # Fall back to a placeholder when no speaker name was entered.
        if not self.speaker_id or self.speaker_id.isspace():
            self.speaker_id = "UNNAMED_SPEAKER"
        logging.debug("speaker_id: %s", self.speaker_id)
        row = [filename, self.speaker_id, self.window.property('promptsName'), '',
               self.sanitize_script(scriptText)]
        # Explicit UTF-8: the prompt text contains characters outside ASCII.
        with open(os.path.join(save_dir, "recorder.tsv"), "a", encoding="utf-8") as xsvfile:
            xsvfile.write('\t'.join(row) + '\n')
        logging.debug("wrote %s to %s", len(data), filename)

    @Slot(str)
    def deleteFile(self, filename):
        """Delete a recording and drop every recorder.tsv row that mentions it."""
        if not filename:
            # An empty name is contained in every row and would wipe the whole TSV.
            return
        try:
            os.remove(filename)
        except FileNotFoundError:
            logging.warning("deleteFile: %s does not exist", filename)
        save_dir = self.window.property('saveDir')
        xsvfile_in_path = os.path.join(save_dir, "recorder.tsv")
        xsvfile_out_path = os.path.join(save_dir, "recorder_delete_temp.tsv")
        if os.path.isfile(xsvfile_in_path):
            # Rewrite through a temp file so a failure never leaves a half-written TSV.
            with open(xsvfile_in_path, "r", encoding="utf-8") as xsvfile_in, \
                    open(xsvfile_out_path, "w", encoding="utf-8") as xsvfile_out:
                xsvfile_out.writelines(line for line in xsvfile_in if filename not in line)
            os.replace(xsvfile_out_path, xsvfile_in_path)
        self.window.setProperty('scriptFilename', '')

    @Slot(str)
    def acceptSpeakerNameText(self, speakerName):
        """Store the speaker name and derive a speaker id with a short random suffix."""
        logging.debug("acceptSpeakerNameText Slot")
        self.speaker_name = speakerName
        if not self.speaker_name or self.speaker_name.isspace():
            self.speaker_name = "UNNAMED_SPEAKER"
        self.speaker_id = self.speaker_name + "_" + str(shortuuid.uuid()[:16])

    def read_audio(self, drop_last=None):
        """Drain the audio queue and return the concatenated bytes.

        Args:
            drop_last: if truthy, number of trailing blocks to discard.
        """
        blocks = []
        while not self.audio.buffer_queue.empty():
            block = self.audio.buffer_queue.get_nowait()
            if block:
                blocks.append(block)
        if drop_last:
            blocks = blocks[:-drop_last]
        return b''.join(blocks)

    def flush(self):
        """Empty the audio queue and return the size it reported before draining."""
        size = self.audio.buffer_queue.qsize()
        while not self.audio.buffer_queue.empty():
            self.audio.buffer_queue.get_nowait()
        return size

    def get_scripts_from_file(self, n, filename, ordered=False, split_len=None):
        """Return up to ``n`` prompts read from ``filename``.

        Blank lines and lines starting with ``;`` or ``#`` are skipped, using
        the same rule as count_prompts_file_prompts_count().

        Args:
            n: maximum number of prompts to return; None means all of them.
            filename: prompts file, one prompt per line.
            ordered: keep file order; otherwise draw ``n`` prompts at random,
                with replacement.
            split_len: if not None, split each prompt with split_script().
        """
        with open(filename, 'r', encoding='utf-8-sig') as file:
            scripts = [line.strip() for line in file if self._is_prompt_line(line)]
        if n is None:
            n = len(scripts)
        if not ordered and scripts:
            scripts = [random.choice(scripts) for _ in range(n)]
        scripts = [self._strip_corpus_markup(script) for script in scripts[:n]]
        if split_len is not None:
            scripts = [piece for script in scripts for piece in self.split_script(script, split_len)]
        return scripts[:n]

    # TODO - IO do we need to sanitize scripts?
    @classmethod
    def sanitize_script(cls, script):
        """Return ``script`` with hyphens, tabs and line breaks replaced by spaces.

        Tabs and line breaks are replaced as well so that the text always fits
        in a single recorder.tsv cell.
        """
        script = re.sub(r'[\-\t\r\n]', ' ', script)
        return script.strip()

    @classmethod
    def split_script(cls, script, split_len):
        """Split ``script`` into pieces of at least ``split_len`` characters.

        Each cut is made at the first whitespace found at or after ``split_len``
        characters from the start of the piece; the final piece keeps whatever
        remains. An empty ``script`` yields an empty list.
        """
        scripts = []
        startpos = 0
        for _ in range(math.ceil(len(script) / split_len)):
            match = cls._WHITESPACE.search(script, pos=startpos + split_len)
            endpos = match.start() if match else None
            scripts.append(script[startpos:endpos].strip())
            if endpos is None:
                break
            startpos = endpos
        return scripts


def main():
    """Parse the command line, build the Qt/QML application and run it.

    Exits the process with the Qt event loop's return code, or with status 1
    if the QML file next to this module cannot be loaded.
    """
    current_path = os.path.abspath(os.path.dirname(__file__))
    # Use only the module's base name: joining the directory with the full
    # __file__ path duplicates directories when __file__ is relative.
    module_stem = os.path.splitext(os.path.basename(__file__))[0]
    qml_file = os.path.join(current_path, module_stem + '.qml')

    parser = argparse.ArgumentParser(description='''
        Given a text file containing prompts, this app will choose a random selection
        and ordering of them, display them to be dictated by the user, and record the
        dictation audio and metadata to a `.wav` file and `recorder.tsv` file
        respectively.
    ''')
    parser.add_argument('-p', '--prompts_filename', help='file containing prompts to choose from')
    parser.add_argument('-d', '--save_dir', default='./audio_data',
                        help='where to save .wav & recorder.tsv files (default: %(default)s)')
    parser.add_argument('-c', '--prompts_count', type=int, default=250,
                        help='number of prompts to select and display (default: %(default)s)')
    parser.add_argument('-l', '--prompt_len_soft_max', type=int)
    parser.add_argument('-o', '--ordered', action='store_true', default=True,
                        help='present prompts in order, as opposed to random (default: %(default)s)')
    # --ordered defaults to True and can only set True, so random order needs its own switch.
    parser.add_argument('-r', '--random', dest='ordered', action='store_false',
                        help='present prompts in random order')
    args = parser.parse_args()
    if not args.prompts_filename:
        # assert is stripped under -O; report a normal usage error instead.
        parser.error('a prompts file is required (-p/--prompts_filename)')

    os.environ["QT_AUTO_SCREEN_SCALE_FACTOR"] = "1"
    app = QApplication(sys.argv)
    engine = QQmlApplicationEngine()
    engine.addImportPath(current_path)
    optional_names = ('prompts_count', 'prompt_len_soft_max')
    kwargs = {k: v for k, v in vars(args).items() if v is not None and k in optional_names}
    recorder = Recorder(args.save_dir, args.prompts_filename, args.ordered, **kwargs)
    engine.rootContext().setContextProperty('recorder', recorder)
    engine.load(qml_file)
    root_objects = engine.rootObjects()
    if not root_objects:
        # load() leaves the list empty when the QML is missing or has errors.
        logging.error("could not load QML file %s", qml_file)
        sys.exit(1)
    recorder.window = root_objects[0]

    res = app.exec()
    sys.exit(res)
import yoruba_voice_speech_recorder.yv_recorder as yv_recorder


if __name__ == '__main__':
logging.basicConfig(level=10)
main()
yv_recorder.main()
51 changes: 8 additions & 43 deletions src/yoruba_voice_speech_recorder/audio.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import collections, wave, logging, os, datetime
import pyaudio
import webrtcvad
import queue


Expand Down Expand Up @@ -75,48 +74,10 @@ def write_wav(self, filename, data):
wf.close()


class VADAudio(Audio):
    """Audio source that segments its blocks into utterances via voice activity detection."""

    def __init__(self, aggressiveness=3):
        super(VADAudio, self).__init__()
        self.vad = webrtcvad.Vad(aggressiveness)

    def vad_collector(self, padding_ms=300, ratio=0.75, blocks=None):
        """Yield the audio blocks of each utterance, followed by a single None.

        A sliding window covering ``padding_ms`` of audio decides the state:
        collection starts once more than ``ratio`` of the window is voiced
        (the window's blocks are emitted first, so the lead-in is kept) and
        stops once more than ``ratio`` of the window is unvoiced.

        Example: (block, ..., block, None, block, ..., block, None, ...)
                  |---utterance---|        |---utterance---|

        Args:
            padding_ms: length of the sliding window in milliseconds.
            ratio: fraction of the window that must be voiced (or unvoiced)
                to switch state.
            blocks: iterable of audio blocks; defaults to iterating ``self``.
        """
        source = iter(self) if blocks is None else blocks
        window = collections.deque(maxlen=padding_ms // self.block_duration_ms)
        in_utterance = False

        for chunk in source:
            voiced = self.vad.is_speech(chunk, self.sample_rate)

            if in_utterance:
                yield chunk
                window.append((chunk, voiced))
                silent_count = sum(1 for _, flag in window if not flag)
                if silent_count > ratio * window.maxlen:
                    # Mostly silence: close the utterance.
                    in_utterance = False
                    yield None
                    window.clear()
                continue

            window.append((chunk, voiced))
            voiced_count = sum(1 for _, flag in window if flag)
            if voiced_count > ratio * window.maxlen:
                # Mostly speech: open an utterance and replay the buffered lead-in.
                in_utterance = True
                for buffered, _ in window:
                    yield buffered
                window.clear()


class AudioStore(object):
"""Stores last `maxlen` recognitions as tuples (audio, text, grammar_name, rule_name), indexed in reverse order (0 most recent)"""
"""Stores last `maxlen` recognitions as tuples (audio, text, grammar_name, rule_name), indexed in
reverse order (0 most recent)
"""

def __init__(self, audio_obj, maxlen=0, save_dir=None, auto_save_func=None):
self.audio_obj = audio_obj
Expand All @@ -140,17 +101,21 @@ def finalize(self, text, grammar_name, rule_name):

def save(self, index):
if self.save_dir:
filename = os.path.join(self.save_dir, "retain_" + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S_%f") + ".wav")
filename = os.path.join(self.save_dir,
"retain_" + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S_%f") + ".wav")
audio, text, grammar_name, rule_name = self.deque[index]
self.audio_obj.write_wav(filename, audio)
with open(os.path.join(self.save_dir, "retain.csv"), "a") as csvfile:
csvfile.write(','.join([filename, '0', grammar_name, rule_name, text]) + '\n')

def __getitem__(self, key):
return self.deque[key]

def __len__(self):
return len(self.deque)

def __bool__(self):
return True

def __nonzero__(self):
return True
Loading