Auto backup: 2026-02-21 07:01

This commit is contained in:
Krilly
2026-02-21 07:01:51 +00:00
parent 8757148122
commit 17b5b82d99
2012 changed files with 352552 additions and 331 deletions

View File

@@ -0,0 +1,275 @@
import os
import sys
import srt
import datetime
import json
import requests
from urllib.request import urlretrieve
from zipfile import ZipFile
from re import match
from pathlib import Path
from .vosk_cffi import ffi as _ffi
from tqdm import tqdm
# Remote location of the models and local folders.
# MODEL_PRE_URL is the base URL; a model archive is fetched from
# MODEL_PRE_URL + <model name> + ".zip". MODEL_LIST_URL is the JSON index
# that list_models()/list_languages() and the Model lookups read.
MODEL_PRE_URL = "https://alphacephei.com/vosk/models/"
MODEL_LIST_URL = MODEL_PRE_URL + "model-list.json"
# Directories searched, in order, for an already-installed model. The first
# entry comes from the VOSK_MODEL_PATH environment variable and is None when
# that variable is unset; lookups skip None and non-existent directories.
MODEL_DIRS = [os.getenv("VOSK_MODEL_PATH"), Path("/usr/share/vosk"),
Path.home() / "AppData/Local/vosk", Path.home() / ".cache/vosk"]
def open_dll():
    """Load the native Vosk library located next to this module.

    The library file name is chosen from ``sys.platform``. On Windows the
    module directory is additionally prepended to ``PATH`` (and registered via
    ``os.add_dll_directory`` when that function exists) before loading.

    Returns:
        The library object returned by ``_ffi.dlopen``.

    Raises:
        TypeError: if ``sys.platform`` is not win32, linux or darwin.
    """
    here = os.path.abspath(os.path.dirname(__file__))
    platform = sys.platform

    if platform == "win32":
        # We want to load dependencies too
        os.environ["PATH"] = os.pathsep.join((here, os.environ["PATH"]))
        if hasattr(os, "add_dll_directory"):
            os.add_dll_directory(here)
        library_name = "libvosk.dll"
    elif platform == "linux":
        library_name = "libvosk.so"
    elif platform == "darwin":
        library_name = "libvosk.dyld"
    else:
        raise TypeError("Unsupported platform")

    return _ffi.dlopen(os.path.join(here, library_name))
# Native library handle, loaded once when this module is imported; every
# wrapper class and function below calls into the library through it.
_c = open_dll()
def list_models():
    """Print the name of every entry in the remote model index, one per line."""
    catalogue = requests.get(MODEL_LIST_URL, timeout=10).json()
    names = (entry["name"] for entry in catalogue)
    for name in names:
        print(name)
def list_languages():
    """Print each distinct ``lang`` value found in the remote model index."""
    catalogue = requests.get(MODEL_LIST_URL, timeout=10).json()
    # A set removes duplicates: several models can share one language.
    distinct = set()
    for entry in catalogue:
        distinct.add(entry["lang"])
    for code in distinct:
        print(code)
class Model:
    """Python owner of a native Vosk model handle.

    The model is loaded from ``model_path`` when given. Otherwise it is looked
    up by ``model_name`` (or, when that is None, by ``lang``) in ``MODEL_DIRS``
    and downloaded from ``MODEL_PRE_URL`` if no local copy is found.
    """

    def __init__(self, model_path=None, model_name=None, lang=None):
        """Create the native model.

        Args:
            model_path: directory of an existing model; skips any lookup.
            model_name: exact model name to find locally or download.
            lang: language code used when ``model_name`` is None.

        Raises:
            Exception: if the native library returns a NULL handle.
        """
        if model_path is None:
            model_path = self.get_model_path(model_name, lang)
        self._handle = _c.vosk_model_new(model_path.encode("utf-8"))
        if self._handle == _ffi.NULL:
            raise Exception("Failed to create a model")

    def __del__(self):
        # __init__ can fail before _handle is assigned (lookup error,
        # sys.exit during download); there is nothing to free in that case.
        handle = getattr(self, "_handle", None)
        if handle is not None:
            _c.vosk_model_free(handle)

    def vosk_model_find_word(self, word):
        """Return the native ``vosk_model_find_word`` result for ``word``."""
        return _c.vosk_model_find_word(self._handle, word.encode("utf-8"))

    def get_model_path(self, model_name, lang):
        """Resolve a model directory by name, or by language if name is None.

        Returns:
            The model path as a string.
        """
        if model_name is None:
            model_path = self.get_model_by_lang(lang)
        else:
            model_path = self.get_model_by_name(model_name)
        return str(model_path)

    def get_model_by_name(self, model_name):
        """Find ``model_name`` in ``MODEL_DIRS`` or download it.

        Prints a message and exits the process with status 1 when the name is
        not present in the remote model index.
        """
        for directory in MODEL_DIRS:
            if directory is None or not Path(directory).exists():
                continue
            if model_name in os.listdir(directory):
                return Path(directory, model_name)

        response = requests.get(MODEL_LIST_URL, timeout=10)
        result_model = [model["name"] for model in response.json()
                        if model["name"] == model_name]
        if not result_model:
            print("model name %s does not exist" % (model_name))
            sys.exit(1)
        # Downloads go to the last configured directory (previously this
        # relied on the loop variable leaking out of the search loop).
        target = Path(MODEL_DIRS[-1], result_model[0])
        self.download_model(target)
        return target

    def get_model_by_lang(self, lang):
        """Find a local model whose name matches ``lang`` or download one.

        Local candidates are directory entries matching
        ``vosk-model(-small)?-<lang>``. If none exists, the first index entry
        with that language, type "small" and obsolete "false" is downloaded.
        Prints a message and exits with status 1 when no such entry exists.
        """
        for directory in MODEL_DIRS:
            if directory is None or not Path(directory).exists():
                continue
            model_file = [model for model in os.listdir(directory)
                          if match(r"vosk-model(-small)?-{}".format(lang), model)]
            if model_file:
                return Path(directory, model_file[0])

        response = requests.get(MODEL_LIST_URL, timeout=10)
        result_model = [
            model["name"] for model in response.json()
            if model["lang"] == lang and model["type"] == "small"
            and model["obsolete"] == "false"]
        if not result_model:
            print("lang %s does not exist" % (lang))
            sys.exit(1)
        target = Path(MODEL_DIRS[-1], result_model[0])
        self.download_model(target)
        return target

    def download_model(self, model_name):
        """Download ``<model_name.name>.zip`` and unpack it beside ``model_name``.

        Args:
            model_name: ``Path`` of the model directory to create; its parent
                is created if missing and receives the extracted archive.
        """
        # exist_ok avoids failing when another process creates the directory
        # between a separate existence check and the mkdir call.
        model_name.parent.mkdir(parents=True, exist_ok=True)
        archive_name = str(model_name.name) + ".zip"
        archive_path = str(model_name) + ".zip"
        with tqdm(unit="B", unit_scale=True, unit_divisor=1024, miniters=1,
                  desc=archive_name) as t:
            reporthook = self.download_progress_hook(t)
            urlretrieve(MODEL_PRE_URL + archive_name, archive_path,
                        reporthook=reporthook, data=None)
            t.total = t.n
        try:
            with ZipFile(archive_path, "r") as model_ref:
                model_ref.extractall(model_name.parent)
        finally:
            # Remove the archive even if extraction fails.
            Path(archive_path).unlink()

    def download_progress_hook(self, t):
        """Build a ``urlretrieve`` report hook that advances progress bar ``t``.

        The returned callable takes ``(b, bsize, tsize)``: blocks transferred
        so far, block size, and total size (None or -1 when unknown).
        """
        last_b = 0

        def update_to(b=1, bsize=1, tsize=None):
            nonlocal last_b
            if tsize not in (None, -1):
                t.total = tsize
            # urlretrieve reports a running block count; the bar wants deltas.
            displayed = t.update((b - last_b) * bsize)
            last_b = b
            return displayed

        return update_to
class SpkModel:
    """Python owner of a native Vosk speaker-model handle."""

    def __init__(self, model_path):
        """Load the speaker model stored at ``model_path``.

        Raises:
            Exception: if the native library returns a NULL handle.
        """
        self._handle = _c.vosk_spk_model_new(model_path.encode("utf-8"))
        if self._handle == _ffi.NULL:
            raise Exception("Failed to create a speaker model")

    def __del__(self):
        # __init__ may have raised before _handle was assigned.
        handle = getattr(self, "_handle", None)
        if handle is not None:
            _c.vosk_spk_model_free(handle)
class KaldiRecognizer:
    """Python owner of a native Vosk recognizer handle.

    Accepted positional argument forms:
        ``(model, sample_rate)``
        ``(model, sample_rate, spk_model)`` where ``spk_model`` is a SpkModel
        ``(model, sample_rate, grammar)`` where ``grammar`` is a str
    """

    def __init__(self, *args):
        """Create the recognizer.

        Raises:
            TypeError: if the arguments match none of the accepted forms.
            Exception: if the native library returns a NULL handle.
        """
        if len(args) == 2:
            self._handle = _c.vosk_recognizer_new(args[0]._handle, args[1])
        elif len(args) == 3 and isinstance(args[2], SpkModel):
            self._handle = _c.vosk_recognizer_new_spk(
                args[0]._handle, args[1], args[2]._handle)
        elif len(args) == 3 and isinstance(args[2], str):
            self._handle = _c.vosk_recognizer_new_grm(
                args[0]._handle, args[1], args[2].encode("utf-8"))
        else:
            raise TypeError("Unknown arguments")
        if self._handle == _ffi.NULL:
            raise Exception("Failed to create a recognizer")

    def __del__(self):
        # The TypeError path above leaves the object without a _handle, so
        # the finalizer must tolerate a missing attribute.
        handle = getattr(self, "_handle", None)
        if handle is not None:
            _c.vosk_recognizer_free(handle)

    def SetMaxAlternatives(self, max_alternatives):
        """Forward ``max_alternatives`` to the native recognizer."""
        _c.vosk_recognizer_set_max_alternatives(self._handle, max_alternatives)

    def SetWords(self, enable_words):
        """Toggle the native "words" option (truthy enables)."""
        _c.vosk_recognizer_set_words(self._handle, 1 if enable_words else 0)

    def SetPartialWords(self, enable_partial_words):
        """Toggle the native "partial words" option (truthy enables)."""
        _c.vosk_recognizer_set_partial_words(
            self._handle, 1 if enable_partial_words else 0)

    def SetNLSML(self, enable_nlsml):
        """Toggle the native NLSML option (truthy enables)."""
        _c.vosk_recognizer_set_nlsml(self._handle, 1 if enable_nlsml else 0)

    def SetSpkModel(self, spk_model):
        """Attach the given SpkModel to this recognizer."""
        _c.vosk_recognizer_set_spk_model(self._handle, spk_model._handle)

    def SetGrammar(self, grammar):
        """Pass the grammar string (UTF-8 encoded) to the native recognizer."""
        _c.vosk_recognizer_set_grm(self._handle, grammar.encode("utf-8"))

    def AcceptWaveform(self, data):
        """Feed a chunk of audio bytes to the recognizer.

        Returns:
            The non-negative native status code.

        Raises:
            Exception: if the native call returns a negative value.
        """
        res = _c.vosk_recognizer_accept_waveform(self._handle, data, len(data))
        if res < 0:
            raise Exception("Failed to process waveform")
        return res

    def Result(self):
        """Return the native result string decoded as UTF-8."""
        return _ffi.string(_c.vosk_recognizer_result(self._handle)).decode("utf-8")

    def PartialResult(self):
        """Return the native partial-result string decoded as UTF-8."""
        return _ffi.string(
            _c.vosk_recognizer_partial_result(self._handle)).decode("utf-8")

    def FinalResult(self):
        """Return the native final-result string decoded as UTF-8."""
        return _ffi.string(
            _c.vosk_recognizer_final_result(self._handle)).decode("utf-8")

    def Reset(self):
        """Call the native reset function and return its result."""
        return _c.vosk_recognizer_reset(self._handle)

    def SrtResult(self, stream, words_per_line = 7):
        """Recognize all audio from ``stream`` and return it as SRT text.

        Args:
            stream: object whose ``read(n)`` returns audio bytes and an empty
                value at end of input.
            words_per_line: maximum number of words per subtitle entry.

        Returns:
            The string produced by ``srt.compose`` for the collected entries.
        """
        results = []
        while True:
            data = stream.read(4000)
            if len(data) == 0:
                break
            if self.AcceptWaveform(data):
                results.append(self.Result())
        results.append(self.FinalResult())

        subs = []
        for res in results:
            jres = json.loads(res)
            # Results without word-level entries cannot be timed; skip them.
            if "result" not in jres:
                continue
            words = jres["result"]
            for j in range(0, len(words), words_per_line):
                line = words[j : j + words_per_line]
                s = srt.Subtitle(
                    index=len(subs),
                    content=" ".join([l["word"] for l in line]),
                    start=datetime.timedelta(seconds=line[0]["start"]),
                    end=datetime.timedelta(seconds=line[-1]["end"]))
                subs.append(s)
        return srt.compose(subs)
def SetLogLevel(level):
    """Pass ``level`` to the native ``vosk_set_log_level`` and return its result."""
    outcome = _c.vosk_set_log_level(level)
    return outcome
def GpuInit():
    """Call the native ``vosk_gpu_init`` function; returns None."""
    _c.vosk_gpu_init()
    return None
def GpuThreadInit():
    """Call the native ``vosk_gpu_thread_init`` function; returns None."""
    _c.vosk_gpu_thread_init()
    return None
class BatchModel:
    """Python owner of a native Vosk batch-model handle."""

    def __init__(self, model_path, *args):
        """Load the batch model stored at ``model_path``.

        Extra positional arguments are accepted and ignored.

        Raises:
            Exception: if the native library returns a NULL handle.
        """
        self._handle = _c.vosk_batch_model_new(model_path.encode('utf-8'))
        if self._handle == _ffi.NULL:
            raise Exception("Failed to create a model")

    def __del__(self):
        # __init__ may have raised before _handle was assigned.
        handle = getattr(self, "_handle", None)
        if handle is not None:
            _c.vosk_batch_model_free(handle)

    def Wait(self):
        """Call the native ``vosk_batch_model_wait`` on this model."""
        _c.vosk_batch_model_wait(self._handle)
class BatchRecognizer:
    """Python owner of a native Vosk batch-recognizer handle."""

    def __init__(self, *args):
        """Create the recognizer from ``(batch_model, sample_rate)``.

        ``args[0]`` must expose ``_handle``; ``args[1]`` is passed through to
        the native constructor unchanged.

        Raises:
            Exception: if the native library returns a NULL handle.
        """
        self._handle = _c.vosk_batch_recognizer_new(args[0]._handle, args[1])
        if self._handle == _ffi.NULL:
            raise Exception("Failed to create a recognizer")

    def __del__(self):
        # __init__ may have raised before _handle was assigned.
        handle = getattr(self, "_handle", None)
        if handle is not None:
            _c.vosk_batch_recognizer_free(handle)

    def AcceptWaveform(self, data):
        """Feed a chunk of audio bytes to the recognizer; returns None."""
        _c.vosk_batch_recognizer_accept_waveform(self._handle, data, len(data))

    def Result(self):
        """Return the front result decoded as UTF-8 and pop it from the queue."""
        ptr = _c.vosk_batch_recognizer_front_result(self._handle)
        # Decode before popping: the pop call follows the read in the
        # original order, so the string is copied out first.
        res = _ffi.string(ptr).decode("utf-8")
        _c.vosk_batch_recognizer_pop(self._handle)
        return res

    def FinishStream(self):
        """Call the native ``vosk_batch_recognizer_finish_stream``."""
        _c.vosk_batch_recognizer_finish_stream(self._handle)

    def GetPendingChunks(self):
        """Return the native pending-chunk count for this recognizer."""
        return _c.vosk_batch_recognizer_get_pending_chunks(self._handle)

View File

@@ -0,0 +1,89 @@
#!/usr/bin/env python3
import argparse
import logging
import sys
import os
from pathlib import Path
from vosk import list_models, list_languages
from vosk.transcriber.transcriber import Transcriber
def _build_parser():
    """Create the command-line parser for the transcription tool.

    Options are declared in a table and registered in order, so the generated
    help output lists them in the same sequence as the table.
    """
    cli = argparse.ArgumentParser(
        description = "Transcribe audio file and save result in selected format")
    option_table = (
        (("--model", "-m"),
         {"type": str, "help": "model path"}),
        (("--server", "-s"),
         {"const": "ws://localhost:2700", "action": "store_const",
          "help": "use server for recognition"}),
        (("--list-models",),
         {"default": False, "action": "store_true",
          "help": "list available models"}),
        (("--list-languages",),
         {"default": False, "action": "store_true",
          "help": "list available languages"}),
        (("--model-name", "-n"),
         {"type": str, "help": "select model by name"}),
        (("--lang", "-l"),
         {"default": "en-us", "type": str, "help": "select model by language"}),
        (("--input", "-i"),
         {"type": str, "help": "audiofile"}),
        (("--output", "-o"),
         {"default": "", "type": str, "help": "optional output filename path"}),
        (("--output-type", "-t"),
         {"default": "txt", "type": str, "help": "optional arg output data type"}),
        (("--tasks", "-ts"),
         {"default": 10, "type": int,
          "help": "number of parallel recognition tasks"}),
        (("--log-level",),
         {"default": "INFO", "help": "logging level"}),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli


parser = _build_parser()
def main():
    """Run the transcription command-line tool.

    Parses the arguments, handles the two listing options, validates the
    input path, builds a list of ``(input_path, output_path)`` tasks and hands
    it to ``Transcriber.process_task_list``. For a single input file with no
    ``--output`` the output element is the empty string.

    Exits with status 1 when the input is missing or does not exist.
    """
    args = parser.parse_args()
    log_level = args.log_level.upper()
    logging.getLogger().setLevel(log_level)

    if args.list_models:
        list_models()
        return
    if args.list_languages:
        list_languages()
        return

    # Fatal problems are logged as errors so they remain visible when the
    # user raises --log-level above INFO.
    if not args.input:
        logging.error("Please specify input file or directory")
        sys.exit(1)

    input_path = Path(args.input)
    if not input_path.exists():
        logging.error(
            "File/folder %s does not exist, "
            "please specify an existing file/directory", args.input)
        sys.exit(1)

    transcriber = Transcriber(args)

    if input_path.is_dir():
        # The suffix is appended rather than set with with_suffix(): a stem
        # that still contains a dot ("a.b" from "a.b.wav") would otherwise be
        # truncated to "a.<type>" and collide with sibling files.
        # Sub-directories are not audio inputs and are skipped.
        task_list = [
            (input_path / fn,
             Path(args.output, Path(fn).stem + "." + args.output_type))
            for fn in sorted(os.listdir(input_path))
            if (input_path / fn).is_file()]
    elif input_path.is_file():
        # An empty output is passed through as "" (not a Path).
        output = Path(args.output) if args.output != "" else args.output
        task_list = [(input_path, output)]
    else:
        logging.error("Wrong arguments")
        sys.exit(1)

    transcriber.process_task_list(task_list)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,195 @@
import json
import logging
import asyncio
import websockets
import srt
import datetime
import shlex
import subprocess
from vosk import KaldiRecognizer, Model
from queue import Queue
from timeit import default_timer as timer
from multiprocessing.dummy import Pool
# Number of bytes read from the decoder or sent to the server per step.
CHUNK_SIZE = 4000
# Sample rate requested from ffmpeg and passed to the recognizer.
SAMPLE_RATE = 16000.0


class Transcriber:
    """Transcribe audio files locally or through a websocket server.

    Audio is decoded by an ``ffmpeg`` subprocess to mono signed 16-bit
    little-endian PCM at ``SAMPLE_RATE`` and recognized either with a local
    ``KaldiRecognizer`` or by the server at ``args.server``.
    """

    def __init__(self, args):
        """Load the model and keep the parsed arguments.

        Args:
            args: namespace providing ``model``, ``model_name``, ``lang``,
                ``server``, ``output_type`` and ``tasks``.
        """
        self.model = Model(model_path=args.model, model_name=args.model_name, lang=args.lang)
        self.args = args
        self.queue = Queue()

    def recognize_stream(self, rec, stream):
        """Feed ``stream.stdout`` to ``rec`` until end of input.

        Returns:
            ``(result, tot_samples)``: the parsed JSON results (complete
            results plus the final one) and the number of bytes read.
        """
        tot_samples = 0
        result = []
        while True:
            data = stream.stdout.read(CHUNK_SIZE)
            if len(data) == 0:
                break
            tot_samples += len(data)
            if rec.AcceptWaveform(data):
                jres = json.loads(rec.Result())
                logging.info(jres)
                result.append(jres)
            else:
                jres = json.loads(rec.PartialResult())
                if jres["partial"] != "":
                    logging.info(jres)
        jres = json.loads(rec.FinalResult())
        result.append(jres)
        return result, tot_samples

    async def recognize_stream_server(self, proc):
        """Send ``proc.stdout`` to the websocket server and collect replies.

        Returns:
            ``(result, tot_samples)``: the non-partial JSON replies plus the
            reply to the end-of-file message, and the number of bytes read.
        """
        async with websockets.connect(self.args.server) as websocket:
            tot_samples = 0
            result = []
            await websocket.send('{ "config" : { "sample_rate" : %f } }' % (SAMPLE_RATE))
            while True:
                data = await proc.stdout.read(CHUNK_SIZE)
                tot_samples += len(data)
                if len(data) == 0:
                    break
                await websocket.send(data)
                jres = json.loads(await websocket.recv())
                logging.info(jres)
                if "partial" not in jres:
                    result.append(jres)
            await websocket.send('{"eof" : 1}')
            jres = json.loads(await websocket.recv())
            logging.info(jres)
            result.append(jres)
            return result, tot_samples

    def format_result(self, result, words_per_line=7):
        """Render recognition results according to ``args.output_type``.

        Args:
            result: list of parsed result dicts.
            words_per_line: maximum words per subtitle (``srt`` output only).

        Returns:
            SRT text, newline-separated text, or a JSON string for the output
            types ``srt``, ``txt`` and ``json``; an empty string otherwise.
        """
        output_type = self.args.output_type
        if output_type == "srt":
            return self._format_srt(result, words_per_line)
        if output_type == "txt":
            return self._format_txt(result)
        if output_type == "json":
            return self._format_json(result)
        return ""

    @staticmethod
    def _format_srt(result, words_per_line):
        """Group word entries into subtitles of at most ``words_per_line``."""
        subs = []
        for res in result:
            if "result" not in res:
                continue
            words = res["result"]
            for j in range(0, len(words), words_per_line):
                line = words[j : j + words_per_line]
                subs.append(srt.Subtitle(
                    index=len(subs),
                    content=" ".join([l["word"] for l in line]),
                    start=datetime.timedelta(seconds=line[0]["start"]),
                    end=datetime.timedelta(seconds=line[-1]["end"])))
        return srt.compose(subs)

    @staticmethod
    def _format_txt(result):
        """Join every non-empty ``text`` field, one per line."""
        return "".join(part["text"] + "\n" for part in result if part["text"] != "")

    @staticmethod
    def _format_json(result):
        """Build the monologue JSON document from the results."""
        monologues = {"schemaVersion": "2.0", "monologues": [], "text": []}
        for part in result:
            if part["text"] != "":
                # "text" is a list; the previous code referenced an undefined
                # name here and failed on any non-empty result.
                monologues["text"].append(part["text"])
        for res in result:
            words = res.get("result")
            # A missing or empty word list has no start/end to report.
            if not words:
                continue
            monologues["monologues"].append({
                "speaker": {"id": "unknown", "name": None},
                "start": words[0]["start"],
                "end": words[-1]["end"],
                "terms": [{"confidence": t["conf"], "start": t["start"],
                           "end": t["end"], "text": t["word"], "type": "WORD"}
                          for t in words],
            })
        return json.dumps(monologues)

    @staticmethod
    def _ffmpeg_command(infile):
        """Return the ffmpeg argument list that decodes ``infile`` to stdout.

        An argument list (rather than a quoted shell string) keeps file names
        containing quotes or shell metacharacters intact.
        """
        return ["ffmpeg", "-nostdin", "-loglevel", "quiet",
                "-i", str(infile),
                "-ar", str(SAMPLE_RATE), "-ac", "1", "-f", "s16le", "-"]

    @staticmethod
    def _write_result(output_file, processed_result):
        """Write the result to ``output_file``, or print it when that is ""."""
        if output_file != "":
            logging.info("File {} processing complete".format(output_file))
            with open(output_file, "w", encoding="utf-8") as fh:
                fh.write(processed_result)
        else:
            print(processed_result)

    @staticmethod
    def _log_timing(elapsed, tot_samples):
        """Log elapsed time and the real-time factor.

        ``tot_samples`` is a byte count of 16-bit samples, hence the factor of
        two. When nothing was decoded the ratio is undefined and is omitted.
        """
        if tot_samples == 0:
            logging.info("Execution time: {:.3f} sec; "
                         "no audio data was decoded".format(elapsed))
            return
        logging.info("Execution time: {:.3f} sec; "
                     "xRT {:.3f}".format(elapsed, float(elapsed) * (2 * SAMPLE_RATE) / tot_samples))

    def resample_ffmpeg(self, infile):
        """Start ffmpeg for ``infile``; return the ``Popen`` with piped stdout."""
        return subprocess.Popen(self._ffmpeg_command(infile), stdout=subprocess.PIPE)

    async def resample_ffmpeg_async(self, infile):
        """Start ffmpeg for ``infile`` as an asyncio subprocess with piped stdout."""
        return await asyncio.create_subprocess_exec(
            *self._ffmpeg_command(infile), stdout=subprocess.PIPE)

    async def server_worker(self):
        """Process queued ``(input, output)`` tasks through the server until empty."""
        from queue import Empty

        while True:
            try:
                input_file, output_file = self.queue.get_nowait()
            except Empty:
                break
            logging.info("Recognizing {}".format(input_file))
            start_time = timer()
            try:
                proc = await self.resample_ffmpeg_async(input_file)
            except FileNotFoundError as e:
                print(e, "Missing FFMPEG, please install and try again")
                self.queue.task_done()
                continue
            result, tot_samples = await self.recognize_stream_server(proc)
            processed_result = self.format_result(result)
            self._write_result(output_file, processed_result)
            await proc.wait()
            elapsed = timer() - start_time
            self._log_timing(elapsed, tot_samples)
            self.queue.task_done()

    def pool_worker(self, inputdata):
        """Transcribe one ``(input, output)`` task with the local recognizer."""
        logging.info("Recognizing {}".format(inputdata[0]))
        start_time = timer()
        try:
            stream = self.resample_ffmpeg(inputdata[0])
        except FileNotFoundError as e:
            print(e, "Missing FFMPEG, please install and try again")
            return
        except Exception as e:
            logging.info(e)
            return
        # The context manager closes the pipe and waits for ffmpeg, so the
        # child process is reaped even if recognition raises.
        with stream:
            rec = KaldiRecognizer(self.model, SAMPLE_RATE)
            rec.SetWords(True)
            result, tot_samples = self.recognize_stream(rec, stream)
        processed_result = self.format_result(result)
        self._write_result(inputdata[1], processed_result)
        elapsed = timer() - start_time
        self._log_timing(elapsed, tot_samples)

    async def process_task_list_server(self, task_list):
        """Queue all tasks and run ``args.tasks`` server workers concurrently."""
        for x in task_list:
            self.queue.put(x)
        workers = [asyncio.create_task(self.server_worker())
                   for _ in range(self.args.tasks)]
        await asyncio.gather(*workers)

    def process_task_list_pool(self, task_list):
        """Run ``pool_worker`` over the tasks in a thread pool."""
        with Pool() as pool:
            pool.map(self.pool_worker, task_list)

    def process_task_list(self, task_list):
        """Process tasks locally, or via the server when ``args.server`` is set."""
        if self.args.server is None:
            self.process_task_list_pool(task_list)
        else:
            asyncio.run(self.process_task_list_server(task_list))

View File

@@ -0,0 +1,10 @@
# Auto-generated cffi binding tables; regenerate instead of editing by hand.
import _cffi_backend
ffi = _cffi_backend.FFI('vosk.vosk_cffi',
_version = 0x2601,
_types = b'\x00\x00\x04\x0D\x00\x00\x65\x03\x00\x00\x00\x0F\x00\x00\x1C\x0D\x00\x00\x60\x03\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x0B\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x1F\x0D\x00\x00\x62\x03\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x1F\x0D\x00\x00\x0B\x11\x00\x00\x0D\x01\x00\x00\x64\x03\x00\x00\x00\x0F\x00\x00\x1F\x0D\x00\x00\x0B\x11\x00\x00\x0D\x01\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x01\x0D\x00\x00\x61\x03\x00\x00\x00\x0F\x00\x00\x01\x0D\x00\x00\x63\x03\x00\x00\x00\x0F\x00\x00\x2B\x0D\x00\x00\x1C\x11\x00\x00\x00\x0F\x00\x00\x2B\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x2B\x0D\x00\x00\x1F\x11\x00\x00\x01\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x2B\x0D\x00\x00\x1F\x11\x00\x00\x05\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x2B\x0D\x00\x00\x1F\x11\x00\x00\x66\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x67\x0D\x00\x00\x04\x11\x00\x00\x00\x0F\x00\x00\x67\x0D\x00\x00\x1C\x11\x00\x00\x00\x0F\x00\x00\x67\x0D\x00\x00\x1C\x11\x00\x00\x01\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x67\x0D\x00\x00\x1C\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x67\x0D\x00\x00\x0B\x11\x00\x00\x00\x0F\x00\x00\x67\x0D\x00\x00\x1F\x11\x00\x00\x00\x0F\x00\x00\x67\x0D\x00\x00\x1F\x11\x00\x00\x11\x11\x00\x00\x00\x0F\x00\x00\x67\x0D\x00\x00\x1F\x11\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x67\x0D\x00\x00\x1F\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x67\x0D\x00\x00\x11\x11\x00\x00\x00\x0F\x00\x00\x67\x0D\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x67\x0D\x00\x00\x00\x0F\x00\x00\x00\x09\x00\x00\x01\x09\x00\x00\x02\x09\x00\x00\x03\x09\x00\x00\x04\x09\x00\x00\x02\x01\x00\x00\x05\x01\x00\x00\x00\x01',
_globals = (b'\x00\x00\x37\x23vosk_batch_model_free',0,b'\x00\x00\x00\x23vosk_batch_model_new',0,b'\x00\x00\x37\x23vosk_batch_model_wait',0,b'\x00\x00\x3D\x23vosk_batch_recognizer_accept_waveform',0,b'\x00\x00\x3A\x23vosk_batch_recognizer_finish_stream',0,b'\x00\x00\x3A\x23vosk_batch_recognizer_free',0,b'\x00\x00\x1B\x23vosk_batch_recognizer_front_result',0,b'\x00\x00\x21\x23vosk_batch_recognizer_get_pending_chunks',0,b'\x00\x00\x03\x23vosk_batch_recognizer_new',0,b'\x00\x00\x3A\x23vosk_batch_recognizer_pop',0,b'\x00\x00\x42\x23vosk_batch_recognizer_set_nlsml',0,b'\x00\x00\x5E\x23vosk_gpu_init',0,b'\x00\x00\x5E\x23vosk_gpu_thread_init',0,b'\x00\x00\x24\x23vosk_model_find_word',0,b'\x00\x00\x46\x23vosk_model_free',0,b'\x00\x00\x07\x23vosk_model_new',0,b'\x00\x00\x28\x23vosk_recognizer_accept_waveform',0,b'\x00\x00\x2D\x23vosk_recognizer_accept_waveform_f',0,b'\x00\x00\x32\x23vosk_recognizer_accept_waveform_s',0,b'\x00\x00\x1E\x23vosk_recognizer_final_result',0,b'\x00\x00\x49\x23vosk_recognizer_free',0,b'\x00\x00\x0A\x23vosk_recognizer_new',0,b'\x00\x00\x13\x23vosk_recognizer_new_grm',0,b'\x00\x00\x0E\x23vosk_recognizer_new_spk',0,b'\x00\x00\x1E\x23vosk_recognizer_partial_result',0,b'\x00\x00\x49\x23vosk_recognizer_reset',0,b'\x00\x00\x1E\x23vosk_recognizer_result',0,b'\x00\x00\x50\x23vosk_recognizer_set_grm',0,b'\x00\x00\x54\x23vosk_recognizer_set_max_alternatives',0,b'\x00\x00\x54\x23vosk_recognizer_set_nlsml',0,b'\x00\x00\x54\x23vosk_recognizer_set_partial_words',0,b'\x00\x00\x4C\x23vosk_recognizer_set_spk_model',0,b'\x00\x00\x54\x23vosk_recognizer_set_words',0,b'\x00\x00\x5B\x23vosk_set_log_level',0,b'\x00\x00\x58\x23vosk_spk_model_free',0,b'\x00\x00\x18\x23vosk_spk_model_new',0),
_struct_unions = ((b'\x00\x00\x00\x60\x00\x00\x00\x10VoskBatchModel',),(b'\x00\x00\x00\x61\x00\x00\x00\x10VoskBatchRecognizer',),(b'\x00\x00\x00\x62\x00\x00\x00\x10VoskModel',),(b'\x00\x00\x00\x63\x00\x00\x00\x10VoskRecognizer',),(b'\x00\x00\x00\x64\x00\x00\x00\x10VoskSpkModel',)),
_typenames = (b'\x00\x00\x00\x60VoskBatchModel',b'\x00\x00\x00\x61VoskBatchRecognizer',b'\x00\x00\x00\x62VoskModel',b'\x00\x00\x00\x63VoskRecognizer',b'\x00\x00\x00\x64VoskSpkModel'),
)