diff --git a/README.md b/README.md index 1643f2f..7fdcbad 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ Simulation modes: ### Server -- real-time from mic -The entry point `simulstreaming_whisper_server.py` has the same model options as `simulstreaming_whisper.py`, plus `--host` and `--port` of the TCP connection and the `--warmup-file`. The warmup file is decoded by the Whisper backend after the model is loaded because without that, processing of the very the first input chunk may take longer. +The entry point `simulstreaming_whisper_server.py` has the same model options as `simulstreaming_whisper.py`, plus `--host` and `--port` of the TCP connection, `--out-txt`, which switches the output format from JSON to simple text with timestamp information, and the `--warmup-file`. The warmup file is decoded by the Whisper backend after the model is loaded because without that, processing of the very first input chunk may take longer. See the help message (`-h` option). @@ -150,7 +150,7 @@ arecord -f S16_LE -c1 -r 16000 -t raw -D default | nc localhost 43001 ### Output format -This is example of the output format of the simulation from file. The output from the server is the same except that the first space-separated column is not there. +This is an example of the output format of the simulation from file. ``` 1200.0000 0 1200 And so @@ -170,6 +170,7 @@ It's space-separated. The first three columns are: - columns 2-3: the beginning and end timestamp of the line in original audio. (TODO: it should be, currently it is very rough approximation.) - columns 4-: This column starts either with a space, if the previous line had to be appended with a space, or with a character that has to be appended to the previous line (like comma or dot). +The output from the server is in JSON format by default. If the `--out-txt` argument is passed, the output from the server is the same as the output from the simulation from file, except that the first column is not there.
## 📣 Feedback Welcome! diff --git a/whisper_streaming/whisper_server.py b/whisper_streaming/whisper_server.py index 2aed7ee..a3d6cd2 100644 --- a/whisper_streaming/whisper_server.py +++ b/whisper_streaming/whisper_server.py @@ -5,6 +5,7 @@ import argparse import os import logging +import json import numpy as np logger = logging.getLogger(__name__) @@ -52,10 +53,11 @@ def non_blocking_receive_audio(self): # next client should be served by a new instance of this object class ServerProcessor: - def __init__(self, c, online_asr_proc, min_chunk): + def __init__(self, c, online_asr_proc, min_chunk, out_txt: bool): self.connection = c self.online_asr_proc = online_asr_proc self.min_chunk = min_chunk + self.out_txt = out_txt self.is_first = True @@ -88,7 +90,10 @@ def send_result(self, iteration_output): # - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway # - the next words: segment transcript if iteration_output: - message = "%1.0f %1.0f %s" % (iteration_output['start'] * 1000, iteration_output['end'] * 1000, iteration_output['text']) + if self.out_txt: + message = "%1.0f %1.0f %s" % (iteration_output['start'] * 1000, iteration_output['end'] * 1000, iteration_output['text']) + else: + message = json.dumps(iteration_output) print(message, flush=True, file=sys.stderr) self.connection.send(message) else: @@ -127,6 +132,7 @@ def main_server(factory, add_args): parser.add_argument("--warmup-file", type=str, dest="warmup_file", help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. 
" "https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .") + parser.add_argument("--out-txt", action="store_true") # options from whisper_online processor_args(parser) @@ -170,7 +176,7 @@ def main_server(factory, add_args): conn, addr = s.accept() logger.info('Connected to client on {}'.format(addr)) connection = Connection(conn) - proc = ServerProcessor(connection, online, min_chunk) + proc = ServerProcessor(connection, online, min_chunk, args.out_txt) proc.process() conn.close() logger.info('Connection to client closed')