From 414a8ad2c3d62dd27a561e0f0e9137aadc4602bd Mon Sep 17 00:00:00 2001 From: Marko Hlavaty Date: Sun, 19 Oct 2025 21:45:41 +0200 Subject: [PATCH 1/3] Change output format to json --- whisper_streaming/whisper_server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/whisper_streaming/whisper_server.py b/whisper_streaming/whisper_server.py index 2aed7ee..c0894f8 100644 --- a/whisper_streaming/whisper_server.py +++ b/whisper_streaming/whisper_server.py @@ -5,6 +5,7 @@ import argparse import os import logging +import json import numpy as np logger = logging.getLogger(__name__) @@ -88,7 +89,7 @@ def send_result(self, iteration_output): # - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway # - the next words: segment transcript if iteration_output: - message = "%1.0f %1.0f %s" % (iteration_output['start'] * 1000, iteration_output['end'] * 1000, iteration_output['text']) + message = json.dumps(iteration_output) print(message, flush=True, file=sys.stderr) self.connection.send(message) else: From 8df813ebf6d907c5f5e920acba0d33abb291b0c4 Mon Sep 17 00:00:00 2001 From: Marko Hlavaty Date: Sun, 19 Oct 2025 23:35:37 +0200 Subject: [PATCH 2/3] Add optional parameter '--out-txt' to enable simple, non-json outputs --- whisper_streaming/whisper_server.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/whisper_streaming/whisper_server.py b/whisper_streaming/whisper_server.py index c0894f8..a3d6cd2 100644 --- a/whisper_streaming/whisper_server.py +++ b/whisper_streaming/whisper_server.py @@ -53,10 +53,11 @@ def non_blocking_receive_audio(self): # next client should be served by a new instance of this object class ServerProcessor: - def __init__(self, c, online_asr_proc, min_chunk): + def __init__(self, c, online_asr_proc, min_chunk, out_txt: bool): self.connection = c self.online_asr_proc = online_asr_proc self.min_chunk = min_chunk + self.out_txt = 
out_txt self.is_first = True @@ -89,7 +90,10 @@ def send_result(self, iteration_output): # - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway # - the next words: segment transcript if iteration_output: - message = json.dumps(iteration_output) + if self.out_txt: + message = "%1.0f %1.0f %s" % (iteration_output['start'] * 1000, iteration_output['end'] * 1000, iteration_output['text']) + else: + message = json.dumps(iteration_output) print(message, flush=True, file=sys.stderr) self.connection.send(message) else: @@ -128,6 +132,7 @@ def main_server(factory, add_args): parser.add_argument("--warmup-file", type=str, dest="warmup_file", help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. " "https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .") + parser.add_argument("--out-txt", action="store_true") # options from whisper_online processor_args(parser) @@ -171,7 +176,7 @@ def main_server(factory, add_args): conn, addr = s.accept() logger.info('Connected to client on {}'.format(addr)) connection = Connection(conn) - proc = ServerProcessor(connection, online, min_chunk) + proc = ServerProcessor(connection, online, min_chunk, args.out_txt) proc.process() conn.close() logger.info('Connection to client closed') From ee6502956f4b624fc4628c1ade22bdd7ade84543 Mon Sep 17 00:00:00 2001 From: Marko Hlavaty Date: Sun, 19 Oct 2025 23:56:35 +0200 Subject: [PATCH 3/3] Update README.md with information about parameter --out-txt --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1643f2f..7fdcbad 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ Simulation modes: ### Server -- real-time from mic -The entry point `simulstreaming_whisper_server.py` has the same model options as `simulstreaming_whisper.py`, plus `--host` and `--port` of the TCP 
connection and the `--warmup-file`. The warmup file is decoded by the Whisper backend after the model is loaded because without that, processing of the very the first input chunk may take longer. +The entry point `simulstreaming_whisper_server.py` has the same model options as `simulstreaming_whisper.py`, plus `--host` and `--port` of the TCP connection, `--out-txt`, which switches the output format from JSON to simple text with timestamp information, and the `--warmup-file`. The warmup file is decoded by the Whisper backend after the model is loaded because without that, processing of the very first input chunk may take longer. See the help message (`-h` option). @@ -150,7 +150,7 @@ arecord -f S16_LE -c1 -r 16000 -t raw -D default | nc localhost 43001 ### Output format -This is example of the output format of the simulation from file. The output from the server is the same except that the first space-separated column is not there. +This is an example of the output format of the simulation from file. ``` 1200.0000 0 1200 And so @@ -170,6 +170,7 @@ It's space-separated. The first three columns are: - columns 2-3: the beginning and end timestamp of the line in original audio. (TODO: it should be, currently it is very rough approximation.) - columns 4-: This column starts either with a space, if the previous line had to be appended with a space, or with a character that has to be appended to the previous line (like comma or dot). +The output from the server is in JSON format by default. If the `--out-txt` argument is passed, the output from the server is the same as the output from the simulation from file, except that the first column is not there. ## 📣 Feedback Welcome!