From f83743dd95958374931b2707b3353439f2300d1b Mon Sep 17 00:00:00 2001 From: James Gilmore Date: Fri, 28 Mar 2025 09:29:26 +0000 Subject: [PATCH 1/8] first commit, deal with end-of-utterance markers in responses --- speechmatics/cli.py | 8 ++++++++ speechmatics/models.py | 3 +++ 2 files changed, 11 insertions(+) diff --git a/speechmatics/cli.py b/speechmatics/cli.py index 6aff1fd..6fcd700 100755 --- a/speechmatics/cli.py +++ b/speechmatics/cli.py @@ -547,6 +547,13 @@ def audio_event_handler(message): sys.stdout.write(f"{escape_seq}[{event_name}]\n") transcripts.text += f"[{event_name}] " + def end_of_utterance_handler(message): + if print_json: + print(json.dumps(message)) + return + sys.stdout.write("[EndOfUtterance]\n") + transcripts.text += "[EndOfUtterance]" + def partial_translation_handler(message): if print_json: print(json.dumps(message)) @@ -594,6 +601,7 @@ def end_of_transcript_handler(_): partial_transcript_handler, ) api.add_event_handler(ServerMessageType.AddTranscript, transcript_handler) + api.add_event_handler(ServerMessageType.EndOfUtterance, end_of_utterance_handler) else: if translation_config is not None: if enable_partials or enable_translation_partials: diff --git a/speechmatics/models.py b/speechmatics/models.py index 0becc9f..156238f 100644 --- a/speechmatics/models.py +++ b/speechmatics/models.py @@ -531,6 +531,9 @@ class ServerMessageType(str, Enum): AddTranscript = "AddTranscript" """Indicates the final transcript of a part of the audio.""" + EndOfUtterance = "EndOfUtterance" + """Indicates that an utterance has ended, based on silence""" + AudioEventStarted = "AudioEventStarted" """Indicates the start of an audio event.""" From f5f8353bcdd31316eac5538ac640dba2095c7266 Mon Sep 17 00:00:00 2001 From: James Gilmore Date: Thu, 15 May 2025 16:56:58 +0100 Subject: [PATCH 2/8] Print to stdout even in non-json mode --- speechmatics/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/speechmatics/cli.py b/speechmatics/cli.py index 6fcd700..6e3b72a 100755 --- a/speechmatics/cli.py +++ b/speechmatics/cli.py @@ -588,6 +588,8 @@ def end_of_transcript_handler(_): # print transcription (if text was requested without translation) api.add_event_handler(ServerMessageType.AudioEventStarted, audio_event_handler) + api.add_event_handler(ServerMessageType.EndOfUtterance, end_of_utterance_handler) + if print_json: if enable_partials or enable_translation_partials: api.add_event_handler( @@ -601,7 +603,6 @@ def end_of_transcript_handler(_): partial_transcript_handler, ) api.add_event_handler(ServerMessageType.AddTranscript, transcript_handler) - api.add_event_handler(ServerMessageType.EndOfUtterance, end_of_utterance_handler) else: if translation_config is not None: if enable_partials or enable_translation_partials: From 704c5ded6bd9f23e35745858151e120c011530c8 Mon Sep 17 00:00:00 2001 From: James Gilmore Date: Fri, 16 May 2025 13:12:37 +0100 Subject: [PATCH 3/8] Add to arguments --- speechmatics/cli_parser.py | 7 +++++++ speechmatics/models.py | 9 +++++++++ 2 files changed, 16 insertions(+) diff --git a/speechmatics/cli_parser.py b/speechmatics/cli_parser.py index e0e5d4b..b1f74fd 100644 --- a/speechmatics/cli_parser.py +++ b/speechmatics/cli_parser.py @@ -218,6 +218,13 @@ def get_arg_parser(): default=None, help=("Filter out quiet audio which falls below this threshold (0.0-100.0)"), ) + config_parser.add_argument( + "--end-of-utterance-silence-trigger", + dest="end_of_utterance_silence_trigger", + type=float, + default=None, + help=("Generate an EndOfUtterance message from the server after this many seconds of silence."), + ) config_parser.add_argument( "--remove-disfluencies", default=None, diff --git a/speechmatics/models.py b/speechmatics/models.py index 156238f..f3e6bba 100644 --- a/speechmatics/models.py +++ b/speechmatics/models.py @@ -153,7 +153,13 @@ class TranslationConfig: def asdict(self): return asdict(self) +@dataclass +class ConversationConfig: + """Conversation config.""" + end_of_utterance_silence_trigger: Optional[float] = None + """How much silence in seconds is required to trigger end of utterance detection.""" + @dataclass class RTTranslationConfig(TranslationConfig): """Real-time mode: Translation config.""" @@ -268,6 +274,9 @@ class TranscriptionConfig(_TranscriptionConfig): """Indicates if partial translation, where words are produced immediately, is enabled.""" + conversation_config: Optional[ConversationConfig] = None + """Optional configuration for end-of-utterance detection.""" + translation_config: Optional[TranslationConfig] = None """Optional configuration for translation.""" From 90e6a45682b1c1777667f034af3bb4d07735b361 Mon Sep 17 00:00:00 2001 From: James Gilmore Date: Fri, 16 May 2025 13:17:14 +0100 Subject: [PATCH 4/8] Pass argument through --- speechmatics/cli.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/speechmatics/cli.py b/speechmatics/cli.py index 6e3b72a..8e1bb8b 100755 --- a/speechmatics/cli.py +++ b/speechmatics/cli.py @@ -284,6 +284,11 @@ def get_transcription_config( ]: config[option] = True if args.get(option) else config.get(option) + if args.get("end_of_utterance_silence_trigger") is not None: + config["conversation_config"] = { + "end_of_utterance_silence_trigger": args.get("end_of_utterance_silence_trigger") + } + if args.get("volume_threshold") is not None: config["audio_filtering_config"] = { "volume_threshold": args.get("volume_threshold") From 342c75fb58760e2e41fcc830aadabe465fe4dea8 Mon Sep 17 00:00:00 2001 From: James Gilmore Date: Mon, 19 May 2025 16:10:15 +0100 Subject: [PATCH 5/8] doc limits --- speechmatics/cli_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/speechmatics/cli_parser.py b/speechmatics/cli_parser.py index b1f74fd..c8bac10 100644 --- a/speechmatics/cli_parser.py +++ b/speechmatics/cli_parser.py @@ -223,7 +223,8 @@ def get_arg_parser(): dest="end_of_utterance_silence_trigger", type=float, default=None, - help=("Generate an EndOfUtterance message from the server after this many seconds of silence."), + help=("Generate an EndOfUtterance message from the server after this many seconds of silence (0.0-2.0)"), + ) config_parser.add_argument( "--remove-disfluencies", From 6529bf9f5baf02b0a428a3f216193be31f5f6506 Mon Sep 17 00:00:00 2001 From: James Gilmore Date: Mon, 19 May 2025 16:11:48 +0100 Subject: [PATCH 6/8] changelog and version --- CHANGELOG.md | 6 ++++++ VERSION | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c3a5ff8..383374f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [3.0.4] - 2025-05-19 + +### Added + +- Support end-of-utterance messages (DEL-24982) + ## [3.0.3] - 2025-03-03 ### Added diff --git a/VERSION b/VERSION index 75a22a2..b0f2dcb 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.3 +3.0.4 From 76be3cc21470c032d675c854dc6efa865b3b9f5f Mon Sep 17 00:00:00 2001 From: James Gilmore Date: Tue, 20 May 2025 09:54:01 +0100 Subject: [PATCH 7/8] lint --- speechmatics/models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/speechmatics/models.py b/speechmatics/models.py index f3e6bba..ed5253d 100644 --- a/speechmatics/models.py +++ b/speechmatics/models.py @@ -153,13 +153,15 @@ class TranslationConfig: def asdict(self): return asdict(self) + @dataclass class ConversationConfig: """Conversation config.""" end_of_utterance_silence_trigger: Optional[float] = None """How much silence in seconds is required to trigger end of utterance detection.""" - + + @dataclass class RTTranslationConfig(TranslationConfig): """Real-time mode: Translation config.""" From 087dc125efe2a89ccd7fd313132885601cbb72d3 Mon Sep 17 00:00:00 2001 From: James Gilmore Date: Tue, 20 May 2025 10:17:12 +0100 Subject: [PATCH 8/8] CLI tests --- speechmatics/cli.py | 4 +++- speechmatics/cli_parser.py | 5 +++-- tests/mock_rt_server.py | 12 +++++++++++- tests/test_cli.py | 4 ++++ tests/test_models.py | 14 ++++++++++++++ 5 files changed, 35 insertions(+), 4 deletions(-) diff --git a/speechmatics/cli.py b/speechmatics/cli.py index 8e1bb8b..3338c63 100755 --- a/speechmatics/cli.py +++ b/speechmatics/cli.py @@ -286,7 +286,9 @@ def get_transcription_config( if args.get("end_of_utterance_silence_trigger") is not None: config["conversation_config"] = { - "end_of_utterance_silence_trigger": args.get("end_of_utterance_silence_trigger") + "end_of_utterance_silence_trigger": args.get( + "end_of_utterance_silence_trigger" + ) } if args.get("volume_threshold") is not None: diff --git a/speechmatics/cli_parser.py b/speechmatics/cli_parser.py index c8bac10..3d88dd0 100644 --- a/speechmatics/cli_parser.py +++ b/speechmatics/cli_parser.py @@ -223,8 +223,9 @@ def get_arg_parser(): dest="end_of_utterance_silence_trigger", type=float, default=None, - help=("Generate an EndOfUtterance message from the server after this many seconds of silence (0.0-2.0)"), - + help=( + "Generate an EndOfUtterance message from the server after this many seconds of silence (0.0-2.0)" + ), ) config_parser.add_argument( "--remove-disfluencies", diff --git a/tests/mock_rt_server.py b/tests/mock_rt_server.py index 387c452..30f3c24 100644 --- a/tests/mock_rt_server.py +++ b/tests/mock_rt_server.py @@ -120,6 +120,15 @@ def dummy_add_partial_transcript(): } +def dummy_end_of_utterance(): + """Returns a dummy EndOfUtterance message.""" + return { + "message": "EndOfUtterance", + "format": "2.1", + "metadata": {"start_time": 3.0, "end_time": 3.0}, + } + + def dummy_add_transcript(): """Returns a dummy AddTranscript message.""" return { @@ -194,9 +203,10 @@ def get_responses(message, is_binary=False): ) mock_server_handler.next_audio_seq_no += 1 - # Answer immediately with a partial and a final. + # Answer immediately with a partial and a final and an end of utterance. responses.append(dummy_add_partial_transcript()) responses.append(dummy_add_transcript()) + responses.append(dummy_end_of_utterance()) else: msg_name = message.get("message") if not msg_name: diff --git a/tests/test_cli.py b/tests/test_cli.py index 2113f33..1698ef1 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -183,6 +183,10 @@ {"enable_translation_partials": True}, ), (["rt", "transcribe", "--enable-entities"], {"enable_entities": True}), + ( + ["rt", "transcribe", "--end-of-utterance-silence-trigger=1.8"], + {"end_of_utterance_silence_trigger": 1.8}, + ), (["batch", "transcribe", "--enable-entities"], {"enable_entities": True}), ( ["batch", "transcribe", "--speaker-diarization-sensitivity=0.7"], diff --git a/tests/test_models.py b/tests/test_models.py index 62703cb..19d3af4 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -220,3 +220,17 @@ def test_notification_config(params, want): def test_audio_events_config_config(params, want): audio_events_config = models.AudioEventsConfig(**params) assert audio_events_config.asdict() == want + + +@mark.parametrize( + "params, want", + [ + param( + {"end_of_utterance_silence_trigger": 1.8}, + {"end_of_utterance_silence_trigger": 1.8}, + ), + ], +) +def test_conversation_config(params, want): + conversation_config = models.ConversationConfig(**params) + assert asdict(conversation_config) == want