diff --git a/CHANGELOG.md b/CHANGELOG.md
index b2e643f..46c554a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.0.6] - 2025-05-20
+
+### Added
+
+- Support end-of-utterance messages (DEL-24982)
+
 ## [3.0.5] - 2025-05-15
 
 - cli: fix some config options not being set when defined in a config file: `topic_detection_config` and `speaker_diarization_config`
diff --git a/VERSION b/VERSION
index eca690e..818bd47 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.0.5
+3.0.6
diff --git a/speechmatics/cli.py b/speechmatics/cli.py
index 8178b37..400bda8 100755
--- a/speechmatics/cli.py
+++ b/speechmatics/cli.py
@@ -281,6 +281,13 @@ def get_transcription_config(
     ]:
         config[option] = True if args.get(option) else config.get(option)
 
+    if args.get("end_of_utterance_silence_trigger") is not None:
+        config["conversation_config"] = {
+            "end_of_utterance_silence_trigger": args.get(
+                "end_of_utterance_silence_trigger"
+            )
+        }
+
     if args.get("volume_threshold") is not None:
         config["audio_filtering_config"] = {
             "volume_threshold": args.get("volume_threshold")
@@ -556,6 +563,13 @@ def audio_event_handler(message):
         sys.stdout.write(f"{escape_seq}[{event_name}]\n")
         transcripts.text += f"[{event_name}] "
 
+    def end_of_utterance_handler(message):
+        if print_json:
+            print(json.dumps(message))
+            return
+        sys.stdout.write("[EndOfUtterance]\n")
+        transcripts.text += "[EndOfUtterance]"
+
     def partial_translation_handler(message):
         if print_json:
             print(json.dumps(message))
@@ -590,6 +604,8 @@ def end_of_transcript_handler(_):
     # print transcription (if text was requested without translation)
 
     api.add_event_handler(ServerMessageType.AudioEventStarted, audio_event_handler)
+    api.add_event_handler(ServerMessageType.EndOfUtterance, end_of_utterance_handler)
+
     if print_json:
         if enable_partials or enable_translation_partials:
             api.add_event_handler(
diff --git a/speechmatics/cli_parser.py b/speechmatics/cli_parser.py
index 9398899..79af284 100644
--- a/speechmatics/cli_parser.py
+++ b/speechmatics/cli_parser.py
@@ -218,6 +218,15 @@ def get_arg_parser():
         default=None,
         help=("Filter out quiet audio which falls below this threshold (0.0-100.0)"),
     )
+    config_parser.add_argument(
+        "--end-of-utterance-silence-trigger",
+        dest="end_of_utterance_silence_trigger",
+        type=float,
+        default=None,
+        help=(
+            "Generate an EndOfUtterance message from the server after this many seconds of silence (0.0-2.0)"
+        ),
+    )
     config_parser.add_argument(
         "--remove-disfluencies",
         default=None,
diff --git a/speechmatics/models.py b/speechmatics/models.py
index 9957918..1b3a7fd 100644
--- a/speechmatics/models.py
+++ b/speechmatics/models.py
@@ -166,6 +166,14 @@ def asdict(self):
         return asdict(self)
 
 
+@dataclass
+class ConversationConfig:
+    """Conversation config."""
+
+    end_of_utterance_silence_trigger: Optional[float] = None
+    """How much silence in seconds is required to trigger end of utterance detection."""
+
+
 @dataclass
 class RTTranslationConfig(TranslationConfig):
     """Real-time mode: Translation config."""
@@ -287,6 +295,9 @@ class TranscriptionConfig(_TranscriptionConfig):
     """Indicates if partial translation, where words are produced
     immediately, is enabled."""
 
+    conversation_config: Optional[ConversationConfig] = None
+    """Optional configuration for end-of-utterance detection."""
+
     translation_config: Optional[TranslationConfig] = None
     """Optional configuration for translation."""
 
@@ -550,6 +561,9 @@ class ServerMessageType(str, Enum):
     AddTranscript = "AddTranscript"
     """Indicates the final transcript of a part of the audio."""
 
+    EndOfUtterance = "EndOfUtterance"
+    """Indicates that an utterance has ended, based on silence"""
+
     AudioEventStarted = "AudioEventStarted"
     """Indicates the start of an audio event."""
 
diff --git a/tests/mock_rt_server.py b/tests/mock_rt_server.py
index 387c452..30f3c24 100644
--- a/tests/mock_rt_server.py
+++ b/tests/mock_rt_server.py
@@ -120,6 +120,15 @@ def dummy_add_partial_transcript():
     }
 
 
+def dummy_end_of_utterance():
+    """Returns a dummy EndOfUtterance message."""
+    return {
+        "message": "EndOfUtterance",
+        "format": "2.1",
+        "metadata": {"start_time": 3.0, "end_time": 3.0},
+    }
+
+
 def dummy_add_transcript():
     """Returns a dummy AddTranscript message."""
     return {
@@ -194,9 +203,10 @@ def get_responses(message, is_binary=False):
         )
         mock_server_handler.next_audio_seq_no += 1
 
-        # Answer immediately with a partial and a final.
+        # Answer immediately with a partial and a final and an end of utterance.
         responses.append(dummy_add_partial_transcript())
         responses.append(dummy_add_transcript())
+        responses.append(dummy_end_of_utterance())
     else:
         msg_name = message.get("message")
         if not msg_name:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 7a19d59..4b2037a 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -182,6 +182,10 @@
             {"enable_translation_partials": True},
         ),
         (["rt", "transcribe", "--enable-entities"], {"enable_entities": True}),
+        (
+            ["rt", "transcribe", "--end-of-utterance-silence-trigger=1.8"],
+            {"end_of_utterance_silence_trigger": 1.8},
+        ),
         (["batch", "transcribe", "--enable-entities"], {"enable_entities": True}),
         (
             ["batch", "transcribe", "--diarization=speaker"],
diff --git a/tests/test_models.py b/tests/test_models.py
index 62703cb..19d3af4 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -220,3 +220,17 @@ def test_notification_config(params, want):
 def test_audio_events_config_config(params, want):
     audio_events_config = models.AudioEventsConfig(**params)
     assert audio_events_config.asdict() == want
+
+
+@mark.parametrize(
+    "params, want",
+    [
+        param(
+            {"end_of_utterance_silence_trigger": 1.8},
+            {"end_of_utterance_silence_trigger": 1.8},
+        ),
+    ],
+)
+def test_conversation_config(params, want):
+    conversation_config = models.ConversationConfig(**params)
+    assert asdict(conversation_config) == want