From 428c8a5cdfe08db986770a776aa40a1284710c33 Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Tue, 18 Nov 2025 10:03:31 +0000 Subject: [PATCH 01/17] Updated Voice SDK and Pydantic models. --- examples/voice/cli/cli.py | 6 +- sdk/voice/README.md | 20 ---- sdk/voice/speechmatics/voice/_client.py | 6 +- sdk/voice/speechmatics/voice/_models.py | 134 +++++++++++++++-------- sdk/voice/speechmatics/voice/_presets.py | 7 +- tests/voice/test_03_conversation.py | 2 +- tests/voice/test_04_models.py | 67 ++++++++---- tests/voice/test_08_multiple_speakers.py | 4 +- tests/voice/test_09_speaker_id.py | 2 +- tests/voice/test_10_finalize.py | 2 +- 10 files changed, 150 insertions(+), 100 deletions(-) diff --git a/examples/voice/cli/cli.py b/examples/voice/cli/cli.py index 67d7958..1592cfe 100644 --- a/examples/voice/cli/cli.py +++ b/examples/voice/cli/cli.py @@ -141,7 +141,7 @@ async def main() -> None: # Use JSON config if args.config is not None: try: - config = VoiceAgentConfig.model_validate(args.config) + config = VoiceAgentConfig.from_dict(args.config) except Exception as e: print(f"Error validating config: {e}") return @@ -182,7 +182,7 @@ async def main() -> None: # Handle config display if args.show: - print(config.model_dump_json(indent=2, exclude_unset=True, exclude_none=True)) + print(config.to_json(indent=2, exclude_unset=True, exclude_none=True)) return # Set the audio sample rate @@ -550,7 +550,7 @@ def log_message(message: dict[str, Any]) -> None: log_message( { "message": "VoiceAgentClientConfig", - "config": client._config.model_dump(exclude_none=True, exclude_unset=True), + "config": client._config.to_dict(exclude_none=True, exclude_unset=True), } ) diff --git a/sdk/voice/README.md b/sdk/voice/README.md index 9ff5d7d..7213b51 100644 --- a/sdk/voice/README.md +++ b/sdk/voice/README.md @@ -364,7 +364,6 @@ def on_partial(message): "timestamp": "2025-11-11T23:18:37.189+00:00", "language": "en", "text": "Welcome to", - "annotation": ["has_partial"], "metadata": { "start_time": 1.28, "end_time": 1.6 @@ -384,7 +383,6 @@ def on_partial(message): - `speaker_id` - Speaker label (e.g., `"S1"`, `"S2"`) - `is_active` - `true` if speaker is in focus (based on `speaker_config`) - `text` - Current partial transcription text -- `annotation` - Status flags (see annotation section below) - `metadata.start_time` - Segment start time (seconds since session start) - `metadata.end_time` - Segment end time (seconds since session start) @@ -416,13 +414,6 @@ def on_segment(message): "timestamp": "2025-11-11T23:18:37.189+00:00", "language": "en", "text": "Welcome to Speechmatics.", - "annotation": [ - "has_final", - "starts_with_final", - "ends_with_final", - "ends_with_eos", - "ends_with_punctuation" - ], "metadata": { "start_time": 1.28, "end_time": 8.04 @@ -437,17 +428,6 @@ def on_segment(message): } ``` -**Annotation Flags:** - -- `has_final` - Contains finalized words -- `has_partial` - Contains partial (interim) words -- `starts_with_final` - First word is finalized -- `ends_with_final` - Last word is finalized -- `ends_with_eos` - Ends with end-of-sentence -- `ends_with_punctuation` - Ends with punctuation -- `fast_speaker` - Speaker is speaking quickly (may appear in some segments) -- `has_disfluency` - Contains disfluencies like "um", "er" (may appear in some segments) - #### END_OF_TURN Emitted when a speaker's turn is complete. Timing depends on `end_of_utterance_mode`. 
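The recurring change in this patch swaps direct Pydantic calls (`model_validate`, `model_dump`, `model_dump_json`) for `from_dict`/`to_dict`/`from_json`/`to_json` helpers on a shared base model (see the `_models.py` diff below). A minimal sketch of the intended usage follows; the field values are illustrative only:

```python
from speechmatics.voice import VoiceAgentConfig

# Build a config from a plain dict (wraps Pydantic's model_validate)
config = VoiceAgentConfig.from_dict({"language": "en", "max_delay": 1.5})

# Serialise back out; None values are excluded by default
as_dict = config.to_dict()
as_json = config.to_json(indent=2, exclude_unset=True, exclude_none=True)

# Round-trip from JSON (wraps model_validate_json)
restored = VoiceAgentConfig.from_json(as_json)
assert restored.language == config.language
```
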
diff --git a/sdk/voice/speechmatics/voice/_client.py b/sdk/voice/speechmatics/voice/_client.py index 3277090..58626c2 100644 --- a/sdk/voice/speechmatics/voice/_client.py +++ b/sdk/voice/speechmatics/voice/_client.py @@ -182,7 +182,7 @@ def __init__( config=self._config, session_id="NOT_SET", base_time=datetime.datetime.now(datetime.timezone.utc), - language_pack_info=LanguagePackInfo.model_validate({}), + language_pack_info=LanguagePackInfo.from_dict({}), ) # ------------------------------------- @@ -705,7 +705,7 @@ def _evt_on_recognition_started(message: dict[str, Any]) -> None: config=self._config, session_id=message.get("id", "UNKNOWN"), base_time=datetime.datetime.now(datetime.timezone.utc), - language_pack_info=LanguagePackInfo.model_validate(message.get("language_pack_info", {})), + language_pack_info=LanguagePackInfo.from_dict(message.get("language_pack_info", {})), ) # Partial transcript event @@ -746,7 +746,7 @@ def _emit_message(self, message: BaseMessage) -> None: """ # Forward to the emit() method - self.emit(message.message, message.model_dump()) + self.emit(message.message, message.to_dict()) def _emit_info_message(self, message: Union[str, dict[str, Any]]) -> None: """Emit an info message to the client.""" diff --git a/sdk/voice/speechmatics/voice/_models.py b/sdk/voice/speechmatics/voice/_models.py index fa9c1ea..056d18a 100644 --- a/sdk/voice/speechmatics/voice/_models.py +++ b/sdk/voice/speechmatics/voice/_models.py @@ -10,9 +10,10 @@ from typing import Literal from typing import Optional -from pydantic import BaseModel +from pydantic import BaseModel as PydanticBaseModel from pydantic import ConfigDict from pydantic import Field +from typing_extensions import Self from speechmatics.rt import AudioEncoding from speechmatics.rt import OperatingPoint @@ -248,13 +249,43 @@ class AnnotationFlags(str, Enum): # ============================================================================== -class BaseConfigModel(BaseModel): +class BaseModel(PydanticBaseModel): """Base configuration model.""" model_config = ConfigDict(extra="forbid") + @classmethod + def from_dict(cls, data: dict, **kwargs: Any) -> Self: + """Convert a dictionary to a config object.""" + return cls.model_validate(data, **kwargs) # type: ignore[no-any-return] + + def to_dict( + self, exclude_none: bool = True, exclude_defaults: bool = False, exclude_unset: bool = False, **kwargs: Any + ) -> dict[str, Any]: + """Convert the model to a dictionary.""" + return super().model_dump( # type: ignore[no-any-return] + mode="json", + exclude_none=exclude_none, + exclude_defaults=exclude_defaults, + exclude_unset=exclude_unset, + **kwargs, + ) + + @classmethod + def from_json(cls, json_data: str, **kwargs: Any) -> Self: + """Convert a JSON string to a config object.""" + return cls.model_validate_json(json_data, **kwargs) # type: ignore[no-any-return] + + def to_json( + self, exclude_none: bool = True, exclude_defaults: bool = False, exclude_unset: bool = False, **kwargs: Any + ) -> str: + """Convert the model to a JSON string.""" + return self.model_dump_json( # type: ignore[no-any-return] + exclude_none=exclude_none, exclude_defaults=exclude_defaults, exclude_unset=exclude_unset, **kwargs + ) + -class AdditionalVocabEntry(BaseConfigModel): +class AdditionalVocabEntry(BaseModel): """Additional vocabulary entry. 
Parameters: @@ -280,10 +311,10 @@ class AdditionalVocabEntry(BaseConfigModel): """ content: str - sounds_like: list[str] = Field(default_factory=list) + sounds_like: Optional[list[str]] = None -class SpeakerFocusConfig(BaseConfigModel): +class SpeakerFocusConfig(BaseModel): """Speaker Focus Config. List of speakers to focus on, ignore and how to deal with speakers that are not @@ -317,7 +348,7 @@ class SpeakerFocusConfig(BaseConfigModel): focus_mode: SpeakerFocusMode = SpeakerFocusMode.RETAIN -class SpeechSegmentConfig(BaseConfigModel): +class SpeechSegmentConfig(BaseModel): """Configuration on how segments are emitted. Parameters: @@ -339,7 +370,7 @@ class SpeechSegmentConfig(BaseConfigModel): pause_mark: Optional[str] = None -class EndOfTurnPenaltyItem(BaseConfigModel): +class EndOfTurnPenaltyItem(BaseModel): """End of turn penalty item. Parameters: @@ -353,7 +384,7 @@ class EndOfTurnPenaltyItem(BaseConfigModel): is_not: bool = False -class EndOfTurnConfig(BaseConfigModel): +class EndOfTurnConfig(BaseModel): """Configuration for end of turn. Parameters: @@ -386,7 +417,7 @@ class EndOfTurnConfig(BaseConfigModel): ) -class SmartTurnConfig(BaseConfigModel): +class SmartTurnConfig(BaseModel): """Smart turn configuration for the Speechmatics Voice Agent. This configuration is used to determine when a turn has completed. It is used to @@ -421,7 +452,7 @@ class SmartTurnConfig(BaseConfigModel): negative_penalty: float = 1.7 -class VoiceAgentConfig(BaseConfigModel): +class VoiceAgentConfig(BaseModel): """Voice Agent configuration. A framework-independent configuration object for the Speechmatics Voice Agent. This uses @@ -635,19 +666,6 @@ class VoiceAgentConfig(BaseConfigModel): sample_rate: int = 16000 audio_encoding: AudioEncoding = AudioEncoding.PCM_S16LE - # Parse JSON - @classmethod - def from_json(cls, json_data: str) -> VoiceAgentConfig: - """Convert a JSON string to a VoiceAgentConfig object.""" - cfg: VoiceAgentConfig = cls.model_validate_json(json_data) - return cfg - - # To JSON - def to_json(self) -> str: - """Convert the model to a JSON string.""" - config_str: str = self.model_dump_json(exclude_none=True, exclude_defaults=True, exclude_unset=True) - return config_str - # ============================================================================== # SESSION & INFO MODELS @@ -852,12 +870,28 @@ def end_time(self) -> float: """Return the end time of the segment.""" return self.fragments[-1].end_time if self.fragments else 0.0 - def model_dump(self, include_results: bool = False, **kwargs: Any) -> dict[str, Any]: + def to_dict( + self, + exclude_none: bool = True, + exclude_defaults: bool = False, + exclude_unset: bool = False, + include_results: bool = False, + **kwargs: Any, + ) -> dict[str, Any]: """Override model_dump to control fragments/results inclusion.""" # Always exclude fragments from the base dump - kwargs["exclude"] = {"fragments"} - data: dict[str, Any] = super().model_dump(**kwargs) + exclude = kwargs.get("exclude", set()) + if isinstance(exclude, set): + exclude.add("fragments") + else: + exclude = {"fragments"} + kwargs["exclude"] = exclude + + # Get the base dump + data: dict[str, Any] = super().model_dump( + exclude_none=exclude_none, exclude_defaults=exclude_defaults, exclude_unset=exclude_unset, **kwargs + ) # Add timing information data["start_time"] = self.start_time @@ -904,7 +938,16 @@ def __init__( annotate_segments=annotate_segments, ) - super().__init__(session=session, fragments=fragments, segments=segments, focus_speakers=focus_speakers, **data) + # 
Initialize with the computed values + data.update( + { + "session": session, + "fragments": fragments, + "segments": segments, + "focus_speakers": focus_speakers, + } + ) + super().__init__(**data) @property def start_time(self) -> float: @@ -998,22 +1041,18 @@ def trim(self, start_time: float, end_time: float, annotate_segments: bool = Tru # ============================================================================== -class BaseMessageModel(BaseModel): +class BaseMessage(BaseModel): """Base model for all messages.""" - def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]: - """Default to excluding None values.""" - return super().model_dump(*args, **kwargs, exclude_none=True, mode="json") # type: ignore[no-any-return] - - def model_dump_json(self, *args: Any, **kwargs: Any) -> str: - """Default to excluding None values.""" - return super().model_dump_json(*args, **kwargs, exclude_none=True) # type: ignore[no-any-return] - + message: str -class BaseMessage(BaseMessageModel): - """Base model for all messages.""" + @classmethod + def from_message(cls, data: dict, **kwargs: Any) -> Self: + """Convert a message dictionary to a message object. - message: AgentServerMessageType + Alias for from_dict() for semantic clarity when working with messages. + """ + return cls.from_dict(data, **kwargs) class ErrorMessage(BaseMessage): @@ -1065,7 +1104,7 @@ class VADStatusMessage(BaseMessage): time: Optional[float] = None -class MessageTimeMetadata(BaseMessageModel): +class MessageTimeMetadata(BaseModel): """Metadata for segment messages. Parameters: @@ -1097,16 +1136,16 @@ class TurnStartEndResetMessage(BaseMessage): metadata: MessageTimeMetadata -class TurnPredictionMetadata(BaseMessageModel): +class TurnPredictionMetadata(BaseModel): """Metadata for turn prediction messages. Parameters: ttl: The time to live of the prediction in seconds. - reasons: The reasons for the prediction. """ ttl: float - reasons: list[str] + + model_config = ConfigDict(extra="ignore") class TurnPredictionMessage(BaseMessage): @@ -1128,7 +1167,7 @@ class SpeakerMetricsMessage(BaseMessage): speakers: list[SessionSpeaker] -class SegmentMessageSegmentFragment(BaseMessageModel): +class SegmentMessageSegmentFragment(BaseModel): """Speech fragment for segment messages. Parameters: @@ -1148,11 +1187,12 @@ class SegmentMessageSegmentFragment(BaseMessageModel): type: str = Field(default="word", alias="type_") content: str = "" attaches_to: str = "" + is_eos: bool = False model_config = ConfigDict(extra="ignore") -class SegmentMessageSegment(BaseMessageModel): +class SegmentMessageSegment(BaseModel): """Partial or final segment. Parameters: @@ -1162,7 +1202,6 @@ class SegmentMessageSegment(BaseMessageModel): language: The language of the frame. text: The text of the segment. fragments: The fragments associated with the segment. - annotation: The annotation associated with the segment. metadata: The metadata associated with the segment. 
""" @@ -1172,9 +1211,10 @@ class SegmentMessageSegment(BaseMessageModel): language: Optional[str] = None text: Optional[str] = None fragments: Optional[list[SegmentMessageSegmentFragment]] = None - annotation: list[AnnotationFlags] = Field(default_factory=list) metadata: MessageTimeMetadata + model_config = ConfigDict(extra="ignore") + class SegmentMessage(BaseMessage): """Emitted when a segment is added to the session.""" diff --git a/sdk/voice/speechmatics/voice/_presets.py b/sdk/voice/speechmatics/voice/_presets.py index 88703c9..c5c608c 100644 --- a/sdk/voice/speechmatics/voice/_presets.py +++ b/sdk/voice/speechmatics/voice/_presets.py @@ -138,6 +138,7 @@ def EXTERNAL(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # end_of_utterance_silence_trigger=1.2, end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL, speech_segment_config=SpeechSegmentConfig(emit_sentences=True), + use_forced_eou_message=True, ), overlay, ) @@ -161,7 +162,7 @@ def load(preset: str, overlay_json: Optional[str] = None) -> VoiceAgentConfig: try: config: VoiceAgentConfig = getattr(VoiceAgentConfigPreset, preset.upper())() if overlay_json is not None: - overlay = VoiceAgentConfig.model_validate_json(overlay_json) + overlay = VoiceAgentConfig.from_json(overlay_json) config = VoiceAgentConfigPreset._merge_configs(config, overlay) return config except ValueError: @@ -189,9 +190,9 @@ def _merge_configs(base: VoiceAgentConfig, overlay: Optional[VoiceAgentConfig]) if overlay is None: return base - # Merge overlay into base - use model_validate to properly reconstruct nested models + # Merge overlay into base merged_dict = { **base.model_dump(exclude_unset=True, exclude_none=True), **overlay.model_dump(exclude_unset=True, exclude_none=True), } - return VoiceAgentConfig.model_validate(merged_dict) # type: ignore[no-any-return] + return VoiceAgentConfig.from_dict(merged_dict) diff --git a/tests/voice/test_03_conversation.py b/tests/voice/test_03_conversation.py index aa2398b..6adc6ad 100644 --- a/tests/voice/test_03_conversation.py +++ b/tests/voice/test_03_conversation.py @@ -90,7 +90,7 @@ def log_message(message): print() print("---") log_message({"message": "AudioFile", "path": audio_file}) - log_message({"message": "VoiceAgentConfig", **client._config.model_dump()}) + log_message({"message": "VoiceAgentConfig", **client._config.to_dict()}) log_message({"message": "TranscriptionConfig", **client._transcription_config.to_dict()}) log_message({"message": "AudioFormat", **client._audio_format.to_dict()}) diff --git a/tests/voice/test_04_models.py b/tests/voice/test_04_models.py index 6e3af3d..04c698e 100644 --- a/tests/voice/test_04_models.py +++ b/tests/voice/test_04_models.py @@ -4,9 +4,11 @@ from speechmatics.voice import VoiceAgentConfig from speechmatics.voice._models import AdditionalVocabEntry +from speechmatics.voice._models import AgentServerMessageType from speechmatics.voice._models import AnnotationFlags from speechmatics.voice._models import AnnotationResult from speechmatics.voice._models import OperatingPoint +from speechmatics.voice._models import SessionMetricsMessage from speechmatics.voice._models import SpeakerFocusConfig from speechmatics.voice._models import SpeakerFocusMode from speechmatics.voice._models import SpeakerIdentifier @@ -28,7 +30,7 @@ async def test_voice_agent_config(): ) # Test JSON serialisation - config_dict = config.model_dump() + config_dict = config.to_dict() assert config_dict["language"] == "en" assert config_dict["max_delay"] == 1.5 assert 
config_dict["enable_diarization"] is True @@ -121,19 +123,19 @@ async def test_additional_vocab_entry(): entry = AdditionalVocabEntry(content="hello", sounds_like=["helo", "hallo"]) # Test JSON serialisation - json_data = entry.model_dump() - assert json_data["content"] == "hello" - assert json_data["sounds_like"] == ["helo", "hallo"] + json_dict = entry.to_dict() + assert json_dict["content"] == "hello" + assert json_dict["sounds_like"] == ["helo", "hallo"] # Test JSON deserialisation - entry_from_json = AdditionalVocabEntry.model_validate(json_data) + entry_from_json = AdditionalVocabEntry.from_dict(json_dict) assert entry_from_json.content == entry.content assert entry_from_json.sounds_like == entry.sounds_like # Test with defaults entry_minimal = AdditionalVocabEntry(content="test") - json_minimal = entry_minimal.model_dump() - assert json_minimal["sounds_like"] == [] + json_minimal = entry_minimal.to_dict() + assert "sounds_like" not in json_minimal @pytest.mark.asyncio @@ -153,23 +155,21 @@ async def test_speaker_focus_config(): ) # Test JSON serialisation - json_data = config.model_dump() - assert json_data["focus_speakers"] == ["S1", "S2"] - assert json_data["ignore_speakers"] == ["__ASSISTANT__", "__SYSTEM__"] - assert json_data["focus_mode"] == SpeakerFocusMode.IGNORE + json_dict = config.to_dict() + assert json_dict["focus_speakers"] == ["S1", "S2"] + assert json_dict["ignore_speakers"] == ["__ASSISTANT__", "__SYSTEM__"] + assert json_dict["focus_mode"] == SpeakerFocusMode.IGNORE # Test JSON deserialisation - config_from_json = SpeakerFocusConfig.model_validate(json_data) + config_from_json = SpeakerFocusConfig.from_dict(json_dict) assert config_from_json.focus_speakers == config.focus_speakers assert config_from_json.ignore_speakers == config.ignore_speakers assert config_from_json.focus_mode == config.focus_mode # Test with defaults config_default = SpeakerFocusConfig() - json_default = config_default.model_dump() - assert json_default["focus_speakers"] == [] - assert json_default["ignore_speakers"] == [] - assert json_default["focus_mode"] == SpeakerFocusMode.RETAIN + json_default = config_default.to_json(exclude_none=False) + assert json_default == '{"focus_speakers":[],"ignore_speakers":[],"focus_mode":"retain"}' @pytest.mark.asyncio @@ -198,7 +198,7 @@ async def test_speech_fragment(): ) # Test JSON serialisation - json_data = fragment.model_dump() + json_data = fragment.to_dict() assert json_data["idx"] == 1 assert json_data["start_time"] == 0.5 assert json_data["end_time"] == 1.2 @@ -237,7 +237,7 @@ async def test_speaker_segment(): ) # Test model_dump() default behavior (should exclude fragments by default) - json_data = segment.model_dump() + json_data = segment.to_dict() assert json_data["speaker_id"] == "S1" assert json_data["is_active"] is True assert json_data["timestamp"] == "2025-01-01T12:00:00.500" @@ -247,9 +247,38 @@ async def test_speaker_segment(): assert isinstance(json_data["annotation"], list) # Test model_dump with include_results=True - dict_data_results = segment.model_dump(include_results=True) + dict_data_results = segment.to_dict(include_results=True) assert dict_data_results["speaker_id"] == "S1" assert dict_data_results["text"] == "Hello world" assert "results" in dict_data_results assert "fragments" not in dict_data_results assert len(dict_data_results["results"]) == 2 + + +@pytest.mark.asyncio +async def test_event_messages(): + """Test event messages.""" + + # Create a new event message + event_message = SessionMetricsMessage( + total_time=1.0, 
+ total_time_str="00:00:01", + total_bytes=1024, + processing_time=0.5, + ) + + # Test dict + dict_data = event_message.to_dict() + assert dict_data["message"] == AgentServerMessageType.SESSION_METRICS + assert dict_data["message"] == "SessionMetrics" + assert dict_data["total_time"] == 1.0 + assert dict_data["total_time_str"] == "00:00:01" + assert dict_data["total_bytes"] == 1024 + assert dict_data["processing_time"] == 0.5 + + # Test JSON + json_data = event_message.to_json() + assert ( + json_data + == '{"message":"SessionMetrics","total_time":1.0,"total_time_str":"00:00:01","total_bytes":1024,"processing_time":0.5}' + ) diff --git a/tests/voice/test_08_multiple_speakers.py b/tests/voice/test_08_multiple_speakers.py index adbebd5..553d8ed 100644 --- a/tests/voice/test_08_multiple_speakers.py +++ b/tests/voice/test_08_multiple_speakers.py @@ -167,8 +167,8 @@ def log_final_segment(message): print() print() print("---") - log_message({"message": "Sample", **sample.model_dump()}) - log_message({"message": "VoiceAgentConfig", **client._config.model_dump()}) + log_message({"message": "Sample", **sample.to_dict()}) + log_message({"message": "VoiceAgentConfig", **client._config.to_dict()}) log_message({"message": "TranscriptionConfig", **client._transcription_config.to_dict()}) log_message({"message": "AudioFormat", **client._audio_format.to_dict()}) diff --git a/tests/voice/test_09_speaker_id.py b/tests/voice/test_09_speaker_id.py index 9592984..71438aa 100644 --- a/tests/voice/test_09_speaker_id.py +++ b/tests/voice/test_09_speaker_id.py @@ -115,7 +115,7 @@ def save_speakers_result(message): print() print() print("---") - log_message({"message": "VoiceAgentConfig", **client._config.model_dump()}) + log_message({"message": "VoiceAgentConfig", **client._config.to_dict()}) log_message({"message": "TranscriptionConfig", **client._transcription_config.to_dict()}) log_message({"message": "AudioFormat", **client._audio_format.to_dict()}) diff --git a/tests/voice/test_10_finalize.py b/tests/voice/test_10_finalize.py index 247d46f..bc7df46 100644 --- a/tests/voice/test_10_finalize.py +++ b/tests/voice/test_10_finalize.py @@ -93,7 +93,7 @@ def eot_received_callback(message): print() print() print("---") - log_message({"message": "VoiceAgentConfig", **client._config.model_dump()}) + log_message({"message": "VoiceAgentConfig", **client._config.to_dict()}) log_message({"message": "TranscriptionConfig", **client._transcription_config.to_dict()}) log_message({"message": "AudioFormat", **client._audio_format.to_dict()}) From 15c2861f18b2e933022926c27a679da6192b331e Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Tue, 18 Nov 2025 10:04:56 +0000 Subject: [PATCH 02/17] Updated CLI example. 
--- examples/voice/cli/cli.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/voice/cli/cli.py b/examples/voice/cli/cli.py index 1592cfe..ade068a 100644 --- a/examples/voice/cli/cli.py +++ b/examples/voice/cli/cli.py @@ -482,10 +482,7 @@ def console_print(ts: datetime.datetime, message: dict) -> None: _segs = [] for segment in message["segments"]: suffix = "" if segment["is_active"] else " (background)" - if args.verbose >= 3: - _segs.append(f"@{segment['speaker_id']}{suffix}: `{segment['text']}` {segment['annotation']}") - else: - _segs.append(f"@{segment['speaker_id']}{suffix}: `{segment['text']}`") + _segs.append(f"@{segment['speaker_id']}{suffix}: `{segment['text']}`") payload = {"segments": _segs} # Print to console @@ -528,12 +525,12 @@ def log_message(message: dict[str, Any]) -> None: client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, log_message) # Metrics - if args.verbose >= 4: + if args.verbose >= 3: client.on(AgentServerMessageType.SESSION_METRICS, log_message) client.on(AgentServerMessageType.SPEAKER_METRICS, log_message) # Verbose STT events - if args.verbose >= 5: + if args.verbose >= 4: client.on(AgentServerMessageType.END_OF_UTTERANCE, log_message) client.on("ForcedEndOfUtterance", log_message) client.on(AgentServerMessageType.ADD_PARTIAL_TRANSCRIPT, log_message) From 9b29aeed28e41e150b036510269fca32be75f5f8 Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Thu, 4 Dec 2025 10:39:54 +0000 Subject: [PATCH 03/17] Squashed commit of the following: Merge in dev/sams with changes to smart turn, VAD and preset names. --- .gitattributes | 1 - .github/RELEASE.md | 38 +- .gitignore | 1 + README.md | 23 +- examples/tts/tts_autoplay/README.md | 43 +++ examples/tts/tts_autoplay/requirements.txt | 3 + .../tts/tts_autoplay/tts_stream_example.py | 119 ++++++ examples/voice/cli/README.md | 111 ++---- examples/voice/cli/cli.py | 253 +++---------- examples/voice/simple/simple.py | 20 +- sdk/rt/speechmatics/rt/_auth.py | 3 - sdk/rt/speechmatics/rt/_models.py | 29 +- sdk/rt/speechmatics/rt/_transport.py | 8 + sdk/tts/speechmatics/tts/_models.py | 2 + sdk/voice/README.md | 10 +- sdk/voice/speechmatics/voice/__init__.py | 9 + sdk/voice/speechmatics/voice/_audio.py | 7 +- sdk/voice/speechmatics/voice/_client.py | 350 ++++++++++++----- sdk/voice/speechmatics/voice/_models.py | 186 ++++++--- sdk/voice/speechmatics/voice/_presets.py | 95 +++-- sdk/voice/speechmatics/voice/_smart_turn.py | 19 +- sdk/voice/speechmatics/voice/_utils.py | 3 +- sdk/voice/speechmatics/voice/_vad.py | 354 ++++++++++++++++++ tests/voice/assets/audio_03_16kHz.wav | 4 +- tests/voice/test_05_utterance.py | 15 +- tests/voice/test_07_languages.py | 10 +- tests/voice/test_09_speaker_id.py | 4 + tests/voice/test_10_finalize.py | 1 - tests/voice/test_11_audio_buffer.py | 30 +- tests/voice/test_12_smart_turn_with_files.py | 9 - tests/voice/test_13_smart_turn_transcribe.py | 5 +- tests/voice/test_14_presets.py | 16 +- 32 files changed, 1224 insertions(+), 557 deletions(-) delete mode 100644 .gitattributes create mode 100644 examples/tts/tts_autoplay/README.md create mode 100644 examples/tts/tts_autoplay/requirements.txt create mode 100644 examples/tts/tts_autoplay/tts_stream_example.py create mode 100644 sdk/voice/speechmatics/voice/_vad.py diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index d899f65..0000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.wav filter=lfs diff=lfs merge=lfs -text diff --git a/.github/RELEASE.md b/.github/RELEASE.md index 
9b6ef51..3409583 100644 --- a/.github/RELEASE.md +++ b/.github/RELEASE.md @@ -8,8 +8,8 @@ The Speechmatics Python SDK repository contains two separate packages: - `speechmatics-rt` - Real-Time API Client - `speechmatics-batch` - Batch API Client -- `speechmatics-flow` - Flow API Client - `speechmatics-voice` - Voice Agent API Client +- `speechmatics-tts` - TTS API Client Each package is released independently with its own versioning and release workflow. @@ -91,55 +91,55 @@ To release a new version of the Batch SDK: - Update GitHub release notes - Announce the release -### 3. Flow SDK Release +### 3. Voice Agent SDK Release -To release a new version of the Flow SDK: +To release a new version of the Voice Agent SDK: 1. **Create a Release Tag** ```bash - git tag flow/v1.0.0 - git push origin flow/v1.0.0 + git tag voice/v1.0.0 + git push origin voice/v1.0.0 ``` 2. **Automated Workflow** - The `release-flow.yaml` workflow will automatically: + The `release-voice.yaml` workflow will automatically: - - Extract version from tag (e.g., `flow/v1.0.0` → `1.0.0`) + - Extract version from tag (e.g., `voice/v1.0.0` → `1.0.0`) - Run comprehensive tests across Python versions - - Update version in `sdk/flow/speechmatics/flow/__init__.py` + - Update version in `sdk/voice/speechmatics/voice/__init__.py` - Build the package - Publish to PyPI 3. **Manual Steps After Release** - Verify the package is available on PyPI - - Test installation: `pip install speechmatics-flow==1.0.0` + - Test installation: `pip install speechmatics-voice==1.0.0` - Update GitHub release notes - Announce the release -### 4. Voice Agent SDK Release +### 4. TTS SDK Release -To release a new version of the Voice Agent SDK: +To release a new version of the TTS SDK: 1. **Create a Release Tag** ```bash - git tag voice/v1.0.0 - git push origin voice/v1.0.0 + git tag tts/v1.0.0 + git push origin tts/v1.0.0 ``` 2. **Automated Workflow** - The `release-voice.yaml` workflow will automatically: + The `release-tts.yaml` workflow will automatically: - - Extract version from tag (e.g., `voice/v1.0.0` → `1.0.0`) + - Extract version from tag (e.g., `tts/v1.0.0` → `1.0.0`) - Run comprehensive tests across Python versions - - Update version in `sdk/voice/speechmatics/voice/__init__.py` + - Update version in `sdk/tts/speechmatics/tts/__init__.py` - Build the package - Publish to PyPI 3. 
**Manual Steps After Release** - Verify the package is available on PyPI - - Test installation: `pip install speechmatics-voice==1.0.0` + - Test installation: `pip install speechmatics-tts==1.0.0` - Update GitHub release notes - Announce the release @@ -162,8 +162,8 @@ Both packages follow semantic versioning (SemVer): - RT SDK: `rt/v{version}` (e.g., `rt/v1.0.0`) - Batch SDK: `batch/v{version}` (e.g., `batch/v1.0.0`) -- Flow SDK: `flow/v{version}` (e.g., `flow/v1.0.0`) - Voice Agent SDK: `voice/v{version}` (e.g., `voice/v1.0.0`) +- TTS SDK: `tts/v{version}` (e.g., `tts/v1.0.0`) ## Environment Setup @@ -173,8 +173,8 @@ Both packages are published to PyPI using GitHub Actions with OpenID Connect (OI - RT SDK: Uses `pypi-rt` environment - Batch SDK: Uses `pypi-batch` environment -- Flow SDK: Uses `pypi-flow` environment - Voice Agent SDK: Uses `pypi-voice` environment +- TTS SDK: Uses `pypi-tts` environment ### Required Secrets diff --git a/.gitignore b/.gitignore index d69fb17..14e5012 100644 --- a/.gitignore +++ b/.gitignore @@ -153,6 +153,7 @@ cython_debug/ # Ruff stuff: .ruff_cache/ +**/output.wav # PyPI configuration file .pypirc diff --git a/README.md b/README.md index 5b866f5..c66cf2d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # Speechmatics Python SDK [![License](https://img.shields.io/badge/license-MIT-yellow.svg)](https://github.com/speechmatics/speechmatics-python-sdk/blob/master/LICENSE) +[![PythonSupport](https://img.shields.io/badge/Python-3.9%2B-green)](https://www.python.org/) A collection of Python clients for Speechmatics APIs packaged as separate installable packages. These packages replace the old [speechmatics-python](https://pypi.org/project/speechmatics-python) package, which will be deprecated soon. @@ -10,7 +11,7 @@ Each client targets a specific Speechmatics API (e.g. real-time, batch transcrip This repository contains the following packages: -### (Beta) Real-Time Client (`speechmatics-rt`) +### Real-Time Client (`speechmatics-rt`) A Python client for Speechmatics Real-Time API. @@ -18,7 +19,7 @@ A Python client for Speechmatics Real-Time API. pip install speechmatics-rt ``` -### (Beta) Batch Client (`speechmatics-batch`) +### Batch Client (`speechmatics-batch`) An async Python client for Speechmatics Batch API. @@ -26,15 +27,7 @@ An async Python client for Speechmatics Batch API. pip install speechmatics-batch ``` -### (Beta) Flow Client (`speechmatics-flow`) - -An async Python client for Speechmatics Flow API. - -```bash -pip install speechmatics-flow -``` - -### (Beta) Voice Agent Client (`speechmatics-voice`) +### Voice Agent Client (`speechmatics-voice`) A Voice Agent Python client for Speechmatics Real-Time API. @@ -46,7 +39,7 @@ pip install speechmatics-voice pip install speechmatics-voice[smart] ``` -### (Beta) TTS Client (`speechmatics-tts`) +### TTS Client (`speechmatics-tts`) An async Python client for Speechmatics TTS API. 
@@ -69,10 +62,6 @@ speechmatics-python-sdk/
 │   │   ├── pyproject.toml
 │   │   └── README.md
 │   │
-│   ├── flow/
-│   │   ├── pyproject.toml
-│   │   └── README.md
-│   │
 │   ├── voice/
 │   │   ├── pyproject.toml
 │   │   └── README.md
@@ -84,7 +73,6 @@ speechmatics-python-sdk/
 ├── tests/
 │   ├── batch/
 │   ├── rt/
-│   ├── flow/
 │   ├── voice/
 │   └── tts/
 │
@@ -126,7 +114,6 @@ Each package can be installed separately:
 ```bash
 pip install speechmatics-rt
 pip install speechmatics-batch
-pip install speechmatics-flow
 pip install speechmatics-voice[smart]
 pip install speechmatics-tts
 ```
diff --git a/examples/tts/tts_autoplay/README.md b/examples/tts/tts_autoplay/README.md
new file mode 100644
index 0000000..3bd9dd6
--- /dev/null
+++ b/examples/tts/tts_autoplay/README.md
@@ -0,0 +1,43 @@
+# Speechmatics TTS Async Streaming API Client
+
+This example shows how to use the Speechmatics TTS API to generate audio from text and autoplay it with sounddevice through the system's default audio output device.
+You must have an audio output device configured on your system for this example to work.
+## How it Works
+
+There are two main components in this example: an audio generator and an audio player. These components are run concurrently as asyncio tasks, orchestrated by the main() function, to generate and play audio in real-time.
+### audio_generator()
+
+This producer function connects to the Speechmatics TTS API using the AsyncClient. It calls client.generate() with your text, the voice you want to use, and the output format - RAW_PCM_16000 in this example.
+The code iterates over the audio data as it is streamed in chunks (iter_chunked) and accumulates it in a bytearray buffer.
+The while len(buffer) >= 2 loop reads each 2-byte audio sample from the buffer, converts it to a signed 16-bit integer, and puts it into the audio_queue.
+The processed 2-byte sample is then removed from the front of the buffer.
+END_OF_STREAM is used as a sentinel value to signal that the audio stream has ended and there is no more data to process.
+If an error occurs during audio generation, the END_OF_STREAM sentinel value is still put into the queue so that the consumer, audio_player(), does not get stuck waiting forever, and the exception is then re-raised.
+### audio_player()
+
+This consumer function initialises a sounddevice OutputStream, which is responsible for streaming the audio data to the default audio output device. Within the OutputStream, the while True loop continuously processes the incoming audio data.
+sample = await asyncio.wait_for(play_queue.get(), timeout=0.1) fetches the next sample from the queue, waiting up to 0.1 seconds if the queue is empty.
+If the sample is END_OF_STREAM, the while loop breaks and the audio player exits.
+If the sample is not END_OF_STREAM, it is buffered and, once CHUNK_SIZE samples have accumulated, written to the audio output device as a numpy int16 array.
+play_queue.task_done() is called to signal that the sample has been processed.
+If an error occurs during audio playback, the exception is raised and the output stream is stopped in the finally block.
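+
+Below is a condensed sketch of the producer/consumer pattern described above (illustrative only; see tts_stream_example.py for the full implementation):
+
+```python
+import asyncio
+
+END_OF_STREAM = None  # sentinel marking the end of the audio stream
+
+
+async def producer(queue: asyncio.Queue) -> None:
+    for sample in (1, 2, 3):  # stand-in for decoded 16-bit samples
+        await queue.put(sample)
+    await queue.put(END_OF_STREAM)  # always signal completion, even on error
+
+
+async def consumer(queue: asyncio.Queue) -> None:
+    while True:
+        sample = await queue.get()
+        if sample is END_OF_STREAM:
+            break
+        # the real example buffers samples and writes them to the audio device
+
+
+async def main() -> None:
+    queue: asyncio.Queue = asyncio.Queue()
+    await asyncio.gather(producer(queue), consumer(queue))
+
+
+asyncio.run(main())
+```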
+ +## Installation + +```bash +pip install -r requirements.txt +``` + +## Usage + +To run the example, use the following command: + +```bash +python tts_stream_example.py +``` + +## Environment Variables + +The client supports the following environment variables: + +- `SPEECHMATICS_API_KEY`: Your Speechmatics API key diff --git a/examples/tts/tts_autoplay/requirements.txt b/examples/tts/tts_autoplay/requirements.txt new file mode 100644 index 0000000..550abbc --- /dev/null +++ b/examples/tts/tts_autoplay/requirements.txt @@ -0,0 +1,3 @@ +numpy>=1.24.3 +sounddevice>=0.4.6 +speechmatics-tts>=0.1.0 diff --git a/examples/tts/tts_autoplay/tts_stream_example.py b/examples/tts/tts_autoplay/tts_stream_example.py new file mode 100644 index 0000000..6114f2f --- /dev/null +++ b/examples/tts/tts_autoplay/tts_stream_example.py @@ -0,0 +1,119 @@ +import asyncio +import sounddevice as sd +import numpy as np +from speechmatics.tts import AsyncClient, Voice, OutputFormat + +# Configuration +TEXT = "Welcome to the future of audio generation from text! This audio is a demo of the async streaming Speechmatics' text to speech API." +VOICE = Voice.JACK +OUTPUT_FORMAT = OutputFormat.RAW_PCM_16000 + +# Audio Parameters +SAMPLE_RATE = 16000 #Hz +SAMPLE_WIDTH = 2 # 16-bit audio +CHANNELS = 1 # Mono audio +CHUNK_SIZE = 2048 # Size of audio chunks +BUFFER_SIZE = 4096 # Size of buffer + +# Sentinel value to signal end of stream +END_OF_STREAM = None + + +# Core Async Functions + +# 1. Producer: Generates audio and puts chunks into the queue: + +async def audio_generator(audio_queue: asyncio.Queue, text: str, voice: str, output_format: str) -> None: + try: + async with AsyncClient() as client, await client.generate( + text=text, + voice=voice, + output_format=output_format + ) as response: + buffer=bytearray() + async for chunk in response.content.iter_chunked(BUFFER_SIZE): + if not chunk: + continue + buffer.extend(chunk) + + # Process complete frames (2 bytes per sample for 16-bit audio) + # Convert little-endian 16-bit signed int to np.int-16 + while len(buffer) >= 2: + sample = int.from_bytes(buffer[:2], byteorder='little', signed=True) + await audio_queue.put(sample) + buffer = buffer[2:] + + await audio_queue.put(END_OF_STREAM) + print("Audio generated and put into queue.") + + except Exception as e: + print(f"[{'Generator'}] An error occurred in the audio generator: {e}") + await audio_queue.put(END_OF_STREAM) + raise + +# 2. Consumer: Read audio data from queue and play it in real-time using sounddevice. 
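+# Samples are batched into CHUNK_SIZE blocks before being written to the output
+# device; the 0.1 s queue timeout flushes any partially filled buffer so
+# playback does not stall when the stream goes quiet.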
+async def audio_player(play_queue: asyncio.Queue) -> None: + try: + with sd.OutputStream( + samplerate=SAMPLE_RATE, + channels=CHANNELS, + dtype='int16', # 16-bit PCM + blocksize=CHUNK_SIZE, + latency='high', + ) as stream: + buffer=[] + while True: + try: + sample = await asyncio.wait_for(play_queue.get(), timeout=0.1) + if sample is END_OF_STREAM: + if buffer: + audio_data=np.array(buffer, dtype=np.int16) + stream.write(audio_data) + buffer=[] + break + + buffer.append(sample) + if len(buffer) >= CHUNK_SIZE: + audio_data=np.array(buffer[:CHUNK_SIZE], dtype=np.int16) + stream.write(audio_data) + buffer=buffer[CHUNK_SIZE:] + + play_queue.task_done() + + except asyncio.TimeoutError: + if buffer: + audio_data=np.array(buffer, dtype=np.int16) + stream.write(audio_data) + buffer=[] + continue + + except Exception as e: + print(f"[{'Player'}] An error occurred playing audio chunk {e}") + raise + + except Exception as e: + print(f"[{'Player'}] An error occurred in the audio player: {e}") + raise + finally: + sd.stop() + +# 3. Main Function: Orchestrate audio generation and audio stream +async def main() -> None: + play_queue = asyncio.Queue() + + # Create tasks + tasks = [ + asyncio.create_task(audio_generator(play_queue, TEXT, VOICE, OUTPUT_FORMAT)), + asyncio.create_task(audio_player(play_queue)) + ] + + try: + await asyncio.gather(*tasks) + + except Exception as e: + for task in tasks: + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/voice/cli/README.md b/examples/voice/cli/README.md index ffc89f4..7da08bc 100644 --- a/examples/voice/cli/README.md +++ b/examples/voice/cli/README.md @@ -11,7 +11,7 @@ Real-time transcription tool using the Speechmatics Voice SDK. Supports micropho python cli.py -k YOUR_API_KEY -p # Example that saves the output in verbose mode using a preset -python cli.py -k YOUR_API_KEY -vvvvvpDSr -P conversation_smart_turn +python cli.py -k YOUR_API_KEY -vvvvvpDSr -P smart_turn ``` Output saved to `./output/YYYYMMDD_HHMMSS/log.jsonl` @@ -38,9 +38,8 @@ Press `CTRL+C` to stop. 
Common short codes: - `-k` API key | `-i` input file | `-o` output dir | `-p` pretty print | `-v` verbose -- `-r` record | `-S` save slices | `-P` preset | `-W` show config -- `-l` language | `-m` mode | `-d` max delay | `-t` silence trigger -- `-f` focus speakers | `-s` known speakers | `-E` enrol +- `-r` record | `-P` preset | `-w` show compact config | `-W` show complete config +- `-s` known speakers | `-E` enrol ### Core @@ -55,9 +54,7 @@ Common short codes: - Inside session directory: - `log.jsonl` - All events with timestamps - `recording.wav` - Microphone recording (if `-r` is used) - - `slice_*.wav` and `slice_*.json` - Audio slices (if `-S` is used) - `-r, --record` - Record microphone audio to recording.wav (microphone input only) -- `-S, --save-slices` - Save audio slices on SPEAKER_ENDED events (SMART_TURN mode only) - `-p, --pretty` - Formatted console output with colors - `-v, --verbose` - Increase verbosity (can repeat: `-v`, `-vv`, `-vvv`, `-vvvv`, `-vvvvv`) - `-v` - Add speaker VAD events @@ -67,48 +64,35 @@ Common short codes: - `-vvvvv` - Add STT events - `-L, --legacy` - Show only legacy transcript messages - `-D, --default-device` - Use default audio device (skip selection) -- `-w, --results` - Include word-level results in segments +- `--results` - Include word-level results in segments ### Audio -- `-R, --sample-rate` - Sample rate in Hz (default: 16000) -- `-C, --chunk-size` - Chunk size in bytes (default: 320) +- `--sample-rate` - Sample rate in Hz (default: 16000) +- `--chunk-size` - Chunk size in bytes (default: 320) - `-M, --mute` - Mute audio playback for file input ### Voice Agent Config -**Configuration Priority:** +**Configuration (Required):** -1. Use `--preset` to start with a preset configuration (recommended) -2. Use `-c/--config` to provide a complete JSON configuration -3. Use individual parameters (`-l`, `-d`, `-t`, `-m`) to override preset settings or create custom config - -**Preset Options:** - -- `-P, --preset` - Use preset configuration: `scribe`, `low_latency`, `conversation_adaptive`, `conversation_smart_turn`, or `captions` -- `--list-presets` - List available presets and exit -- `-W, --show` - Display the final configuration as JSON and exit (after applying preset/config and overrides) - -**Configuration Options:** +You must provide either a preset or a config file: +- `-P, --preset` - Use preset configuration: `scribe`, `fast`, `adaptive`, `smart_turn`, or `captions` - `-c, --config` - JSON config string or file path (complete configuration) -- `-l, --language` - Language code (overrides preset if used together) -- `-d, --max-delay` - Max transcription delay in seconds (overrides preset if used together) -- `-t, --end-of-utterance-silence-trigger` - Silence duration for turn end in seconds (overrides preset if used together) -- `-m, --end-of-utterance-mode` - Turn detection mode: `FIXED`, `ADAPTIVE`, `SMART_TURN`, or `EXTERNAL` (overrides preset if used together) +- `--list-presets` - List available presets and exit -**Note:** When using `-c/--config`, you cannot use `-l`, `-d`, `-t`, `-m`, `-f`, `-I`, `-x`, or `-s` as the config JSON should contain all settings. +**Note:** `--preset` and `--config` are mutually exclusive. You cannot use both together. 
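+
+For the inline form of `-c/--config`, the JSON can be passed directly on the command line. A minimal illustrative example (field names come from `VoiceAgentConfig`; the values are placeholders):
+
+```bash
+python cli.py -k YOUR_KEY -c '{"language": "en", "enable_diarization": true, "max_delay": 1.0}' -p
+```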
-### Speaker Management +**Display Configuration:** -- `-f, --focus-speakers` - Speakers to focus on (e.g., `S1 S2`) -- `-I, --ignore-speakers` - Speakers to ignore (e.g., `S1 S2`) -- `-x, --ignore-mode` - Use ignore mode (instead of retain) for focus speakers +- `-w, --show-compact` - Display compact configuration as JSON and exit (excludes unset and None values) +- `-W, --show-complete` - Display complete configuration as JSON and exit (includes all defaults) ### Speaker Identification - `-E, --enrol` - Enrol speakers and output identifiers at end -- `-s, --speakers` - Known speakers JSON string or file path +- `-s, --speakers` - Known speakers JSON string or file path (can be used with preset or config) ## Examples @@ -118,16 +102,16 @@ Common short codes: python cli.py --list-presets ``` -**Show config (from preset):** +**Show compact config (from preset):** ```bash -python cli.py -P scribe -W +python cli.py -P scribe -w ``` -**Show config (with overrides):** +**Show complete config (from preset):** ```bash -python cli.py -P scribe -l fr -d 1.0 -W +python cli.py -P scribe -W ``` **Use preset:** @@ -136,16 +120,10 @@ python cli.py -P scribe -l fr -d 1.0 -W python cli.py -k YOUR_KEY -P scribe -p ``` -**Use preset with overrides:** +**Basic microphone (requires preset or config):** ```bash -python cli.py -k YOUR_KEY -P scribe -l fr -d 1.0 -p -``` - -**Basic microphone:** - -```bash -python cli.py -k YOUR_KEY -p +python cli.py -k YOUR_KEY -P adaptive -p ``` Output saved to `./output/YYYYMMDD_HHMMSS/log.jsonl` @@ -153,7 +131,7 @@ Output saved to `./output/YYYYMMDD_HHMMSS/log.jsonl` **Record microphone audio:** ```bash -python cli.py -k YOUR_KEY -r -p +python cli.py -k YOUR_KEY -P adaptive -r -p ``` Recording saved to `./output/YYYYMMDD_HHMMSS/recording.wav` @@ -161,57 +139,35 @@ Recording saved to `./output/YYYYMMDD_HHMMSS/recording.wav` **Custom output directory:** ```bash -python cli.py -k YOUR_KEY -o ./my_sessions -p +python cli.py -k YOUR_KEY -P adaptive -o ./my_sessions -p ``` Output saved to `./my_sessions/YYYYMMDD_HHMMSS/log.jsonl` -**EXTERNAL mode with manual turn control:** - -```bash -python cli.py -k YOUR_KEY -m EXTERNAL -p -``` - -Press 't' or 'T' to manually signal end of turn. - -**Save audio slices (SMART_TURN mode):** - -```bash -python cli.py -k YOUR_KEY -P conversation_smart_turn -S -p -``` - -Audio slices (~8 seconds) saved to `./output/YYYYMMDD_HHMMSS/slice_*.wav` with matching `.json` metadata files on each SPEAKER_ENDED event. - **Audio file:** ```bash -python cli.py -k YOUR_KEY -i audio.wav -p +python cli.py -k YOUR_KEY -P scribe -i audio.wav -p ``` **Audio file (muted):** ```bash -python cli.py -k YOUR_KEY -i audio.wav -Mp +python cli.py -k YOUR_KEY -P scribe -i audio.wav -Mp ``` **Verbose logging:** ```bash -python cli.py -k YOUR_KEY -vv -p +python cli.py -k YOUR_KEY -P adaptive -vv -p ``` Shows additional events (speaker VAD, turn predictions, etc.) -**Focus on speakers:** - -```bash -python cli.py -k YOUR_KEY -f S1 S2 -p -``` - **Enrol speakers:** ```bash -python cli.py -k YOUR_KEY -Ep +python cli.py -k YOUR_KEY -P adaptive -Ep ``` Press `CTRL+C` when done to see speaker identifiers. @@ -219,7 +175,7 @@ Press `CTRL+C` when done to see speaker identifiers. 
**Use known speakers:** ```bash -python cli.py -k YOUR_KEY -s speakers.json -p +python cli.py -k YOUR_KEY -P adaptive -s speakers.json -p ``` Example `speakers.json`: @@ -231,12 +187,18 @@ Example `speakers.json`: ] ``` -**Custom config:** +**Custom config file:** ```bash python cli.py -k YOUR_KEY -c config.json -p ``` +**Custom config with known speakers:** + +```bash +python cli.py -k YOUR_KEY -c config.json -s speakers.json -p +``` + ## Notes - Output directory (`-o`) defaults to `./output` @@ -244,10 +206,7 @@ python cli.py -k YOUR_KEY -c config.json -p - Session directory contains: - `log.jsonl` - All events with timestamps - `recording.wav` - Microphone recording (if `-r` is used) - - `slice_*.wav` and `slice_*.json` - Audio slices (if `--save-slices` is used in SMART_TURN mode) - Session subdirectories prevent accidental data loss from multiple runs -- Audio slices are ~8 seconds and saved on each SPEAKER_ENDED event -- JSON metadata includes event details, speaker ID, timing, and slice duration - Speaker identifiers are encrypted and unique to your API key - Allow speakers to say at least 20 words before enrolling - Avoid labels `S1`, `S2` (reserved by engine) diff --git a/examples/voice/cli/cli.py b/examples/voice/cli/cli.py index ade068a..5dc53ab 100644 --- a/examples/voice/cli/cli.py +++ b/examples/voice/cli/cli.py @@ -24,8 +24,6 @@ from speechmatics.voice import AdditionalVocabEntry from speechmatics.voice import AgentServerMessageType from speechmatics.voice import EndOfUtteranceMode -from speechmatics.voice import SpeakerFocusConfig -from speechmatics.voice import SpeakerFocusMode from speechmatics.voice import SpeakerIdentifier from speechmatics.voice import VoiceAgentClient from speechmatics.voice import VoiceAgentConfig @@ -35,8 +33,6 @@ # CONSTANTS # ============================================================================== -# Audio slice duration (seconds of audio to capture before speaker ends) -AUDIO_SLICE_DURATION = 8.0 # Default output directory DEFAULT_OUTPUT_DIR = "./output" @@ -48,6 +44,7 @@ # Console colors for message types COLORS = { # Segments + "Diagnostics": "\033[90m", "AddPartialSegment": "\033[93m", "AddSegment": "\033[1;92m", # Speaker events @@ -59,6 +56,8 @@ "StartOfTurn": "\033[91m", "EndOfTurnPrediction": "\033[95m", "EndOfTurn": "\033[1;91m", + # VAD status + "VadStatus": "\033[41;97m", # Transcript events "AddPartialTranscript": "\033[90m", "AddTranscript": "\033[90m", @@ -94,7 +93,7 @@ async def main() -> None: return # Setup audio source (microphone or file) - skip if just showing config - if not args.show: + if not args.show_compact and not args.show_complete: audio_source = setup_audio_source(args) if not audio_source: return @@ -125,15 +124,10 @@ async def main() -> None: # Setup file paths log_file = output_dir / LOG_FILENAME record_file = output_dir / RECORDING_FILENAME if args.record else None - slices_dir = output_dir if args.save_slices else None # Store in args for easy access args.log_file = str(log_file) args.record_file = str(record_file) if record_file else None - args.slices_dir = str(slices_dir) if slices_dir else None - - # Create speaker configuration - speaker_config = create_speaker_config(args) # Known speakers known_speakers: list[SpeakerIdentifier] = [SpeakerIdentifier(**s) for s in args.speakers] if args.speakers else [] @@ -162,28 +156,23 @@ async def main() -> None: ] ) - # Copy in overrides - if args.language: - config.language = args.language - if args.end_of_utterance_silence_trigger: - 
config.end_of_utterance_silence_trigger = args.end_of_utterance_silence_trigger - if args.max_delay: - config.max_delay = args.max_delay - if args.end_of_utterance_mode: - config.end_of_utterance_mode = args.end_of_utterance_mode - - # Copy speaker settings - config.speaker_config = speaker_config + # Copy speaker settings (only known_speakers can be overridden) config.known_speakers = known_speakers config.include_results = args.results + # Set chunk size + config.chunk_size = args.chunk_size + # Set common items config.enable_diarization = True # Handle config display - if args.show: + if args.show_compact: print(config.to_json(indent=2, exclude_unset=True, exclude_none=True)) return + if args.show_complete: + print(config.to_json(indent=2, exclude_unset=False, exclude_none=False)) + return # Set the audio sample rate config.sample_rate = audio_source["sample_rate"] @@ -362,31 +351,6 @@ def setup_audio_output(audio_source: dict, args) -> AudioPlayer | None: return audio_player -# ============================================================================== -# SPEAKER CONFIGURATION -# ============================================================================== - - -def create_speaker_config(args) -> SpeakerFocusConfig: - """Create speaker diarization configuration from arguments. - - Args: - args: Command-line arguments - - Returns: - SpeakerFocusConfig instance. - """ - if args.focus_speakers or args.ignore_speakers: - focus_mode = SpeakerFocusMode.IGNORE if args.ignore_mode else SpeakerFocusMode.RETAIN - return SpeakerFocusConfig( - focus_speakers=args.focus_speakers or [], - ignore_speakers=args.ignore_speakers or [], - focus_mode=focus_mode, - ) - else: - return SpeakerFocusConfig() - - # ============================================================================== # EVENT HANDLERS # ============================================================================== @@ -401,70 +365,6 @@ def register_event_handlers(client: VoiceAgentClient, args, start_time: datetime start_time: Start time for timestamp calculation """ - # Audio slice counter - slice_counter = {"count": 0} - - async def async_save_audio_slice(message: dict) -> None: - """Save audio slice when speaker ends (SMART_TURN mode only).""" - if not args.slices_dir: - return - - # Only save slices in SMART_TURN mode - if client._config.end_of_utterance_mode != "smart_turn": - return - - # Get time from message - event_time = message.get("time") - if not event_time: - return - - speaker_id = message.get("speaker_id", "unknown") - - # Get audio slice from buffer - # Capture audio leading up to the speaker ending - start_time = event_time - AUDIO_SLICE_DURATION - end_time = event_time - - try: - audio_data = await client._audio_buffer.get_frames( - start_time=start_time, - end_time=end_time, - ) - - if audio_data: - # Generate filenames - slice_counter["count"] += 1 - base_filename = f"slice_{slice_counter['count']:04d}_{speaker_id}_{event_time:.2f}" - wav_filepath = Path(args.slices_dir) / f"{base_filename}.wav" - json_filepath = Path(args.slices_dir) / f"{base_filename}.json" - - # Save audio file - async with AudioFileWriter( - str(wav_filepath), client._audio_sample_rate, client._audio_sample_width - ) as writer: - await writer.write(audio_data) - - # Save JSON metadata - metadata = { - "message": message, - "speaker_id": speaker_id, - "is_active": message.get("is_active"), - "time": event_time, - "slice_start_time": start_time, - "slice_end_time": end_time, - "slice_duration": end_time - start_time, - "audio_file": 
f"{base_filename}.wav", - } - with open(json_filepath, "w") as f: - json.dump(metadata, f, indent=2) - - except Exception as e: - print(f"Error saving audio slice: {e}") - - def save_audio_slice(message: dict) -> None: - """Save audio slice when speaker ends (SMART_TURN mode only).""" - asyncio.create_task(async_save_audio_slice(message)) - def console_print(ts: datetime.datetime, message: dict) -> None: """Print message to console with optional formatting.""" if not args.pretty: @@ -482,11 +382,11 @@ def console_print(ts: datetime.datetime, message: dict) -> None: _segs = [] for segment in message["segments"]: suffix = "" if segment["is_active"] else " (background)" - _segs.append(f"@{segment['speaker_id']}{suffix}: `{segment['text']}`") + _segs.append(f"@{segment['speaker_id']}{suffix}: `{segment['text']}` {segment.get('annotation', '')}") payload = {"segments": _segs} # Print to console - print(f"{color}{ts_str} {msg_type:<24} {json.dumps(payload)}\033[0m") + print(f"{color}{ts_str} {client._total_time:>7.3f} {msg_type:<24} {json.dumps(payload)}\033[0m") def log_message(message: dict[str, Any]) -> None: """Log message to console and optional JSONL file.""" @@ -499,8 +399,8 @@ def log_message(message: dict[str, Any]) -> None: # Register standard handlers client.on(AgentServerMessageType.INFO, log_message) - client.on(AgentServerMessageType.RECOGNITION_STARTED, log_message) - client.on(AgentServerMessageType.END_OF_TRANSCRIPT, log_message) + client.once(AgentServerMessageType.RECOGNITION_STARTED, log_message) + client.once(AgentServerMessageType.END_OF_TRANSCRIPT, log_message) # Voice SDK messages if not args.legacy: @@ -515,14 +415,13 @@ def log_message(message: dict[str, Any]) -> None: if args.verbose >= 1: client.on(AgentServerMessageType.SPEAKER_STARTED, log_message) client.on(AgentServerMessageType.SPEAKER_ENDED, log_message) - - # Save audio slices on SPEAKER_ENDED (SMART_TURN mode only) - if args.slices_dir: - client.on(AgentServerMessageType.SPEAKER_ENDED, save_audio_slice) + client.on(AgentServerMessageType.VAD_STATUS, log_message) + client.on(AgentServerMessageType.DIAGNOSTICS, log_message) # Verbose turn prediction if args.verbose >= 2: client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, log_message) + client.on(AgentServerMessageType.SMART_TURN_RESULT, log_message) # Metrics if args.verbose >= 3: @@ -532,7 +431,6 @@ def log_message(message: dict[str, Any]) -> None: # Verbose STT events if args.verbose >= 4: client.on(AgentServerMessageType.END_OF_UTTERANCE, log_message) - client.on("ForcedEndOfUtterance", log_message) client.on(AgentServerMessageType.ADD_PARTIAL_TRANSCRIPT, log_message) client.on(AgentServerMessageType.ADD_TRANSCRIPT, log_message) @@ -748,18 +646,24 @@ def parse_args(): "-P", "--preset", type=str, - help="Preset configuration name (e.g., scribe, low_latency, conversation_adaptive)", + help="Preset configuration name (e.g., scribe, fast, adaptive)", ) parser.add_argument( "--list-presets", action="store_true", help="List available preset configurations and exit", ) + parser.add_argument( + "-w", + "--show-compact", + action="store_true", + help="Display the compact configuration as JSON and exit (excludes unset and None values)", + ) parser.add_argument( "-W", - "--show", + "--show-complete", action="store_true", - help="Display the final configuration as JSON and exit (after applying preset/config and overrides)", + help="Display the complete configuration as JSON and exit (includes all defaults)", ) parser.add_argument( "-c", @@ -797,18 +701,16 @@ def 
parse_args(): # ============================================================================== parser.add_argument( - "-R", "--sample-rate", type=int, default=16000, help="Audio sample rate in Hz (default: 16000)", ) parser.add_argument( - "-C", "--chunk-size", type=int, - default=320, - help="Audio chunk size in bytes (default: 320)", + default=160, + help="Audio chunk size in bytes (default: 160)", ) parser.add_argument( "-M", @@ -821,12 +723,6 @@ def parse_args(): # Output options # ============================================================================== - parser.add_argument( - "-S", - "--save-slices", - action="store_true", - help="Save audio slices to output directory on SPEAKER_ENDED events (SMART_TURN mode only)", - ) parser.add_argument( "-p", "--pretty", @@ -853,65 +749,11 @@ def parse_args(): help="Use default device (default: False)", ) parser.add_argument( - "-w", "--results", action="store_true", help="Include word-level transcription results in output (default: False)", ) - # ============================================================================== - # Voice Agent configuration overrides - # ============================================================================== - - parser.add_argument( - "-l", - "--language", - type=str, - help="Language code (default: en)", - ) - parser.add_argument( - "-d", - "--max-delay", - type=float, - help="Maximum delay for transcription results in seconds (default: 0.7)", - ) - parser.add_argument( - "-t", - "--end-of-utterance-silence-trigger", - type=float, - help="Silence duration to trigger end of utterance in seconds (default: 0.5)", - ) - parser.add_argument( - "-m", - "--end-of-utterance-mode", - type=lambda s: s.upper(), - choices=["FIXED", "ADAPTIVE", "EXTERNAL", "SMART_TURN"], - help="End of utterance detection mode (default: ADAPTIVE)", - ) - - # ============================================================================== - # Speaker management - # ============================================================================== - - parser.add_argument( - "-f", - "--focus-speakers", - nargs="*", - help="Speakers to focus on (e.g., S1 S2). 
Use with --ignore-mode to ignore these speakers instead", - ) - parser.add_argument( - "-I", - "--ignore-speakers", - nargs="*", - help="Specific speakers to ignore (e.g., S1 S2)", - ) - parser.add_argument( - "-x", - "--ignore-mode", - action="store_true", - help="Use IGNORE mode instead of RETAIN mode for non-focus speakers", - ) - # ============================================================================== # Speaker identification # ============================================================================== @@ -935,26 +777,21 @@ def parse_args(): args = parser.parse_args() - mutually_excludive = [ - "preset", - "end-of-utterance-mode", - "end-of-utterance-silence-trigger", - "focus-speakers", - "ignore-mode", - "ignore-speakers", - "language", - "max-delay", - "speakers", - ] - - if args.config is not None: - conflicts: list[str] = [] - for arg in mutually_excludive: - if getattr(args, arg.replace("-", "_")): - conflicts.append(arg) - if conflicts: - print(f"**ERROR** -> You cannot use {[f'--{arg}' for arg in conflicts]} in combination with -c/--config") - exit(1) + # Either preset or config must be provided + if ( + args.config is None + and args.preset is None + and not args.list_presets + and not args.show_compact + and not args.show_complete + ): + print("**ERROR** -> You must provide either --preset or --config") + exit(1) + + # Preset and config are mutually exclusive + if args.config is not None and args.preset is not None: + print("**ERROR** -> You cannot use both --preset and --config") + exit(1) # Return the parsed arguments return args diff --git a/examples/voice/simple/simple.py b/examples/voice/simple/simple.py index aa0afe9..00312ca 100644 --- a/examples/voice/simple/simple.py +++ b/examples/voice/simple/simple.py @@ -10,7 +10,10 @@ from speechmatics.rt import Microphone from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import SpeechSegmentConfig from speechmatics.voice import VoiceAgentClient +from speechmatics.voice import VoiceAgentConfig +from speechmatics.voice import VoiceAgentConfigPreset async def main() -> None: @@ -28,15 +31,20 @@ async def main() -> None: print("Error: PyAudio not available - install with: pip install pyaudio") return + # Config + config = VoiceAgentConfigPreset.FAST( + VoiceAgentConfig(speech_segment_config=SpeechSegmentConfig(emit_sentences=False)) + ) + # Create client - client = VoiceAgentClient(api_key=api_key, preset="scribe") + client = VoiceAgentClient(api_key=api_key, config=config) # Handle partial segments (interim results) - @client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT) - def on_partial_segment(message): - segments = message.get("segments", []) - for segment in segments: - print(f"[PARTIAL] {segment['speaker_id']}: {segment['text']}") + # @client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT) + # def on_partial_segment(message): + # segments = message.get("segments", []) + # for segment in segments: + # print(f"[PARTIAL] {segment['speaker_id']}: {segment['text']}") # Handle final segments @client.on(AgentServerMessageType.ADD_SEGMENT) diff --git a/sdk/rt/speechmatics/rt/_auth.py b/sdk/rt/speechmatics/rt/_auth.py index 6968b36..ee75bca 100644 --- a/sdk/rt/speechmatics/rt/_auth.py +++ b/sdk/rt/speechmatics/rt/_auth.py @@ -44,9 +44,6 @@ class StaticKeyAuth(AuthBase): def __init__(self, api_key: Optional[str] = None): self._api_key = api_key or os.environ.get("SPEECHMATICS_API_KEY") - if not self._api_key: - raise ValueError("API key required: provide api_key or set SPEECHMATICS_API_KEY") - 
async def get_auth_headers(self) -> dict[str, str]: return {"Authorization": f"Bearer {self._api_key}"} diff --git a/sdk/rt/speechmatics/rt/_models.py b/sdk/rt/speechmatics/rt/_models.py index 63680b7..84e5720 100644 --- a/sdk/rt/speechmatics/rt/_models.py +++ b/sdk/rt/speechmatics/rt/_models.py @@ -1,5 +1,6 @@ from __future__ import annotations +import ssl from dataclasses import asdict from dataclasses import dataclass from dataclasses import field @@ -462,9 +463,9 @@ class ConnectionConfig: close_timeout: Timeout for closing WebSocket connection. max_size: Maximum message size in bytes. max_queue: Maximum number of messages in receive queue. - read_limit: Maximum number of bytes to read from WebSocket. - write_limit: Maximum number of bytes to write to WebSocket. - + read_limit: Maximum number of bytes to read from WebSocket (legacy websockets only). + write_limit: Maximum number of bytes to write to WebSocket (legacy websockets only). + ssl_context: SSL context for the WebSocket connection. Returns: Websocket connection configuration as a dict while excluding None values. """ @@ -477,9 +478,29 @@ class ConnectionConfig: max_queue: Optional[int] = None read_limit: Optional[int] = None write_limit: Optional[int] = None + ssl_context: ssl.SSLContext = field(default_factory=ssl.create_default_context) def to_dict(self) -> dict[str, Any]: - return asdict(self, dict_factory=lambda x: {k: v for (k, v) in x if v is not None}) + """Convert to dict, excluding ssl field to avoid pickle errors.""" + result = {} + if self.open_timeout is not None: + result["open_timeout"] = self.open_timeout + if self.ping_interval is not None: + result["ping_interval"] = self.ping_interval + if self.ping_timeout is not None: + result["ping_timeout"] = self.ping_timeout + if self.close_timeout is not None: + result["close_timeout"] = self.close_timeout + if self.max_size is not None: + result["max_size"] = self.max_size + if self.max_queue is not None: + result["max_queue"] = self.max_queue + if self.read_limit is not None: + result["read_limit"] = self.read_limit + if self.write_limit is not None: + result["write_limit"] = self.write_limit + + return result @dataclass diff --git a/sdk/rt/speechmatics/rt/_transport.py b/sdk/rt/speechmatics/rt/_transport.py index 4501dbc..e33a020 100644 --- a/sdk/rt/speechmatics/rt/_transport.py +++ b/sdk/rt/speechmatics/rt/_transport.py @@ -24,12 +24,14 @@ from websockets.asyncio.client import connect WS_HEADERS_KEY = "additional_headers" + IS_LEGACY_WEBSOCKETS = False except ImportError: # Fall back to legacy websockets from websockets.legacy.client import WebSocketClientProtocol from websockets.legacy.client import connect # type: ignore WS_HEADERS_KEY = "extra_headers" + IS_LEGACY_WEBSOCKETS = True class Transport: @@ -116,8 +118,14 @@ async def connect(self, ws_headers: Optional[dict] = None) -> None: ws_kwargs: dict = { WS_HEADERS_KEY: ws_headers, **self._conn_config.to_dict(), + "ssl": self._conn_config.ssl_context, } + # Filter out parameters not supported by new websockets >=13.0 + if not IS_LEGACY_WEBSOCKETS: + ws_kwargs.pop("read_limit", None) + ws_kwargs.pop("write_limit", None) + self._websocket = await connect( url_with_params, **ws_kwargs, diff --git a/sdk/tts/speechmatics/tts/_models.py b/sdk/tts/speechmatics/tts/_models.py index fdbca0e..572e598 100644 --- a/sdk/tts/speechmatics/tts/_models.py +++ b/sdk/tts/speechmatics/tts/_models.py @@ -50,8 +50,10 @@ class Voice(str, Enum): sarah: English (UK) female voice. theo: English (UK) male voice. 
megan: English (UK) female voice. + jack: English (US) male voice. """ SARAH = "sarah" THEO = "theo" MEGAN = "megan" + JACK = "jack" diff --git a/sdk/voice/README.md b/sdk/voice/README.md index 7213b51..a876ad1 100644 --- a/sdk/voice/README.md +++ b/sdk/voice/README.md @@ -103,13 +103,13 @@ client = VoiceAgentClient(api_key=api_key, preset="external") client = VoiceAgentClient(api_key=api_key, preset="scribe") # Low latency preset - for fast responses -client = VoiceAgentClient(api_key=api_key, preset="low_latency") +client = VoiceAgentClient(api_key=api_key, preset="fast") # Conversation preset - for natural dialogue -client = VoiceAgentClient(api_key=api_key, preset="conversation_adaptive") +client = VoiceAgentClient(api_key=api_key, preset="adaptive") # Advanced conversation with ML turn detection -client = VoiceAgentClient(api_key=api_key, preset="conversation_smart_turn") +client = VoiceAgentClient(api_key=api_key, preset="smart_turn") # Captions preset - for live captioning client = VoiceAgentClient(api_key=api_key, preset="captions") @@ -284,7 +284,7 @@ config = VoiceAgentConfigPreset.SCRIBE( # Available presets presets = VoiceAgentConfigPreset.list_presets() -# ['low_latency', 'conversation_adaptive', 'conversation_smart_turn', 'scribe', 'captions'] +# ['fast', 'adaptive', 'smart_turn', 'scribe', 'captions'] ``` ### Configuration Serialization @@ -678,7 +678,7 @@ class VoiceAgentClient: url: Custom WebSocket URL (defaults to SPEECHMATICS_RT_URL env var) app: Optional application name for endpoint URL config: Voice Agent configuration (optional) - preset: Preset name ("scribe", "low_latency", etc.) (optional) + preset: Preset name ("scribe", "fast", etc.) (optional) """ async def connect(self) -> None: diff --git a/sdk/voice/speechmatics/voice/__init__.py b/sdk/voice/speechmatics/voice/__init__.py index 6cb66fd..5d84048 100644 --- a/sdk/voice/speechmatics/voice/__init__.py +++ b/sdk/voice/speechmatics/voice/__init__.py @@ -32,12 +32,16 @@ from ._models import SpeakerFocusConfig from ._models import SpeakerFocusMode from ._models import SpeakerMetricsMessage +from ._models import SpeakerStatusMessage from ._models import SpeechSegmentConfig from ._models import TurnPredictionMessage from ._models import TurnStartEndResetMessage from ._models import VADStatusMessage +from ._models import VoiceActivityConfig from ._models import VoiceAgentConfig from ._presets import VoiceAgentConfigPreset +from ._smart_turn import SmartTurnDetector +from ._vad import SileroVAD __all__ = [ "__version__", @@ -57,8 +61,12 @@ "SpeakerIdentifier", "SmartTurnConfig", "SpeechSegmentConfig", + "VoiceActivityConfig", "VoiceAgentConfig", "VoiceAgentConfigPreset", + # Models + "SmartTurnDetector", + "SileroVAD", # Client messages "AgentClientMessageType", # Server messages @@ -66,6 +74,7 @@ "SegmentMessage", "SessionMetricsMessage", "SpeakerMetricsMessage", + "SpeakerStatusMessage", "TurnPredictionMessage", "TurnStartEndResetMessage", "VADStatusMessage", diff --git a/sdk/voice/speechmatics/voice/_audio.py b/sdk/voice/speechmatics/voice/_audio.py index 486c18e..6653db9 100644 --- a/sdk/voice/speechmatics/voice/_audio.py +++ b/sdk/voice/speechmatics/voice/_audio.py @@ -42,6 +42,7 @@ def __init__(self, sample_rate: int, frame_size: int, sample_width: int = 2, tot self._sample_width: int = sample_width self._frame_size: int = frame_size self._frame_bytes: int = frame_size * sample_width + self._frame_duration: float = round(frame_size / sample_rate, 3) # Queue self._frames: list[bytes] = [] @@ -63,7 +64,7 @@ def 
_get_time_from_frame(self, frame_index: int) -> float: Returns: The time in seconds. """ - return frame_index / (self._sample_rate / self._frame_size) + return frame_index * self._frame_duration def _get_frame_from_time(self, time: float) -> int: """Get the frame index from a time. @@ -77,7 +78,7 @@ def _get_frame_from_time(self, time: float) -> int: Returns: The frame index. """ - return int(time * (self._sample_rate / self._frame_size) + 1e-9) + return int(time / self._frame_duration) # + 1e-9) async def put_bytes(self, data: bytes) -> None: """Add data to the buffer. @@ -230,7 +231,7 @@ def total_frames(self) -> int: @property def total_time(self) -> float: """Get the total time added to the buffer.""" - return self._get_time_from_frame(self._total_frames) + return self._total_frames * self._frame_duration @property def size(self) -> int: diff --git a/sdk/voice/speechmatics/voice/_client.py b/sdk/voice/speechmatics/voice/_client.py index 58626c2..2b1d838 100644 --- a/sdk/voice/speechmatics/voice/_client.py +++ b/sdk/voice/speechmatics/voice/_client.py @@ -49,6 +49,7 @@ from ._models import SpeakerMetricsMessage from ._models import SpeakerSegment from ._models import SpeakerSegmentView +from ._models import SpeakerStatusMessage from ._models import SpeechFragment from ._models import TranscriptionUpdatePreset from ._models import TurnPredictionMessage @@ -62,6 +63,9 @@ from ._smart_turn import SmartTurnPredictionResult from ._turn import TurnTaskProcessor from ._utils import FragmentUtils +from ._vad import SILERO_INSTALL_HINT +from ._vad import SileroVAD +from ._vad import SileroVADResult class VoiceAgentClient(AsyncClient): @@ -114,12 +118,12 @@ def __init__( >>> client = VoiceAgentClient( ... api_key="your_api_key", ... url="wss://custom.endpoint.com/v2", - ... preset="conversation_adaptive" + ... preset="adaptive" ... ) Using a preset (utility class): >>> from speechmatics.voice import VoiceAgentClient, VoiceAgentConfigPreset - >>> config=VoiceAgentConfigPreset.CONVERSATION_ADAPTIVE() + >>> config=VoiceAgentConfigPreset.ADAPTIVE() >>> client = VoiceAgentClient( ... api_key="your_api_key", ... 
url="wss://custom.endpoint.com/v2", @@ -238,49 +242,89 @@ def __init__( self._current_view: Optional[SpeakerSegmentView] = None self._previous_view: Optional[SpeakerSegmentView] = None + # ------------------------------------- + # VAD + # ------------------------------------- + + # Handlers + self._uses_silero_vad: bool = False + self._silero_detector: Optional[SileroVAD] = None + + # Silero VAD detector + if self._config.vad_config and self._config.vad_config.enabled: + if not SileroVAD.dependencies_available(): + self._logger.warning(SILERO_INSTALL_HINT) + else: + silero_detector = SileroVAD( + silence_duration=self._config.vad_config.silence_duration, + threshold=self._config.vad_config.threshold, + auto_init=True, + on_state_change=self._handle_silero_vad_result, + ) + if silero_detector.model_exists(): + self._silero_detector = silero_detector + self._uses_silero_vad = True + if not self._uses_silero_vad: + self._logger.warning("Silero model not available and VAD will be disabled.") + # ------------------------------------- # EOU / EOT # ------------------------------------- # Handlers - self._turn_handler: TurnTaskProcessor = TurnTaskProcessor(name="turn_handler", done_callback=self.finalize) + self._uses_smart_turn: bool = False self._smart_turn_detector: Optional[SmartTurnDetector] = None - self._eot_calculation_task: Optional[asyncio.Task] = None # Current turn self._turn_start_time: Optional[float] = None self._turn_active: bool = False + # Smart turn cutoff time - filters late transcripts during finalization + self._smart_turn_pending_cutoff: Optional[float] = None + # Start turn detector if SMART_TURN requested - if self._config.end_of_utterance_mode == EndOfUtteranceMode.SMART_TURN: - eou_mode_ok: bool = False + if self._config.smart_turn_config and self._config.smart_turn_config.enabled: if not SmartTurnDetector.dependencies_available(): self._logger.warning(SMART_TURN_INSTALL_HINT) else: - detector = SmartTurnDetector( - auto_init=True, - threshold=self._config.smart_turn_config.smart_turn_threshold, + smart_turn_detector = SmartTurnDetector( + auto_init=True, threshold=self._config.smart_turn_config.smart_turn_threshold ) - if detector.model_exists(): - self._smart_turn_detector = detector - self._config.smart_turn_config.audio_buffer_length = 10.0 - eou_mode_ok = True - if not eou_mode_ok: + if smart_turn_detector.model_exists(): + self._smart_turn_detector = smart_turn_detector + self._uses_smart_turn = True + if not self._uses_smart_turn: self._logger.warning("Smart Turn model not available. 
Falling back to ADAPTIVE.") self._config.end_of_utterance_mode = EndOfUtteranceMode.ADAPTIVE + # ------------------------------------- + # Turn / End of Utterance Handling + # ------------------------------------- + # EOU mode self._eou_mode: EndOfUtteranceMode = self._config.end_of_utterance_mode - # Uses fixed EndOfUtterance message - self._uses_fixed_eou: bool = self._eou_mode == EndOfUtteranceMode.FIXED + # Handlers + self._turn_handler: TurnTaskProcessor = TurnTaskProcessor(name="turn_handler", done_callback=self.finalize) + self._eot_calculation_task: Optional[asyncio.Task] = None + + # Uses fixed EndOfUtterance message from STT + self._uses_fixed_eou: bool = ( + self._eou_mode == EndOfUtteranceMode.FIXED + and not self._silero_detector + and not self._config.end_of_turn_config.use_forced_eou + ) # Uses ForceEndOfUtterance message - self._uses_forced_eou: bool = self._eou_mode in [ - EndOfUtteranceMode.ADAPTIVE, - EndOfUtteranceMode.SMART_TURN, - ] + self._uses_forced_eou: bool = not self._uses_fixed_eou self._forced_eou_active: bool = False + self._last_forced_eou_latency: float = 0.0 + + # Emit EOT prediction (uses _uses_forced_eou) + self._uses_eot_prediction: bool = self._eou_mode not in [ + EndOfUtteranceMode.FIXED, + EndOfUtteranceMode.EXTERNAL, + ] # ------------------------------------- # Diarization / Speakers @@ -291,6 +335,9 @@ def __init__( self._current_speaker: Optional[str] = None self._dz_enabled: bool = self._config.enable_diarization self._dz_config = self._config.speaker_config + self._last_speak_start_time: Optional[float] = None + self._last_speak_end_time: Optional[float] = None + self._last_speak_end_latency: float = 0 # ------------------------------------- # Metrics @@ -310,12 +357,17 @@ def __init__( AudioEncoding.PCM_S16LE: 2, }.get(self._audio_format.encoding, 1) + # Default audio buffer + if not self._config.audio_buffer_length and (self._uses_smart_turn or self._uses_silero_vad): + self._config.audio_buffer_length = 15.0 + # Audio buffer - if self._config.smart_turn_config.audio_buffer_length > 0: + if self._config.audio_buffer_length > 0: self._audio_buffer: AudioBuffer = AudioBuffer( sample_rate=self._audio_format.sample_rate, frame_size=self._audio_format.chunk_size, - total_seconds=self._config.smart_turn_config.audio_buffer_length, + sample_width=self._audio_sample_width, + total_seconds=self._config.audio_buffer_length, ) # Register handlers @@ -390,8 +442,10 @@ def _prepare_config( speakers=dz_speakers or None, ) - # End of Utterance (for fixed) - if config.end_of_utterance_silence_trigger and config.end_of_utterance_mode == EndOfUtteranceMode.FIXED: + # Fixed end of Utterance + if bool( + config.end_of_utterance_mode == EndOfUtteranceMode.FIXED and not config.end_of_turn_config.use_forced_eou + ): transcription_config.conversation_config = ConversationConfig( end_of_utterance_silence_trigger=config.end_of_utterance_silence_trigger, ) @@ -404,7 +458,7 @@ def _prepare_config( audio_format = AudioFormat( encoding=config.audio_encoding, sample_rate=config.sample_rate, - chunk_size=320, + chunk_size=config.chunk_size, ) # Return the config objects @@ -602,8 +656,12 @@ async def send_audio(self, payload: bytes) -> None: await self.disconnect() return + # Process with Silero VAD + if self._silero_detector: + asyncio.create_task(self._silero_detector.process_audio(payload)) + # Add to audio buffer (use put_bytes to handle variable chunk sizes) - if self._config.smart_turn_config.audio_buffer_length > 0: + if self._config.audio_buffer_length > 0: await 
self._audio_buffer.put_bytes(payload) # Calculate the time (in seconds) for the payload @@ -645,6 +703,12 @@ def update_diarization_config(self, config: SpeakerFocusConfig) -> None: ... SpeakerFocusConfig(focus_speakers=["main_speaker"]) ... ) """ + + # Only allow updates if diarization is enabled + if not self._config.enable_diarization: + raise ValueError("Diarization is not enabled") + + # Update the diarization config self._dz_config = config # ============================================================================ @@ -661,6 +725,9 @@ def finalize(self, end_of_turn: bool = False) -> None: end_of_turn: Whether to emit an end of turn message. """ + # Clear smart turn cutoff + self._smart_turn_pending_cutoff = None + # Current turn _turn_id = self._turn_handler.handler_id @@ -669,11 +736,7 @@ async def emit() -> None: """Wait for EndOfUtterance if needed, then emit segments.""" # Forced end of utterance message (only when no speaker is detected) - if ( - self._config.use_forced_eou_message - and self._current_view - and (self._eou_mode == EndOfUtteranceMode.EXTERNAL or not self._is_speaking) - ) and not (self._current_view.fragments[-1].is_eos and self._current_view.fragments[-1].is_final): + if self._config.end_of_turn_config.use_forced_eou: await self._await_forced_eou() # Check if the turn has changed @@ -683,8 +746,9 @@ async def emit() -> None: # Emit the segments self._stt_message_queue.put_nowait(lambda: self._emit_segments(finalize=True)) - # Call async task - asyncio.create_task(emit()) + # Call async task (only if not already waiting for forced EOU) + if not (self._config.end_of_turn_config.use_forced_eou and self._forced_eou_active): + asyncio.create_task(emit()) # ============================================================================ # EVENT REGISTRATION & HANDLERS @@ -748,11 +812,11 @@ def _emit_message(self, message: BaseMessage) -> None: # Forward to the emit() method self.emit(message.message, message.to_dict()) - def _emit_info_message(self, message: Union[str, dict[str, Any]]) -> None: - """Emit an info message to the client.""" + def _emit_diagnostic_message(self, message: Union[str, dict[str, Any]]) -> None: + """Emit a diagnostic message to the client.""" if isinstance(message, str): message = {"msg": message} - self.emit(AgentServerMessageType.INFO, {"message": AgentServerMessageType.INFO.value, **message}) + self.emit(AgentServerMessageType.DIAGNOSTICS, {"message": AgentServerMessageType.DIAGNOSTICS.value, **message}) # ============================================================================ # QUEUE PROCESSING @@ -1140,17 +1204,28 @@ async def _process_speech_fragments(self, change_filter: Optional[list[Annotatio if change_filter and not changes.any(*change_filter): return + # Skip re-evaluation if transcripts are older than smart turn cutoff + if self._smart_turn_pending_cutoff is not None and self._current_view: + latest_end_time = max( + (f.end_time for f in self._current_view.fragments if f.end_time is not None), default=0.0 + ) + + # If all fragments end before or at the cutoff, skip re-evaluation + if latest_end_time <= self._smart_turn_pending_cutoff: + return + # Turn prediction - if self._uses_forced_eou: + if self._uses_eot_prediction and self._uses_forced_eou and not self._forced_eou_active: async def fn() -> None: ttl = await self._calculate_finalize_delay() if ttl: self._turn_handler.update_timer(ttl) - self._run_background_eot_calculation(fn) + self._run_background_eot_calculation(fn, "speech_fragments") # Check for gaps + # TODO - 
implement gap-filling # FragmentUtils.find_segment_pauses(self._client_session, self._current_view) # Emit the segments @@ -1363,7 +1438,7 @@ async def _emit_end_of_turn(self) -> None: # TURN DETECTION & FINALIZATION # ============================================================================ - def _run_background_eot_calculation(self, fn: Callable) -> None: + def _run_background_eot_calculation(self, fn: Callable, source: Optional[str] = None) -> None: """Run the calculation async.""" # Existing task takes precedence @@ -1373,6 +1448,29 @@ def _run_background_eot_calculation(self, fn: Callable) -> None: # Create new task self._eot_calculation_task = asyncio.create_task(fn()) + async def _calculate_fixed_finalize_delay(self) -> Optional[float]: + """Will return the end of utterance delay as a default.""" + + # Delay defined in config + delay = self._config.end_of_utterance_silence_trigger + + # Adjust to compensate for known latencies + delay = delay - self._last_forced_eou_latency - self._last_speak_end_latency + + # Emit prediction message + self._emit_message( + TurnPredictionMessage( + turn_id=self._turn_handler.handler_id, + metadata=TurnPredictionMetadata( + ttl=delay, + reasons=["fixed_eou"], + ), + ), + ) + + # Return the delay + return delay + async def _calculate_finalize_delay( self, smart_turn_prediction: Optional[SmartTurnPredictionResult] = None, @@ -1401,6 +1499,10 @@ async def _calculate_finalize_delay( if not view: return None + # If FIXED EOU mode, use the fixed EOU delay + if self._eou_mode == EndOfUtteranceMode.FIXED: + return await self._calculate_fixed_finalize_delay() + # Get last active segment last_active_segment_index = view.last_active_segment_index last_active_segment = view.segments[last_active_segment_index] if last_active_segment_index > -1 else None @@ -1419,23 +1521,24 @@ async def _calculate_finalize_delay( reasons.append((p.penalty, reason)) # Apply smart turn prediction penalty - if smart_turn_prediction: + if smart_turn_prediction and self._config.smart_turn_config: if smart_turn_prediction.prediction: reasons.append((self._config.smart_turn_config.positive_penalty, "smart_turn_true")) else: reasons.append((self._config.smart_turn_config.negative_penalty, "smart_turn_false")) # Calculate final multiplier (compound multiplication) - multiplier = ( - self._config.end_of_turn_config.base_multiplier - * self._config.end_of_turn_config.end_of_turn_adjustment_factor - ) + multiplier = self._config.end_of_turn_config.base_multiplier for penalty, _ in reasons: multiplier *= penalty # Calculate delay with minimum of 25ms delay = round(self._config.end_of_utterance_silence_trigger * multiplier, 3) + # Trim off the most recent forced EOU delay if we're in forced EOU mode + if self._uses_forced_eou: + delay -= self._last_forced_eou_latency + # Clamp to max delay and adjust for TTFB clamped_delay = min(delay, self._config.end_of_utterance_max_delay) finalize_delay = max(clamped_delay - self._last_ttfb, self._config.end_of_turn_config.min_end_of_turn_delay) @@ -1451,14 +1554,15 @@ async def _calculate_finalize_delay( ), ) + # Return the calculated delay return finalize_delay - async def _eot_prediction(self, end_time: Optional[float] = None) -> float: + async def _eot_prediction(self, end_time: Optional[float] = None, speaker: Optional[str] = None) -> float: """Handle end of turn prediction.""" # Wait for Smart Turn result - if self._eou_mode == EndOfUtteranceMode.SMART_TURN and end_time is not None: - result = await self._smart_turn_prediction(end_time, 
self._config.language) + if self._smart_turn_detector and end_time is not None: + result = await self._smart_turn_prediction(end_time, self._config.language, speaker=speaker) else: result = None @@ -1466,9 +1570,11 @@ async def _eot_prediction(self, end_time: Optional[float] = None) -> float: delay = await self._calculate_finalize_delay(smart_turn_prediction=result) # Return the result - return delay or 0.005 + return max(delay or 0, self._config.end_of_turn_config.min_end_of_turn_delay) - async def _smart_turn_prediction(self, end_time: float, language: str) -> SmartTurnPredictionResult: + async def _smart_turn_prediction( + self, end_time: float, language: str, start_time: float = 0.0, speaker: Optional[str] = None + ) -> SmartTurnPredictionResult: """Predict when to emit the end of turn. This will give an acoustic prediction of when the turn has completed using @@ -1483,14 +1589,28 @@ async def _smart_turn_prediction(self, end_time: float, language: str) -> SmartT """ # Check we have smart turn enabled - if not self._smart_turn_detector: + if not self._smart_turn_detector or not self._config.smart_turn_config: return SmartTurnPredictionResult(error="Smart turn is not enabled") + # Calculate the times + start_time = max(start_time, end_time - self._config.smart_turn_config.max_audio_length) + total_time = self._total_time + + # Find the start / end times for the current speaker for this turn ... + if self._current_view: + """Extract the audio for this speaker only.""" + + # Filter segments that match the current speaker + speaker_segments: list[SpeakerSegment] = [ + seg for seg in self._current_view.segments if seg.speaker_id == speaker + ] + + # Get the LAST segment + if speaker_segments: + start_time = speaker_segments[-1].start_time + # Get audio slice (add small margin of 100ms to the end of the audio) - segment_audio = await self._audio_buffer.get_frames( - start_time=end_time - self._config.smart_turn_config.audio_buffer_length, - end_time=end_time + self._config.smart_turn_config.slice_margin, - ) + segment_audio = await self._audio_buffer.get_frames(start_time=start_time, end_time=end_time) # Evaluate prediction = await self._smart_turn_detector.predict( @@ -1500,10 +1620,29 @@ async def _smart_turn_prediction(self, end_time: float, language: str) -> SmartT sample_width=self._audio_sample_width, ) + # Metadata + metadata = { + "start_time": round(start_time, 3), + "end_time": round(end_time, 3), + "language": language, + "speaker_id": speaker, + "total_time": round(total_time, 3), + } + + # Emit smart turn info + self.emit( + AgentServerMessageType.SMART_TURN_RESULT, + { + "message": AgentServerMessageType.SMART_TURN_RESULT.value, + "prediction": prediction.to_dict(), + "metadata": metadata, + }, + ) + # Return the prediction return prediction - async def _await_forced_eou(self, timeout: float = 2.0) -> None: + async def _await_forced_eou(self, timeout: float = 1.0) -> None: """Await the forced end of utterance.""" # Received EOU @@ -1513,13 +1652,22 @@ async def _await_forced_eou(self, timeout: float = 2.0) -> None: self.once(AgentServerMessageType.END_OF_UTTERANCE, lambda message: eou_received.set()) # Trigger EOU message - self._emit_info_message("ForceEndOfUtterance sent") - await self.force_end_of_utterance() + self._emit_diagnostic_message("ForceEndOfUtterance sent - waiting for EndOfUtterance") # Wait for EOU try: + # Track the start time + start_time = time.time() self._forced_eou_active = True + + # Send the force EOU and wait for the response + await 
self.force_end_of_utterance() await asyncio.wait_for(eou_received.wait(), timeout=timeout) + + # Record the latency + self._last_forced_eou_latency = time.time() - start_time + self._emit_diagnostic_message(f"EndOfUtterance received after {self._last_forced_eou_latency:.3f}s") + except asyncio.TimeoutError: pass finally: @@ -1549,29 +1697,11 @@ async def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None: for frag in fragments if frag.speaker in self._dz_config.focus_speakers and frag.type_ == "word" and not frag.is_final ] - pre_partials = [ - frag - for frag in self._speech_fragments - if frag.speaker in self._dz_config.focus_speakers and frag.type_ == "word" and not frag.is_final - ] else: new_partials = [frag for frag in fragments if frag.type_ == "word" and not frag.is_final] - pre_partials = [frag for frag in self._speech_fragments if frag.type_ == "word" and not frag.is_final] - - # Check if last new partial matches the last pre partial - if len(pre_partials) > 0 and len(new_partials) > 0: - has_valid_partial = not all( - [ - pre_partials[-1].speaker == new_partials[-1].speaker, - pre_partials[-1].start_time == new_partials[-1].start_time, - pre_partials[-1].end_time == new_partials[-1].end_time, - pre_partials[-1].content == new_partials[-1].content, - ] - ) - # Evaluate if any valid partial words exist - else: - has_valid_partial = len(new_partials) > 0 + # Check if we have new partials + has_valid_partial = len(new_partials) > 0 # Current states current_is_speaking = self._is_speaking @@ -1602,7 +1732,7 @@ async def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None: # Check if speaker is different to the current speaker if current_is_speaking and speaker_changed: self._emit_message( - VADStatusMessage( + SpeakerStatusMessage( message=AgentServerMessageType.SPEAKER_ENDED, speaker_id=current_speaker, is_active=False, @@ -1610,13 +1740,14 @@ async def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None: ), ) self._emit_message( - VADStatusMessage( + SpeakerStatusMessage( message=AgentServerMessageType.SPEAKER_STARTED, speaker_id=latest_speaker, is_active=True, time=speaker_end_time, ), ) + self._last_speak_start_time = speaker_end_time # Update current speaker self._current_speaker = latest_speaker @@ -1641,9 +1772,56 @@ async def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None: else: await self._handle_speaker_stopped(latest_speaker, speaker_end_time) + def _handle_silero_vad_result(self, result: SileroVADResult) -> None: + """Handle VAD state change events. + + Args: + result: VAD result containing state change information. 
+ """ + + # Time of event + event_time = self._total_time + + # Create the message + message = VADStatusMessage( + is_speech=result.is_speech, + probability=result.probability, + transition_duration_ms=result.transition_duration_ms, + metadata=MessageTimeMetadata( + start_time=round(max(0, event_time - 8), 4), + end_time=round(event_time, 4), + ), + ) + + # Emit VAD status message + self._emit_message(message) + + # If speech has ended, we need to predict the end of turn + if result.speech_ended and self._uses_eot_prediction: + """VAD-based end of turn prediction.""" + + # Only proceed if there are fragments to finalize + has_fragments = bool(self._speech_fragments) + + if has_fragments: + # Set cutoff to prevent late transcripts from cancelling finalization + self._smart_turn_pending_cutoff = event_time + + async def fn() -> None: + ttl = await self._eot_prediction(end_time=event_time, speaker=self._current_speaker) + self._turn_handler.update_timer(ttl) + + self._run_background_eot_calculation(fn, "silero_vad") + async def _handle_speaker_started(self, speaker: Optional[str], event_time: float) -> None: """Reset timers when a new speaker starts speaking after silence.""" + # Clear smart turn cutoff for new speech + self._smart_turn_pending_cutoff = None + + # Update last speak start time + self._last_speak_start_time = event_time + # Emit start of turn (not when using EXTERNAL) if self._is_speaking and not self._turn_active: await self._emit_start_of_turn(event_time) @@ -1654,7 +1832,7 @@ async def _handle_speaker_started(self, speaker: Optional[str], event_time: floa # Emit the event self._emit_message( - VADStatusMessage( + SpeakerStatusMessage( message=AgentServerMessageType.SPEAKER_STARTED, speaker_id=speaker, is_active=True, @@ -1668,18 +1846,22 @@ async def _handle_speaker_started(self, speaker: Optional[str], event_time: floa async def _handle_speaker_stopped(self, speaker: Optional[str], event_time: float) -> None: """Reset the current speaker and do smart turn detection (if enabled).""" + # Update last speak end time + self._last_speak_end_time = event_time + self._last_speak_end_latency = self._total_time - event_time + # Turn prediction - if self._uses_forced_eou: + if self._uses_eot_prediction and not self._forced_eou_active: async def fn() -> None: - ttl = await self._eot_prediction(event_time) + ttl = await self._eot_prediction(event_time, speaker) self._turn_handler.update_timer(ttl) - self._run_background_eot_calculation(fn) + self._run_background_eot_calculation(fn, "speaker_stopped") # Emit the event self._emit_message( - VADStatusMessage( + SpeakerStatusMessage( message=AgentServerMessageType.SPEAKER_ENDED, speaker_id=speaker, is_active=False, diff --git a/sdk/voice/speechmatics/voice/_models.py b/sdk/voice/speechmatics/voice/_models.py index 056d18a..dd72cd5 100644 --- a/sdk/voice/speechmatics/voice/_models.py +++ b/sdk/voice/speechmatics/voice/_models.py @@ -13,6 +13,7 @@ from pydantic import BaseModel as PydanticBaseModel from pydantic import ConfigDict from pydantic import Field +from pydantic import model_validator from typing_extensions import Self from speechmatics.rt import AudioEncoding @@ -39,10 +40,6 @@ class EndOfUtteranceMode(str, Enum): based on the content of what the most recent speaker has said, such as rate of speech and whether they have any pauses or disfluencies. - - `SMART_TURN`: Smart turn end of turn delay. 
The STT engine will use a combination - of silence detection, adaptive delay and smart turn detection using machine learning - to determine the end of turn. - Examples: Using fixed mode (default): >>> config = VoiceAgentConfig( @@ -56,12 +53,6 @@ class EndOfUtteranceMode(str, Enum): ... end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE ... ) - Using smart turn detection: - >>> config = VoiceAgentConfig( - ... language="en", - ... end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN, - ... ) - External control (manual finalization): >>> config = VoiceAgentConfig( ... language="en", @@ -74,7 +65,6 @@ class EndOfUtteranceMode(str, Enum): EXTERNAL = "external" FIXED = "fixed" ADAPTIVE = "adaptive" - SMART_TURN = "smart_turn" class TranscriptionUpdatePreset(str, Enum): @@ -126,11 +116,15 @@ class AgentServerMessageType(str, Enum): Speechmatics RT API / Voice Agent SDK can send to the client. Attributes: - RecognitionStarted: The recognition session has started. - EndOfTranscript: The recognition session has ended. - Info: Informational message. - Warning: Warning message. - Error: Error message. + RecognitionStarted: Server response to 'StartRecognition', + acknowledging that a recognition session has started. + EndOfTranscript: Indicates the server has finished sending all messages. + Info: Informational messages from the server. + Warning: Warning messages that don't stop transcription. + Error: Error messages indicating transcription failure. + AudioAdded: Server response to 'AddAudio', indicating + that audio has been added successfully. + Diagnostics: Diagnostic messages for development and troubleshooting. AddPartialTranscript: Partial transcript has been added. AddTranscript: Transcript has been added. EndOfUtterance: End of utterance has been detected (from STT engine). @@ -141,6 +135,7 @@ class AgentServerMessageType(str, Enum): StartOfTurn: Start of turn has been detected. EndOfTurnPrediction: End of turn prediction timing. EndOfTurn: End of turn has been detected. + SmartTurn: Smart turn metadata. SpeakersResult: Speakers result has been detected. Metrics: Metrics for the STT engine. SpeakerMetrics: Metrics relating to speakers. @@ -172,6 +167,8 @@ class AgentServerMessageType(str, Enum): INFO = "Info" WARNING = "Warning" ERROR = "Error" + AUDIO_ADDED = "AudioAdded" + DIAGNOSTICS = "Diagnostics" # Raw transcription messages ADD_PARTIAL_TRANSCRIPT = "AddPartialTranscript" @@ -187,10 +184,11 @@ class AgentServerMessageType(str, Enum): ADD_SEGMENT = "AddSegment" # Turn messages + VAD_STATUS = "VadStatus" START_OF_TURN = "StartOfTurn" END_OF_TURN_PREDICTION = "EndOfTurnPrediction" END_OF_TURN = "EndOfTurn" - SMART_TURN_AUDIO = "SmartTurnAudio" + SMART_TURN_RESULT = "SmartTurnResult" # Speaker messages SPEAKERS_RESULT = "SpeakersResult" @@ -239,6 +237,8 @@ class AnnotationFlags(str, Enum): ONLY_PUNCTUATION = "only_punctuation" MULTIPLE_SPEAKERS = "multiple_speakers" NO_TEXT = "no_text" + HAS_PAUSE = "has_pause" + ENDS_WITH_PAUSE = "ends_with_pause" # End of utterance detection END_OF_UTTERANCE = "end_of_utterance" @@ -390,13 +390,12 @@ class EndOfTurnConfig(BaseModel): Parameters: base_multiplier: Base multiplier for end of turn delay. min_end_of_turn_delay: Minimum end of turn delay. - end_of_turn_adjustment_factor: End of turn adjustment factor. penalties: List of end of turn penalty items. + use_forced_eou: Whether to use forced end of utterance detection. 
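+
+    Examples:
+        Illustrative values only (not the defaults):
+
+        >>> config = EndOfTurnConfig(
+        ...     use_forced_eou=True,
+        ...     min_end_of_turn_delay=0.05,
+        ... )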
""" base_multiplier: float = 1.0 - min_end_of_turn_delay: float = 0.3 - end_of_turn_adjustment_factor: float = 1.0 + min_end_of_turn_delay: float = 0.01 penalties: list[EndOfTurnPenaltyItem] = Field( default_factory=lambda: [ # Increase delay @@ -411,10 +410,25 @@ class EndOfTurnConfig(BaseModel): ), # Decrease delay EndOfTurnPenaltyItem( - penalty=0.25, annotation=[AnnotationFlags.ENDS_WITH_FINAL, AnnotationFlags.ENDS_WITH_EOS] + penalty=0.5, annotation=[AnnotationFlags.ENDS_WITH_FINAL, AnnotationFlags.ENDS_WITH_EOS] ), ] ) + use_forced_eou: bool = False + + +class VoiceActivityConfig(BaseModel): + """Configuration for voice activity detection. + + Parameters: + enabled: Whether voice activity detection is enabled. + silence_duration: Duration of silence in seconds before considering speech ended. + threshold: Threshold for voice activity detection. + """ + + enabled: bool = False + silence_duration: float = 0.18 + threshold: float = 0.35 class SmartTurnConfig(BaseModel): @@ -424,14 +438,11 @@ class SmartTurnConfig(BaseModel): extract slices of recent audio for post-processing by end of thought models. Parameters: - audio_buffer_length: Length of audio buffer to extract slices of recent audio for post-processing - by end of thought models. Defaults to 0.0 seconds. + enabled: Whether smart turn is enabled. - smart_turn_threshold: Smart turn threshold. This is used to determine when a turn has completed. - Only used when `end_of_utterance_mode` is `EndOfUtteranceMode.SMART_TURN`. Defaults to 0.5. + smart_turn_threshold: Smart turn threshold. Defaults to 0.5. - slice_margin: Margin to add to the audio buffer to ensure that the end of thought models have - enough audio to work with. Defaults to 0.05 seconds. + max_audio_length: Maximum length of audio to analyze in seconds. Defaults to 8.0. positive_penalty: Positive penalty for smart turn. Defaults to -1.0. @@ -439,17 +450,17 @@ class SmartTurnConfig(BaseModel): Examples: >>> config = SmartTurnConfig( - ... audio_buffer_length=0.5, + ... audio_buffer_length=15.0, ... smart_turn_threshold=0.5, ... slice_margin=0.05 ... ) """ - audio_buffer_length: float = 0.0 + enabled: bool = False smart_turn_threshold: float = 0.5 - slice_margin: float = 0.05 - positive_penalty: float = 0.3 - negative_penalty: float = 1.7 + max_audio_length: float = 8.0 + positive_penalty: float = 0.0 + negative_penalty: float = 1.0 class VoiceAgentConfig(BaseModel): @@ -541,9 +552,6 @@ class VoiceAgentConfig(BaseModel): include_results: Include word data in the response. This is useful for debugging and understanding the STT engine's behavior. Defaults to False. - use_forced_eou_message: Use forced end of utterance message. This will force the STT engine to emit - end of utterance messages. Defaults to False. - transcription_update_preset: Emit segments when the text content or word timings change. Options are: `COMPLETE` (emit on changes to text content), `COMPLETE_PLUS_TIMING` (emit on changes to text content and word timings), `WORDS` (emit on changes to word @@ -553,14 +561,19 @@ class VoiceAgentConfig(BaseModel): end_of_turn_config: End of turn configuration for the Speechmatics Voice Agent. + vad_config: Voice activity detection configuration for the Speechmatics Voice Agent. + smart_turn_config: Smart turn configuration for the Speechmatics Voice Agent. speech_segment_config: Speech segment configuration for the Speechmatics Voice Agent. + audio_buffer_length: Length of internal rolling audio buffer in seconds. Defaults to `0.0`. 
+ advanced_engine_control: Internal use only. sample_rate: Audio sample rate for streaming. Defaults to `16000`. audio_encoding: Audio encoding format. Defaults to `AudioEncoding.PCM_S16LE`. + chunk_size: Audio chunk size in frames. Defaults to `160`. Examples: Basic configuration: @@ -614,9 +627,9 @@ class VoiceAgentConfig(BaseModel): ... enable_diarization=True, ... speaker_sensitivity=0.7, ... max_speakers=3, - ... end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN, + ... end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, ... smart_turn_config=SmartTurnConfig( - ... smart_turn_threshold=0.5 + ... enabled=True ... ), ... additional_vocab=[ ... AdditionalVocabEntry(content="API"), @@ -635,8 +648,8 @@ class VoiceAgentConfig(BaseModel): output_locale: Optional[str] = None # Features - max_delay: float = 0.7 - end_of_utterance_silence_trigger: float = 0.2 + max_delay: float = 1.0 + end_of_utterance_silence_trigger: float = 0.5 end_of_utterance_max_delay: float = 10.0 end_of_utterance_mode: EndOfUtteranceMode = EndOfUtteranceMode.FIXED additional_vocab: list[AdditionalVocabEntry] = Field(default_factory=list) @@ -653,11 +666,12 @@ class VoiceAgentConfig(BaseModel): # Advanced features include_results: bool = False - use_forced_eou_message: bool = False transcription_update_preset: TranscriptionUpdatePreset = TranscriptionUpdatePreset.COMPLETE end_of_turn_config: EndOfTurnConfig = Field(default_factory=EndOfTurnConfig) - smart_turn_config: SmartTurnConfig = Field(default_factory=SmartTurnConfig) + vad_config: Optional[VoiceActivityConfig] = None + smart_turn_config: Optional[SmartTurnConfig] = None speech_segment_config: SpeechSegmentConfig = Field(default_factory=SpeechSegmentConfig) + audio_buffer_length: float = 0.0 # Advanced engine configuration advanced_engine_control: Optional[dict[str, Any]] = None @@ -665,6 +679,54 @@ class VoiceAgentConfig(BaseModel): # Audio sample_rate: int = 16000 audio_encoding: AudioEncoding = AudioEncoding.PCM_S16LE + chunk_size: int = 160 + + # Validation + @model_validator(mode="after") # type: ignore[misc] + def validate_config(self) -> Self: + """Validate the configuration.""" + + # Validation errors + errors: list[str] = [] + + # End of utterance mode cannot be EXTERNAL if smart turn is enabled + if self.end_of_utterance_mode == EndOfUtteranceMode.EXTERNAL and self.smart_turn_config: + errors.append("EXTERNAL mode cannot be used in conjunction with SmartTurnConfig") + + # Cannot have FIXED and forced end of utterance enabled without VAD being enabled + if (self.end_of_utterance_mode == EndOfUtteranceMode.FIXED and self.end_of_turn_config.use_forced_eou) and not ( + self.vad_config and self.vad_config.enabled + ): + errors.append("FIXED mode cannot be used in conjunction with forced end of utterance without VAD enabled") + + # Cannot use VAD with external end of utterance mode + if self.end_of_utterance_mode == EndOfUtteranceMode.EXTERNAL and (self.vad_config and self.vad_config.enabled): + errors.append("EXTERNAL mode cannot be used in conjunction with VAD being enabled") + + # Check end_of_utterance_max_delay is greater than end_of_utterance_silence_trigger + if self.end_of_utterance_max_delay < self.end_of_utterance_silence_trigger: + errors.append("end_of_utterance_max_delay must be greater than end_of_utterance_silence_trigger") + + # If diarization is not enabled, then max_speakers cannot be set + if not self.enable_diarization and self.max_speakers: + errors.append("max_speakers cannot be set when enable_diarization is False") + + # If diarization 
is not enabled, then SpeakerFocusConfig.focus_speakers and SpeakerFocusConfig.ignore_speakers must be empty + if not self.enable_diarization and (self.speaker_config.focus_speakers or self.speaker_config.ignore_speakers): + errors.append( + "SpeakerFocusConfig.focus_speakers and SpeakerFocusConfig.ignore_speakers must be empty when enable_diarization is False" + ) + + # Check sample rate + if self.sample_rate not in [8000, 16000]: + errors.append("sample_rate must be 8000 or 16000") + + # Raise error if any validation errors + if errors: + raise ValueError(f"{len(errors)} config error(s): {'; '.join(errors)}") + + # Return validated config + return self # ============================================================================== @@ -1055,6 +1117,22 @@ def from_message(cls, data: dict, **kwargs: Any) -> Self: return cls.from_dict(data, **kwargs) +class MessageTimeMetadata(BaseModel): + """Metadata for segment messages. + + Parameters: + time: The time of the event. + start_time: The start time of the segment. + end_time: The end time of the segment. + processing_time: The processing time of the segment. + """ + + time: Optional[float] = None + start_time: Optional[float] = None + end_time: Optional[float] = None + processing_time: Optional[float] = None + + class ErrorMessage(BaseMessage): """Emitted when an error occurs. @@ -1085,7 +1163,7 @@ class SessionMetricsMessage(BaseMessage): processing_time: float -class VADStatusMessage(BaseMessage): +class SpeakerStatusMessage(BaseMessage): """Emitted when a speaker starts or ends speaking. The speaker id is taken from the last word in the segment when @@ -1104,20 +1182,22 @@ class VADStatusMessage(BaseMessage): time: Optional[float] = None -class MessageTimeMetadata(BaseModel): - """Metadata for segment messages. +class VADStatusMessage(BaseMessage): + """Emitted when voice activity detection status changes. Parameters: - time: The time of the event. - start_time: The start time of the segment. - end_time: The end time of the segment. - processing_time: The processing time of the segment. + message: The message type. + is_speech: Whether speech is detected. + probability: The probability of speech. + transition_duration_ms: The duration of the transition in milliseconds. + metadata: The time metadata. 
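+
+    Examples:
+        Illustrative values only (mirrors how the client constructs this message):
+
+        >>> msg = VADStatusMessage(
+        ...     is_speech=True,
+        ...     probability=0.92,
+        ...     transition_duration_ms=128.0,
+        ...     metadata=MessageTimeMetadata(start_time=12.1, end_time=12.4),
+        ... )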
""" - time: Optional[float] = None - start_time: Optional[float] = None - end_time: Optional[float] = None - processing_time: Optional[float] = None + message: AgentServerMessageType = AgentServerMessageType.VAD_STATUS + metadata: MessageTimeMetadata + is_speech: bool + probability: float + transition_duration_ms: float class TurnStartEndResetMessage(BaseMessage): @@ -1144,6 +1224,7 @@ class TurnPredictionMetadata(BaseModel): """ ttl: float + reasons: list[str] model_config = ConfigDict(extra="ignore") @@ -1211,6 +1292,7 @@ class SegmentMessageSegment(BaseModel): language: Optional[str] = None text: Optional[str] = None fragments: Optional[list[SegmentMessageSegmentFragment]] = None + annotation: list[AnnotationFlags] = Field(default_factory=list, exclude=True) metadata: MessageTimeMetadata model_config = ConfigDict(extra="ignore") diff --git a/sdk/voice/speechmatics/voice/_presets.py b/sdk/voice/speechmatics/voice/_presets.py index c5c608c..37c9705 100644 --- a/sdk/voice/speechmatics/voice/_presets.py +++ b/sdk/voice/speechmatics/voice/_presets.py @@ -6,9 +6,12 @@ from typing import Optional +from ._models import EndOfTurnConfig from ._models import EndOfUtteranceMode from ._models import OperatingPoint +from ._models import SmartTurnConfig from ._models import SpeechSegmentConfig +from ._models import VoiceActivityConfig from ._models import VoiceAgentConfig @@ -16,52 +19,81 @@ class VoiceAgentConfigPreset: """Set of preset configurations for the Voice Agent SDK.""" @staticmethod - def LOW_LATENCY(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802 + def FAST(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802 """Best suited for low latency situations. This configuration will emit the end of turn as soon as possible, with minimal delay to finalizing the spoken sentences. It is not recommended for conversation, as it will not account for pauses, slow speech or disfluencies. + + Use of this will requite `pip install speechmatics-voice[smart]` and may not + be suited to low-power devices. """ return VoiceAgentConfigPreset._merge_configs( VoiceAgentConfig( operating_point=OperatingPoint.STANDARD, enable_diarization=True, max_delay=0.7, - end_of_utterance_silence_trigger=0.5, end_of_utterance_mode=EndOfUtteranceMode.FIXED, speech_segment_config=SpeechSegmentConfig(emit_sentences=True), + vad_config=VoiceActivityConfig(enabled=True, silence_duration=0.18), + end_of_turn_config=EndOfTurnConfig(use_forced_eou=True), + ), + overlay, + ) + + @staticmethod + def FIXED(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802 + """Best suited for general conversational use cases with fixed end-of-utterance timing. + + For conversation, there is a balance between accuracy, speed and the rate at + which the end of turn is emitted. This configuration uses fixed timing for + end-of-utterance detection. + """ + return VoiceAgentConfigPreset._merge_configs( + VoiceAgentConfig( + operating_point=OperatingPoint.ENHANCED, + enable_diarization=True, + max_delay=2.0, + end_of_utterance_silence_trigger=0.5, + end_of_utterance_mode=EndOfUtteranceMode.FIXED, + speech_segment_config=SpeechSegmentConfig(emit_sentences=False), ), overlay, ) @staticmethod - def CONVERSATION_ADAPTIVE(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802 + def ADAPTIVE(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802 """Best suited for general conversational use cases. 
         For conversation, there is a balance between accuracy, speed and the rate at
-        which the end of turn is emitted. Tne use of ADAPTIVE means that the delay to
+        which the end of turn is emitted. The use of ADAPTIVE means that the delay to
         finalizing the spoken sentences will be adjusted based on the words and whether
         there are any pauses, slow speech or disfluencies.
+
+        Use of this will require `pip install speechmatics-voice[smart]` and may not
+        be suited to low-power devices.
         """
         return VoiceAgentConfigPreset._merge_configs(
             VoiceAgentConfig(
                 operating_point=OperatingPoint.ENHANCED,
                 enable_diarization=True,
                 max_delay=0.7,
-                end_of_utterance_silence_trigger=1.0,
+                end_of_utterance_silence_trigger=0.6,
                 end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
                 speech_segment_config=SpeechSegmentConfig(emit_sentences=False),
+                vad_config=VoiceActivityConfig(enabled=True),
+                end_of_turn_config=EndOfTurnConfig(use_forced_eou=True),
             ),
             overlay,
         )

     @staticmethod
-    def CONVERSATION_SMART_TURN(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig:  # noqa: N802
+    def SMART_TURN(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig:  # noqa: N802
         """Best suited for complex conversational use cases.

         For conversation, there is a balance between accuracy, speed and the rate at
-        which the end of turn is emitted. Tne use of SMART_TURN means that the delay to
+        which the end of turn is emitted. The use of SMART_TURN means that the delay to
         finalizing the spoken sentences will be adjusted based on the words and whether
         there are any pauses, slow speech or disfluencies.

@@ -75,10 +107,15 @@ def CONVERSATION_SMART_TURN(overlay: Optional[VoiceAgentConfig] = None) -> Voice
             VoiceAgentConfig(
                 operating_point=OperatingPoint.ENHANCED,
                 enable_diarization=True,
-                max_delay=0.7,
-                end_of_utterance_silence_trigger=1.0,
-                end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN,
+                max_delay=2.0,
+                end_of_utterance_silence_trigger=0.8,
+                end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
                 speech_segment_config=SpeechSegmentConfig(emit_sentences=False),
+                smart_turn_config=SmartTurnConfig(
+                    enabled=True,
+                ),
+                vad_config=VoiceActivityConfig(enabled=True),
+                end_of_turn_config=EndOfTurnConfig(use_forced_eou=True),
             ),
             overlay,
         )
@@ -89,36 +126,23 @@ def SCRIBE(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig:  # n
         This mode will emit partial and final segments as they become available. The end of
         utterance is set to fixed. End of turn is not required for note-taking.
-        """
-        return VoiceAgentConfigPreset._merge_configs(
-            VoiceAgentConfig(
-                operating_point=OperatingPoint.ENHANCED,
-                enable_diarization=True,
-                max_delay=1.0,
-                end_of_utterance_silence_trigger=1.2,
-                end_of_utterance_mode=EndOfUtteranceMode.FIXED,
-                speech_segment_config=SpeechSegmentConfig(emit_sentences=True),
-            ),
-            overlay,
-        )
-
-    @staticmethod
-    def CAPTIONS(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig:  # noqa: N802
-        """Best suited for captions.
-
-        This mode will emit partial and final segments as they become available. The end of
-        utterance is set to fixed. End of turn is not required for captions. The segments
-        will only include finalized words.
+
+        Use of this will require `pip install speechmatics-voice[smart]` and may not
+        be suited to low-power devices.
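+
+        Examples:
+            Illustrative use with an overlay (values are examples only):
+
+            >>> config = VoiceAgentConfigPreset.SCRIBE(
+            ...     VoiceAgentConfig(language="en")
+            ... )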
""" return VoiceAgentConfigPreset._merge_configs( VoiceAgentConfig( operating_point=OperatingPoint.ENHANCED, enable_diarization=True, - max_delay=0.9, - end_of_utterance_silence_trigger=1.2, + max_delay=2.0, + end_of_utterance_silence_trigger=1.0, end_of_utterance_mode=EndOfUtteranceMode.FIXED, speech_segment_config=SpeechSegmentConfig(emit_sentences=True), - include_partials=False, + smart_turn_config=SmartTurnConfig( + enabled=True, + ), + vad_config=VoiceActivityConfig(enabled=True, silence_duration=0.2), + end_of_turn_config=EndOfTurnConfig(use_forced_eou=True), ), overlay, ) @@ -134,11 +158,10 @@ def EXTERNAL(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # VoiceAgentConfig( operating_point=OperatingPoint.ENHANCED, enable_diarization=True, - max_delay=1.0, - end_of_utterance_silence_trigger=1.2, + max_delay=2.0, end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL, - speech_segment_config=SpeechSegmentConfig(emit_sentences=True), - use_forced_eou_message=True, + speech_segment_config=SpeechSegmentConfig(emit_sentences=False), + end_of_turn_config=EndOfTurnConfig(use_forced_eou=True), ), overlay, ) diff --git a/sdk/voice/speechmatics/voice/_smart_turn.py b/sdk/voice/speechmatics/voice/_smart_turn.py index 011318d..c4667b4 100644 --- a/sdk/voice/speechmatics/voice/_smart_turn.py +++ b/sdk/voice/speechmatics/voice/_smart_turn.py @@ -14,7 +14,8 @@ from urllib.parse import urlparse import numpy as np -from pydantic import BaseModel + +from speechmatics.voice._models import BaseModel ort: Any WhisperFeatureExtractor: Any @@ -187,13 +188,21 @@ async def predict( dtype = np.int16 if sample_width == 2 else np.int8 int16_array: np.ndarray = np.frombuffer(audio_array, dtype=dtype).astype(np.int16) + # Truncate to last 8 seconds if needed (keep the tail/end of audio) + max_samples = 8 * sample_rate + if len(int16_array) > max_samples: + int16_array = int16_array[-max_samples:] + + # Convert int16 to float32 in range [-1, 1] (same as reference implementation) + float32_array: np.ndarray = int16_array.astype(np.float32) / 32768.0 + # Process audio using Whisper's feature extractor inputs = self.feature_extractor( - int16_array, + float32_array, sampling_rate=sample_rate, return_tensors="np", padding="max_length", - max_length=8 * sample_rate, + max_length=max_samples, truncation=True, do_normalize=True, ) @@ -217,8 +226,8 @@ async def predict( # Return the result return SmartTurnPredictionResult( prediction=prediction, - probability=probability, - processing_time=float((end_time - start_time).total_seconds()), + probability=round(probability, 3), + processing_time=round(float((end_time - start_time).total_seconds()), 3), ) @staticmethod diff --git a/sdk/voice/speechmatics/voice/_utils.py b/sdk/voice/speechmatics/voice/_utils.py index 31e7e24..ecb01d0 100644 --- a/sdk/voice/speechmatics/voice/_utils.py +++ b/sdk/voice/speechmatics/voice/_utils.py @@ -70,6 +70,7 @@ def format_segment_text( **{ "speaker_id": segment.speaker_id, "text": content, + "content": content, "ts": segment.timestamp, "lang": segment.language, "start_time": fragments[0].start_time if fragments else 0, @@ -294,7 +295,7 @@ def _annotate_segment(segment: SpeakerSegment) -> AnnotationResult: # Categorize the speaker if wpm < 80: result.add(AnnotationFlags.VERY_SLOW_SPEAKER) - elif wpm < 120: + elif wpm < 110: result.add(AnnotationFlags.SLOW_SPEAKER) elif wpm > 250: result.add(AnnotationFlags.FAST_SPEAKER) diff --git a/sdk/voice/speechmatics/voice/_vad.py b/sdk/voice/speechmatics/voice/_vad.py new file mode 100644 
index 0000000..e5a7b1e --- /dev/null +++ b/sdk/voice/speechmatics/voice/_vad.py @@ -0,0 +1,354 @@ +# +# Copyright (c) 2025, Speechmatics / Cantab Research Ltd +# + +from __future__ import annotations + +import logging +import os +import time +import urllib.request +from collections import deque +from typing import Any +from typing import Callable +from typing import Optional +from urllib.parse import urlparse + +import numpy as np + +from speechmatics.voice._models import BaseModel + +ort: Any +logger = logging.getLogger(__name__) + +try: + import onnxruntime as _ort + + ort = _ort +except ModuleNotFoundError: + ort = None + + +# Silero VAD model +SILERO_MODEL_URL = os.getenv( + "SILERO_MODEL_URL", "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx" +) +SILERO_MODEL_PATH = os.getenv("SILERO_MODEL_PATH", ".models/silero_vad.onnx") + +# Hint for when dependencies are not available +SILERO_INSTALL_HINT = "Silero VAD unavailable. Install `speechmatics-voice[smart]` to enable VAD." + +# Silero VAD constants +SILERO_SAMPLE_RATE = 16000 +SILERO_CHUNK_SIZE = 512 # Silero expects 512 samples at 16kHz (32ms chunks) +SILERO_CONTEXT_SIZE = 64 # Silero uses 64-sample context +MODEL_RESET_STATES_TIME = 5.0 # Reset state every 5 seconds +SILERO_CHUNK_DURATION_MS = (SILERO_CHUNK_SIZE / SILERO_SAMPLE_RATE) * 1000 # 32ms per chunk + + +class SileroVADResult(BaseModel): + """VAD result from Silero. + + Attributes: + is_speech: True if speech detected, False if silence + probability: Probability of speech (0.0-1.0) + transition_duration_ms: Duration of consecutive silence in milliseconds (used for transition threshold) + speech_ended: True if silence duration exceeded the threshold + metadata: Additional metadata about the VAD result + error: Error message if an error occurred + """ + + is_speech: bool = False + probability: float = 0.0 + transition_duration_ms: float = 0.0 + speech_ended: bool = False + metadata: Optional[dict] = None + error: Optional[str] = None + + +class SileroVAD: + """Silero Voice Activity Detector. + + Uses Silero's opensource VAD model for detecting speech vs silence. + Processes audio in 512-sample chunks at 16kHz. + + Further information at https://github.com/snakers4/silero-vad + """ + + def __init__( + self, + auto_init: bool = True, + threshold: float = 0.5, + silence_duration: float = 0.1, + on_state_change: Optional[Callable[[SileroVADResult], None]] = None, + ): + """Create the new SileroVAD. + + Args: + auto_init: Whether to automatically initialise the detector. + threshold: Probability threshold for speech detection (0.0-1.0). + silence_duration: Duration of consecutive silence (in ms) before considering speech ended. + on_state_change: Optional callback invoked when VAD state changes (speech <-> silence). 
+ """ + + self._is_initialized: bool = False + self._threshold: float = threshold + self._on_state_change: Optional[Callable[[SileroVADResult], None]] = on_state_change + + # ONNX session state + self._state: Optional[np.ndarray] = None + self._context: Optional[np.ndarray] = None + self._last_reset_time: float = 0.0 + + # Audio buffering + self._audio_buffer: bytes = b"" + + # Rolling window for predictions (100ms window = ~3-4 chunks at 32ms each) + window_chunks = int((silence_duration * 1000) / SILERO_CHUNK_DURATION_MS) + 1 + self._prediction_window: deque[float] = deque(maxlen=window_chunks) + + # State tracking + self._last_is_speech: bool = False # Track previous state for change detection (default: not speaking) + + if auto_init: + self.setup() + + @staticmethod + def dependencies_available() -> bool: + """Return whether optional Silero dependencies are installed.""" + return ort is not None + + def setup(self) -> None: + """Setup the detector. + + Initialises the ONNX model and internal states. + """ + + # Show warning if dependencies are not available + if not self.dependencies_available(): + logger.warning(SILERO_INSTALL_HINT) + return + + try: + # Check / download the model + self.download_model() + + # Check the model downloaded + if not self.model_exists(): + logger.warning("Silero VAD model not found. Please download the model first.") + return + + # Build the session + self.session = self.build_session(SILERO_MODEL_PATH) + + # Initialize states + self._init_states() + + # Set initialized + self._is_initialized = True + + except Exception as e: + logger.error(f"Failed to setup SileroVAD: {e}") + + def build_session(self, onnx_path: str) -> ort.InferenceSession: + """Build the ONNX session and load resources. + + Args: + onnx_path: Path to the ONNX model. + + Returns: + ONNX inference session. + """ + + # Show warning if dependencies are not available + if ort is None: + raise RuntimeError("onnxruntime is not available") + + # Build the session + so = ort.SessionOptions() + so.inter_op_num_threads = 1 + so.intra_op_num_threads = 1 + + # Return the new session + return ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"], sess_options=so) + + def _init_states(self) -> None: + """Initialize or reset internal VAD states.""" + self._state = np.zeros((2, 1, 128), dtype=np.float32) + self._context = np.zeros((1, SILERO_CONTEXT_SIZE), dtype=np.float32) + self._last_reset_time = time.time() + + def _maybe_reset_states(self) -> None: + """Reset ONNX model states periodically to prevent drift. + + Note: Does NOT reset prediction window or speech state tracking. + """ + if (time.time() - self._last_reset_time) >= MODEL_RESET_STATES_TIME: + self._state = np.zeros((2, 1, 128), dtype=np.float32) + self._context = np.zeros((1, SILERO_CONTEXT_SIZE), dtype=np.float32) + self._last_reset_time = time.time() + + def process_chunk(self, chunk_f32: np.ndarray) -> float: + """Process a single 512-sample chunk and return speech probability. + + Args: + chunk_f32: Float32 numpy array of exactly 512 samples. + + Returns: + Speech probability (0.0-1.0). + + Raises: + ValueError: If chunk is not exactly 512 samples. 
+ """ + # Ensure shape (1, 512) + x = np.reshape(chunk_f32, (1, -1)) + if x.shape[1] != SILERO_CHUNK_SIZE: + raise ValueError(f"Expected {SILERO_CHUNK_SIZE} samples, got {x.shape[1]}") + + # Concatenate with context (previous 64 samples) + if self._context is not None: + x = np.concatenate((self._context, x), axis=1) + + # Run ONNX inference + ort_inputs = { + "input": x.astype(np.float32), + "state": self._state, + "sr": np.array(SILERO_SAMPLE_RATE, dtype=np.int64), + } + out, self._state = self.session.run(None, ort_inputs) + + # Update context (keep last 64 samples) + self._context = x[:, -SILERO_CONTEXT_SIZE:] + + # Maybe reset states periodically + self._maybe_reset_states() + + # Return probability (out shape is (1, 1)) + return float(out[0][0]) + + async def process_audio(self, audio_bytes: bytes, sample_rate: int = 16000, sample_width: int = 2) -> None: + """Process incoming audio bytes and invoke callback on state changes. + + This method buffers incomplete chunks and processes all complete 512-sample chunks. + The callback is invoked only once at the end if the VAD state changed during processing. + + Args: + audio_bytes: Raw audio bytes (int16 PCM). + sample_rate: Sample rate of the audio (must be 16000). + sample_width: Sample width in bytes (2 for int16). + """ + + if not self._is_initialized: + logger.error("SileroVAD is not initialized") + return + + if sample_rate != SILERO_SAMPLE_RATE: + logger.error(f"Sample rate must be {SILERO_SAMPLE_RATE}Hz, got {sample_rate}Hz") + return + + # Add new bytes to buffer + self._audio_buffer += audio_bytes + + # Calculate bytes per chunk (512 samples * 2 bytes for int16) + bytes_per_chunk = SILERO_CHUNK_SIZE * sample_width + + # Process all complete chunks in buffer + while len(self._audio_buffer) >= bytes_per_chunk: + # Extract one chunk + chunk_bytes = self._audio_buffer[:bytes_per_chunk] + self._audio_buffer = self._audio_buffer[bytes_per_chunk:] + + # Convert bytes to int16 array + dtype = np.int16 if sample_width == 2 else np.int8 + int16_array: np.ndarray = np.frombuffer(chunk_bytes, dtype=dtype).astype(np.int16) + + # Convert int16 to float32 in range [-1, 1] + float32_array: np.ndarray = int16_array.astype(np.float32) / 32768.0 + + try: + # Process the chunk and add probability to rolling window + probability = self.process_chunk(float32_array) + self._prediction_window.append(probability) + + except Exception as e: + logger.error(f"Error processing VAD chunk: {e}") + + # After processing all chunks, calculate weighted average from window + if len(self._prediction_window) > 0: + # Calculate weighted average (most recent predictions have higher weight) + weights = np.arange(1, len(self._prediction_window) + 1, dtype=np.float32) + weighted_avg = np.average(list(self._prediction_window), weights=weights) + + # Determine speech state from weighted average + is_speech = bool(weighted_avg >= self._threshold) + + # Check if state changed + state_changed = self._last_is_speech != is_speech + + # Emit callback if state changed + if state_changed and self._on_state_change: + # Calculate transition duration (window duration) + transition_duration = len(self._prediction_window) * SILERO_CHUNK_DURATION_MS + + # Determine if speech ended + speech_ended = self._last_is_speech and not is_speech + + # VAD result + result = SileroVADResult( + is_speech=is_speech, + probability=round(float(weighted_avg), 3), + transition_duration_ms=transition_duration, + speech_ended=speech_ended, + ) + + # Trigger callback + self._on_state_change(result) + + # 
Update state after emitting + self._last_is_speech = is_speech + + def reset(self) -> None: + """Reset the VAD state and clear audio buffer.""" + if self._is_initialized: + self._init_states() + self._audio_buffer = b"" + self._prediction_window.clear() + self._last_is_speech = False + + @staticmethod + def download_model() -> None: + """Download the ONNX model. + + This will check if the model has been downloaded and is available in the + location specified by the SILERO_MODEL_PATH environment variable. + + If not, it will download the model from GitHub. + """ + + # Check if model file exists + if SileroVAD.model_exists(): + return + + # Check the URL for valid schemes + parsed_url = urlparse(SILERO_MODEL_URL) + if parsed_url.scheme not in ("http", "https"): + logger.error(f"Invalid URL scheme: {parsed_url.scheme}") + return + + # Report to the user + logger.warning("Silero VAD model not found. Downloading from GitHub...") + + # Create the directory + os.makedirs(os.path.dirname(SILERO_MODEL_PATH), exist_ok=True) + + # Download + urllib.request.urlretrieve(SILERO_MODEL_URL, SILERO_MODEL_PATH) # nosec B310 + + @staticmethod + def model_exists() -> bool: + """Check the model has been downloaded. + + Returns: + True if the model file exists, False otherwise. + """ + return os.path.exists(SILERO_MODEL_PATH) diff --git a/tests/voice/assets/audio_03_16kHz.wav b/tests/voice/assets/audio_03_16kHz.wav index 0c9b179..2442d8e 100644 --- a/tests/voice/assets/audio_03_16kHz.wav +++ b/tests/voice/assets/audio_03_16kHz.wav @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eadc1c0609e13027d0bb7bfe6bf7868c123cbd17421898e6023b1889f103cf17 -size 58460 +oid sha256:ef18686db712ccb8d7714e86358f64490da3eaa6ff7ed6e090070169d87b6ed2 +size 162670 diff --git a/tests/voice/test_05_utterance.py b/tests/voice/test_05_utterance.py index cf30453..6026c79 100644 --- a/tests/voice/test_05_utterance.py +++ b/tests/voice/test_05_utterance.py @@ -9,6 +9,7 @@ from _utils import get_client from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import EndOfTurnConfig from speechmatics.voice import EndOfUtteranceMode from speechmatics.voice import SpeechSegmentConfig from speechmatics.voice import VoiceAgentConfig @@ -234,7 +235,9 @@ async def test_external_vad(): api_key="NONE", connect=False, config=VoiceAgentConfig( - end_of_utterance_silence_trigger=adaptive_timeout, end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL + end_of_utterance_silence_trigger=adaptive_timeout, + end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL, + end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) assert client is not None @@ -269,6 +272,15 @@ async def send_message(idx: int, count: int = 1, use_ttl: bool = True): # Emit the message client.emit(message["payload"]["message"], message["payload"]) + # Debug + if SHOW_LOG: + client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, lambda message: print(message)) + client.on(AgentServerMessageType.ADD_SEGMENT, lambda message: print(message)) + client.on(AgentServerMessageType.START_OF_TURN, lambda message: print(message)) + client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, lambda message: print(message)) + client.on(AgentServerMessageType.END_OF_TURN, lambda message: print(message)) + client.on(AgentServerMessageType.END_OF_UTTERANCE, lambda message: print(message)) + # Inject conversation await send_message(0, count=12, use_ttl=False) @@ -333,6 +345,7 @@ async def test_end_of_utterance_adaptive_vad(): 
end_of_utterance_silence_trigger=adaptive_timeout, end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, speech_segment_config=SpeechSegmentConfig(emit_sentences=False), + end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) assert client is not None diff --git a/tests/voice/test_07_languages.py b/tests/voice/test_07_languages.py index 759e67b..e892f89 100644 --- a/tests/voice/test_07_languages.py +++ b/tests/voice/test_07_languages.py @@ -1,4 +1,3 @@ -import asyncio import datetime import json import os @@ -10,9 +9,11 @@ import pytest from _utils import get_client from _utils import send_audio_file +from _utils import send_silence from speechmatics.voice import AdditionalVocabEntry from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import EndOfTurnConfig from speechmatics.voice import EndOfUtteranceMode from speechmatics.voice import VoiceAgentConfig from speechmatics.voice._utils import TextUtils @@ -119,6 +120,7 @@ async def test_transcribe_languages(sample: AudioSample): end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL, language=sample.language, additional_vocab=[AdditionalVocabEntry(content=vocab) for vocab in sample.vocab], + end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) assert client is not None @@ -169,10 +171,8 @@ def log_segment(message): # Individual payloads await send_audio_file(client, audio_file, progress_callback=log_bytes_sent) - # Send finalize - await asyncio.sleep(1.5) - client.finalize() - await asyncio.sleep(1.5) + # Send some audio silence + await send_silence(client, 4.0) # Extract the last message assert last_message.get("message") == AgentServerMessageType.ADD_SEGMENT diff --git a/tests/voice/test_09_speaker_id.py b/tests/voice/test_09_speaker_id.py index 71438aa..6e8dc0b 100644 --- a/tests/voice/test_09_speaker_id.py +++ b/tests/voice/test_09_speaker_id.py @@ -11,6 +11,7 @@ from speechmatics.rt import ClientMessageType from speechmatics.voice import AdditionalVocabEntry from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import EndOfTurnConfig from speechmatics.voice import EndOfUtteranceMode from speechmatics.voice import SpeakerIdentifier from speechmatics.voice import SpeechSegmentConfig @@ -58,6 +59,7 @@ async def test_extract_speaker_ids(): additional_vocab=[ AdditionalVocabEntry(content="GeoRouter"), ], + end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) @@ -190,6 +192,7 @@ async def test_known_speakers(): additional_vocab=[ AdditionalVocabEntry(content="GeoRouter"), ], + end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) @@ -267,6 +270,7 @@ async def test_ignoring_assistant(): additional_vocab=[ AdditionalVocabEntry(content="GeoRouter"), ], + end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) diff --git a/tests/voice/test_10_finalize.py b/tests/voice/test_10_finalize.py index bc7df46..0abaed2 100644 --- a/tests/voice/test_10_finalize.py +++ b/tests/voice/test_10_finalize.py @@ -43,7 +43,6 @@ async def test_finalize(): end_of_utterance_silence_trigger=0.7, max_delay=1.2, end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL, - use_forced_eou_message=True, ), ) diff --git a/tests/voice/test_11_audio_buffer.py b/tests/voice/test_11_audio_buffer.py index 63f2fdc..a10834e 100644 --- a/tests/voice/test_11_audio_buffer.py +++ b/tests/voice/test_11_audio_buffer.py @@ -14,6 +14,7 @@ from speechmatics.voice import AdditionalVocabEntry from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import EndOfTurnConfig from 
speechmatics.voice import EndOfUtteranceMode from speechmatics.voice import SmartTurnConfig from speechmatics.voice import VoiceAgentConfig @@ -54,9 +55,15 @@ async def test_buffer(): assert buffer.total_time == 0.0 assert buffer.size == 0 + # Create 20 seconds worth of random bytes + random_data = bytes(random.getrandbits(8) for _ in range(int(20.0 * sample_rate * sample_width))) + assert len(random_data) == int(20.0 * sample_rate * sample_width) + # Add in 20 seconds of data - for _ in range(int(20.0 * sample_rate / frame_size)): - await buffer.put_frame(b"\x00" * frame_bytes) + for i in range(int(20.0 * sample_rate / frame_size)): + start_idx = (i * frame_bytes) % len(random_data) + frame_data = random_data[start_idx : start_idx + frame_bytes] + await buffer.put_frame(frame_data) # Check values assert buffer.total_frames == int(20.0 * sample_rate / frame_size) @@ -64,7 +71,9 @@ async def test_buffer(): assert buffer.size == int(10.0 * sample_rate / frame_size) # Check frame >< time conversion - assert buffer._get_frame_from_time(buffer._get_time_from_frame(1234)) == 1234 + tff = buffer._get_time_from_frame(1234) + tft = buffer._get_frame_from_time(tff) + assert tft == 1234 # Get data from more than 10 seconds ago data = await buffer.get_frames(2.5, 7.5) @@ -74,6 +83,11 @@ async def test_buffer(): data = await buffer.get_frames(12.5, 17.5) assert len(data) == int(5.0 * sample_rate / frame_size) * frame_bytes + # Check the contents of the buffer + data = await buffer.get_frames(15.0, 20.0) + random_data_last_5_seconds = random_data[-int(5.0 * sample_rate * sample_width) :] + assert data == random_data_last_5_seconds + @pytest.mark.asyncio async def test_buffer_bytes(): @@ -126,8 +140,8 @@ async def test_buffer_bytes(): # Extract data data = await buffer.get_frames(start_time, end_time) - # Test - assert len(data) == int((end_time - start_time) * sample_rate / frame_size) * frame_bytes + # Test (two frames) + assert len(data) == int((end_time - start_time) * sample_rate / frame_size) * frame_bytes * 2 @pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping in CI") @@ -248,7 +262,8 @@ async def save_slice( additional_vocab=[ AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"]), ], - smart_turn_config=SmartTurnConfig(audio_buffer_length=20.0), + smart_turn_config=SmartTurnConfig(enabled=True), + end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) @@ -353,7 +368,8 @@ async def save_slice( additional_vocab=[ AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"]), ], - smart_turn_config=SmartTurnConfig(audio_buffer_length=20.0), + smart_turn_config=SmartTurnConfig(enabled=True), + end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), ), ) diff --git a/tests/voice/test_12_smart_turn_with_files.py b/tests/voice/test_12_smart_turn_with_files.py index 553b6ab..90aa437 100644 --- a/tests/voice/test_12_smart_turn_with_files.py +++ b/tests/voice/test_12_smart_turn_with_files.py @@ -28,7 +28,6 @@ class PredictionTest(BaseModel): language="en", expected=SmartTurnPredictionResult( prediction=False, - probability=0.095, ), ), PredictionTest( @@ -37,7 +36,6 @@ class PredictionTest(BaseModel): language="en", expected=SmartTurnPredictionResult( prediction=False, - probability=0.011, ), ), PredictionTest( @@ -46,7 +44,6 @@ class PredictionTest(BaseModel): language="en", expected=SmartTurnPredictionResult( prediction=True, - probability=0.892, ), ), ] @@ -79,9 +76,3 @@ async def test_prediction(sample: PredictionTest): # Check result assert 
result.prediction == sample.expected.prediction - - # Prediction within 5% of expected - assert ( - result.probability >= sample.expected.probability - 0.05 - and result.probability <= sample.expected.probability + 0.05 - ) diff --git a/tests/voice/test_13_smart_turn_transcribe.py b/tests/voice/test_13_smart_turn_transcribe.py index 9283c33..d7ec6b6 100644 --- a/tests/voice/test_13_smart_turn_transcribe.py +++ b/tests/voice/test_13_smart_turn_transcribe.py @@ -12,6 +12,7 @@ from speechmatics.voice import AdditionalVocabEntry from speechmatics.voice import AgentServerMessageType from speechmatics.voice import EndOfUtteranceMode +from speechmatics.voice import SmartTurnConfig from speechmatics.voice import SpeechSegmentConfig from speechmatics.voice import VoiceAgentConfig from speechmatics.voice._smart_turn import SmartTurnDetector @@ -94,13 +95,13 @@ async def test_prediction(sample: TranscriptionTest): connect=False, config=VoiceAgentConfig( max_delay=0.7, - end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN, + end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, end_of_utterance_silence_trigger=0.5, enable_diarization=True, sample_rate=sample.sample_rate, additional_vocab=sample.additional_vocab, - use_forced_eou_message=True, speech_segment_config=SpeechSegmentConfig(emit_sentences=False), + smart_turn_config=SmartTurnConfig(enabled=True), ), ) diff --git a/tests/voice/test_14_presets.py b/tests/voice/test_14_presets.py index 1669392..2f51da0 100644 --- a/tests/voice/test_14_presets.py +++ b/tests/voice/test_14_presets.py @@ -11,20 +11,18 @@ async def test_presets(): """Test VoiceAgentConfigPreset presets.""" # Create a preset - preset: VoiceAgentConfig = VoiceAgentConfigPreset.LOW_LATENCY() + preset: VoiceAgentConfig = VoiceAgentConfigPreset.FAST() assert preset is not None assert preset.speech_segment_config.emit_sentences is True # Overlay #1 - preset: VoiceAgentConfig = VoiceAgentConfigPreset.LOW_LATENCY( - VoiceAgentConfig(max_delay=12.34, enable_diarization=False) - ) + preset: VoiceAgentConfig = VoiceAgentConfigPreset.FAST(VoiceAgentConfig(max_delay=12.34, enable_diarization=False)) assert preset is not None assert preset.max_delay == 12.34 assert preset.enable_diarization is False # Overlay #2 - preset: VoiceAgentConfig = VoiceAgentConfigPreset.LOW_LATENCY( + preset: VoiceAgentConfig = VoiceAgentConfigPreset.FAST( VoiceAgentConfig(speech_segment_config=SpeechSegmentConfig(emit_sentences=False)) ) assert preset is not None @@ -33,10 +31,10 @@ async def test_presets(): # Preset names presets = VoiceAgentConfigPreset.list_presets() - assert "low_latency" in presets + assert "fast" in presets # Get a preset by a name - preset: VoiceAgentConfig = VoiceAgentConfigPreset.load("low_latency") + preset: VoiceAgentConfig = VoiceAgentConfigPreset.load("fast") assert preset is not None @@ -45,7 +43,7 @@ async def test_json_presets(): """Test VoiceAgentConfigPreset JSON presets.""" # With a JSON string overlay - preset: VoiceAgentConfig = VoiceAgentConfigPreset.load("low_latency", '{"operating_point": "enhanced"}') + preset: VoiceAgentConfig = VoiceAgentConfigPreset.load("fast", '{"operating_point": "enhanced"}') assert preset is not None assert preset.operating_point == OperatingPoint.ENHANCED @@ -55,4 +53,4 @@ async def test_json_presets(): # Check with invalid overlay with pytest.raises(ValueError): - VoiceAgentConfigPreset.load("low_latency", '{"invalid": "value"}') + VoiceAgentConfigPreset.load("fast", '{"invalid": "value"}') From 9cc65f7e60e3bbfd4cfb4d383f1fd76061c0342f Mon Sep 17 
00:00:00 2001
From: Sam Sykes
Date: Thu, 4 Dec 2025 10:45:23 +0000
Subject: [PATCH 04/17] Merge fix with master.

---
 examples/tts/tts_autoplay/README.md | 18 +++++++++---------
 examples/tts/tts_autoplay/requirements.txt | 2 +-
 .../tts/tts_autoplay/tts_stream_example.py | 16 ++++++++--------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/examples/tts/tts_autoplay/README.md b/examples/tts/tts_autoplay/README.md
index 3bd9dd6..5d540fb 100644
--- a/examples/tts/tts_autoplay/README.md
+++ b/examples/tts/tts_autoplay/README.md
@@ -5,21 +5,21 @@ You must have an audio output device configured on your system for this example
 ## How it Works
 There are two main components in this example, an audio generator and an audio player. These components are run concurrently using asyncio as tasks, orchestrated by the main() function, to generate and play audio in real-time.
-### audio_generator() 
+### audio_generator()
-This producer function connects to the Speechmatics TTS API using the AsyncClient. It calls client.generate() with your text, the voice you want to use, and the output format - RAW_PCM_16000 in this example. 
-The code iterates over the audio data as it is streamed in chunks (iter_chunked), and accumulates in a bytearray buffer. 
+This producer function connects to the Speechmatics TTS API using the AsyncClient. It calls client.generate() with your text, the voice you want to use, and the output format - RAW_PCM_16000 in this example.
+The code iterates over the audio data as it is streamed in chunks (iter_chunked), and accumulates it in a bytearray buffer.
 The while len(buffer) >= 2 loop reads each audio sample containing 2 bytes, from the buffer, and converts it to a numpy array of int-16 values, which is then put into the audio_queue.
-The processed 2 byte sample is then removed from the front of the buffer. 
+The processed 2 byte sample is then removed from the front of the buffer.
 END_OF_STREAM is used as a sentinel value to signal the end of the audio stream, with no more audio data to process.
 If an error occurs during audio generation, the END_OF_STREAM sentinel value is still put into the queue to signal the end of the audio stream to prevent the consumer, audio_player(), from getting stuck in an infinite loop, and raises the exception.
-### audio_player() 
+### audio_player()
-This consumer function initialises a sounddevice OutputStream, which is responsible for streaming the audio data to the default audio output device. Within the outputstream, the while True loop means there is continous processing of the incoming audio data. 
-sample = await asyncio.wait_for(play_queue.get(), timeout=0.1) fetches the next sample from the queue, or waits for 0.1 seconds if the queue is empty. 
-If the sample is END_OF_STREAM, the while loop breaks and the audio player exits. 
+This consumer function initialises a sounddevice OutputStream, which is responsible for streaming the audio data to the default audio output device. Within the OutputStream, the while True loop means there is continuous processing of the incoming audio data.
+sample = await asyncio.wait_for(play_queue.get(), timeout=0.1) fetches the next sample from the queue, or waits for 0.1 seconds if the queue is empty.
+If the sample is END_OF_STREAM, the while loop breaks and the audio player exits.
 If the sample is not END_OF_STREAM, it is converted to a numpy array of int-16 values and written to the audio output device using the sounddevice OutputStream.
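The queue-and-sentinel handoff described above is independent of the TTS client itself. As a rough, self-contained sketch (placeholder integer samples instead of real audio, and names chosen only for illustration), the flow between the two tasks looks like this:

```python
import asyncio

END_OF_STREAM = object()  # sentinel marking the end of the stream


async def producer(queue: asyncio.Queue) -> None:
    try:
        for sample in range(5):  # stand-in for decoded audio samples
            await queue.put(sample)
    finally:
        # Always enqueue the sentinel, even on error, so the consumer can exit
        await queue.put(END_OF_STREAM)


async def consumer(queue: asyncio.Queue) -> None:
    while True:
        try:
            sample = await asyncio.wait_for(queue.get(), timeout=0.1)
        except asyncio.TimeoutError:
            continue  # queue momentarily empty; keep waiting
        if sample is END_OF_STREAM:
            break
        print(f"play {sample}")  # a real player would write audio here
        queue.task_done()


async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    await asyncio.gather(producer(queue), consumer(queue))


asyncio.run(main())
```

The important detail is that the producer always enqueues the sentinel, even when an error occurs, so the consumer's loop is guaranteed to terminate.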
-play_queue.task_done() is called to signal that the sample has been processed. +play_queue.task_done() is called to signal that the sample has been processed. If an error occurs during audio playback, the END_OF_STREAM sentinel value is still put into the queue to signal the end of the audio stream to prevent the audio_player() from getting stuck in an infinite loop, and raises the exception. ## Installation diff --git a/examples/tts/tts_autoplay/requirements.txt b/examples/tts/tts_autoplay/requirements.txt index 550abbc..d4ef65f 100644 --- a/examples/tts/tts_autoplay/requirements.txt +++ b/examples/tts/tts_autoplay/requirements.txt @@ -1,3 +1,3 @@ -numpy>=1.24.3 sounddevice>=0.4.6 +numpy>=1.24.3 speechmatics-tts>=0.1.0 diff --git a/examples/tts/tts_autoplay/tts_stream_example.py b/examples/tts/tts_autoplay/tts_stream_example.py index 6114f2f..ec4e860 100644 --- a/examples/tts/tts_autoplay/tts_stream_example.py +++ b/examples/tts/tts_autoplay/tts_stream_example.py @@ -42,10 +42,10 @@ async def audio_generator(audio_queue: asyncio.Queue, text: str, voice: str, out sample = int.from_bytes(buffer[:2], byteorder='little', signed=True) await audio_queue.put(sample) buffer = buffer[2:] - + await audio_queue.put(END_OF_STREAM) print("Audio generated and put into queue.") - + except Exception as e: print(f"[{'Generator'}] An error occurred in the audio generator: {e}") await audio_queue.put(END_OF_STREAM) @@ -71,22 +71,22 @@ async def audio_player(play_queue: asyncio.Queue) -> None: stream.write(audio_data) buffer=[] break - + buffer.append(sample) if len(buffer) >= CHUNK_SIZE: audio_data=np.array(buffer[:CHUNK_SIZE], dtype=np.int16) stream.write(audio_data) buffer=buffer[CHUNK_SIZE:] - + play_queue.task_done() - + except asyncio.TimeoutError: if buffer: audio_data=np.array(buffer, dtype=np.int16) stream.write(audio_data) buffer=[] continue - + except Exception as e: print(f"[{'Player'}] An error occurred playing audio chunk {e}") raise @@ -106,10 +106,10 @@ async def main() -> None: asyncio.create_task(audio_generator(play_queue, TEXT, VOICE, OUTPUT_FORMAT)), asyncio.create_task(audio_player(play_queue)) ] - + try: await asyncio.gather(*tasks) - + except Exception as e: for task in tasks: task.cancel() From 428b490b18e71a73d28bedcee2fa2f8ecd20aabd Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Thu, 4 Dec 2025 13:11:41 +0000 Subject: [PATCH 05/17] Fixes to tests after merge. 
--- tests/voice/_utils.py | 109 ++++++++++++------- tests/voice/test_05_utterance.py | 56 ++++------ tests/voice/test_07_languages.py | 13 ++- tests/voice/test_08_multiple_speakers.py | 2 +- tests/voice/test_12_smart_turn_with_files.py | 4 +- 5 files changed, 108 insertions(+), 76 deletions(-) diff --git a/tests/voice/_utils.py b/tests/voice/_utils.py index c663e99..4d2c661 100644 --- a/tests/voice/_utils.py +++ b/tests/voice/_utils.py @@ -1,4 +1,5 @@ import asyncio +import datetime import json import os import time @@ -85,35 +86,42 @@ async def send_audio_file( # Delay is based off 16kHz int16 and chunk size delay = chunk_size / sample_rate / sample_size - # Load the file - async with aiofiles.open(file, "rb") as wav_file: - # Trim off the WAV file header - await wav_file.seek(44) + # Catch errors - we can be lazy as this is only for testing + try: - # Send audio data - next_time = time.perf_counter() + delay - while not terminate_event.is_set() if terminate_event else True: - """Reads all chunks until the end of the file with precision delay.""" + # Load the file + async with aiofiles.open(file, "rb") as wav_file: + # Trim off the WAV file header + await wav_file.seek(44) - # Read chunk - chunk = await wav_file.read(chunk_size) + # Send audio data + next_time = time.perf_counter() + delay + while not terminate_event.is_set() if terminate_event else True: + """Reads all chunks until the end of the file with precision delay.""" - # End of file - if not chunk: - break + # Read chunk + chunk = await wav_file.read(chunk_size) - # Send audio to client - await client.send_audio(chunk) + # End of file + if not chunk: + break - # Do any callbacks - if progress_callback: - progress_callback(len(chunk)) + # Send audio to client + await client.send_audio(chunk) - # Precision delay - sleep_time = next_time - time.perf_counter() - if sleep_time > 0: - await asyncio.sleep(sleep_time) - next_time += delay + # Do any callbacks + if progress_callback: + progress_callback(len(chunk)) + + # Precision delay + sleep_time = next_time - time.perf_counter() + if sleep_time > 0: + await asyncio.sleep(sleep_time) + next_time += delay + + # Catch errors + except Exception: + pass async def load_audio_file(audio_file: str) -> bytes: @@ -165,23 +173,50 @@ async def send_silence( # Iterations required iterations = int(duration / delay) - # Keep sending - while (not terminate_event.is_set() if terminate_event else True) and iterations > 0: - # Send audio to client - await client.send_audio(silence) + # Catch errors - we can be lazy as this is only for testing + try: + + # Keep sending + while (not terminate_event.is_set() if terminate_event else True) and iterations > 0: + # Send audio to client + await client.send_audio(silence) + + # Do any callbacks + if progress_callback: + progress_callback(len(silence)) + + # Precision delay + sleep_time = next_time - time.perf_counter() + if sleep_time > 0: + await asyncio.sleep(sleep_time) + next_time += delay + + # Reduce iterations + iterations -= 1 + + # Catch errors - we can be lazy as this is only for testing + except Exception: + pass + + +def log_client_messages(client: VoiceAgentClient, messages: list[AgentServerMessageType] | None = None) -> None: + """Register and log client messages.""" + + # Start time + start_time = datetime.datetime.now() - # Do any callbacks - if progress_callback: - progress_callback(len(silence)) + # Callback for each message + def _log_message(message): + ts = (datetime.datetime.now() - start_time).total_seconds() + print(json.dumps({"ts": 
round(ts, 3), "payload": message})) - # Precision delay - sleep_time = next_time - time.perf_counter() - if sleep_time > 0: - await asyncio.sleep(sleep_time) - next_time += delay + # Set fo all agent messages, apart from AUDIO_ADDED + if messages is None: + messages = [message for message in AgentServerMessageType if message != AgentServerMessageType.AUDIO_ADDED] - # Reduce iterations - iterations -= 1 + # Add listeners + for message_type in messages: + client.on(message_type, _log_message) class ConversationLog: diff --git a/tests/voice/test_05_utterance.py b/tests/voice/test_05_utterance.py index 6026c79..9c3c660 100644 --- a/tests/voice/test_05_utterance.py +++ b/tests/voice/test_05_utterance.py @@ -7,6 +7,7 @@ import pytest from _utils import ConversationLog from _utils import get_client +from _utils import log_client_messages from speechmatics.voice import AgentServerMessageType from speechmatics.voice import EndOfTurnConfig @@ -36,12 +37,19 @@ async def test_speech_fragments(): start_time = datetime.datetime.now() # Create a client - client = await get_client(api_key="NONE", connect=False) + client = await get_client( + api_key="NONE", + connect=False, + ) assert client is not None # Start the queue client._start_stt_queue() + # Log messages + if SHOW_LOG: + log_client_messages(client) + # Event to wait event_rx: asyncio.Event = asyncio.Event() last_message: Optional[dict[str, Any]] = None @@ -77,7 +85,7 @@ async def send_message(idx: int, count: int = 1, use_ttl: bool = True): # Add listener for first interim segment message_reset() - client.once(AgentServerMessageType.ADD_PARTIAL_SEGMENT, message_rx) + client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, message_rx) # Inject first partial await send_message(0, count=6, use_ttl=False) @@ -100,22 +108,14 @@ async def send_message(idx: int, count: int = 1, use_ttl: bool = True): assert seg0["text"] == "Welcome" assert f"{seg0['speaker_id']}: {seg0['text']}" == "S1: Welcome" - # Add listener for final segment - message_reset() - client.once(AgentServerMessageType.ADD_SEGMENT, message_rx) - # Send a more partials and finals await send_message(5, count=8, use_ttl=False) - # Wait for final segment - try: - await asyncio.wait_for(event_rx.wait(), timeout=5.0) - assert last_message is not None - except asyncio.TimeoutError: - pytest.fail("ADD_SEGMENT event was not received within 5 seconds") + # Yield a short while + await asyncio.sleep(0.5) # Check the right message was received - assert last_message.get("message") == AgentServerMessageType.ADD_SEGMENT + assert last_message.get("message") == AgentServerMessageType.ADD_PARTIAL_SEGMENT # Check the segment segments = last_message.get("segments", []) @@ -154,14 +154,9 @@ async def test_end_of_utterance_fixed(): ) assert client is not None - # Debug + # Log messages if SHOW_LOG: - client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, lambda message: print(message)) - client.on(AgentServerMessageType.ADD_SEGMENT, lambda message: print(message)) - client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, lambda message: print(message)) - client.on(AgentServerMessageType.START_OF_TURN, lambda message: print(message)) - client.on(AgentServerMessageType.END_OF_TURN, lambda message: print(message)) - client.on(AgentServerMessageType.END_OF_UTTERANCE, lambda message: print(message)) + log_client_messages(client) # Start the queue client._start_stt_queue() @@ -272,14 +267,9 @@ async def send_message(idx: int, count: int = 1, use_ttl: bool = True): # Emit the message 
client.emit(message["payload"]["message"], message["payload"]) - # Debug + # Log messages if SHOW_LOG: - client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, lambda message: print(message)) - client.on(AgentServerMessageType.ADD_SEGMENT, lambda message: print(message)) - client.on(AgentServerMessageType.START_OF_TURN, lambda message: print(message)) - client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, lambda message: print(message)) - client.on(AgentServerMessageType.END_OF_TURN, lambda message: print(message)) - client.on(AgentServerMessageType.END_OF_UTTERANCE, lambda message: print(message)) + log_client_messages(client) # Inject conversation await send_message(0, count=12, use_ttl=False) @@ -350,6 +340,10 @@ async def test_end_of_utterance_adaptive_vad(): ) assert client is not None + # Log messages + if SHOW_LOG: + log_client_messages(client) + # Start the queue client._start_stt_queue() @@ -398,14 +392,6 @@ async def send_message(idx: int, count: int = 1, use_ttl: bool = True): # Add listener for end of turn client.once(AgentServerMessageType.END_OF_TURN, eot_rx) - # Debug - if SHOW_LOG: - client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, lambda message: print(message)) - client.on(AgentServerMessageType.ADD_SEGMENT, lambda message: print(message)) - client.on(AgentServerMessageType.START_OF_TURN, lambda message: print(message)) - client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, lambda message: print(message)) - client.on(AgentServerMessageType.END_OF_TURN, lambda message: print(message)) - # Inject conversation up to the penultimate final from the STT await send_message(0, count=12, use_ttl=True) diff --git a/tests/voice/test_07_languages.py b/tests/voice/test_07_languages.py index e892f89..c83428d 100644 --- a/tests/voice/test_07_languages.py +++ b/tests/voice/test_07_languages.py @@ -8,6 +8,7 @@ import pytest from _utils import get_client +from _utils import log_client_messages from _utils import send_audio_file from _utils import send_silence @@ -15,6 +16,7 @@ from speechmatics.voice import AgentServerMessageType from speechmatics.voice import EndOfTurnConfig from speechmatics.voice import EndOfUtteranceMode +from speechmatics.voice import SpeechSegmentConfig from speechmatics.voice import VoiceAgentConfig from speechmatics.voice._utils import TextUtils @@ -24,6 +26,7 @@ # Constants API_KEY = os.getenv("SPEECHMATICS_API_KEY") URL = "wss://eu2.rt.speechmatics.com/v2" +SHOW_LOG = os.getenv("SPEECHMATICS_SHOW_LOG", "0").lower() in ["1", "true"] @dataclass @@ -117,10 +120,14 @@ async def test_transcribe_languages(sample: AudioSample): connect=False, config=VoiceAgentConfig( max_delay=1.2, - end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL, + end_of_utterance_mode=EndOfUtteranceMode.FIXED, + end_of_utterance_silence_trigger=1.2, language=sample.language, additional_vocab=[AdditionalVocabEntry(content=vocab) for vocab in sample.vocab], end_of_turn_config=EndOfTurnConfig(use_forced_eou=False), + speech_segment_config=SpeechSegmentConfig( + emit_sentences=False, + ), ), ) assert client is not None @@ -136,6 +143,10 @@ async def test_transcribe_languages(sample: AudioSample): # Start time start_time = datetime.datetime.now() + # Log messages + if SHOW_LOG: + log_client_messages(client) + # Bytes logger def log_bytes_sent(bytes): nonlocal bytes_sent diff --git a/tests/voice/test_08_multiple_speakers.py b/tests/voice/test_08_multiple_speakers.py index 553d8ed..fa662aa 100644 --- a/tests/voice/test_08_multiple_speakers.py +++ b/tests/voice/test_08_multiple_speakers.py @@ 
-8,7 +8,6 @@ import pytest from _utils import get_client from _utils import send_audio_file -from pydantic import BaseModel from speechmatics.voice import AdditionalVocabEntry from speechmatics.voice import AgentServerMessageType @@ -17,6 +16,7 @@ from speechmatics.voice import SpeakerFocusMode from speechmatics.voice import SpeechSegmentConfig from speechmatics.voice import VoiceAgentConfig +from speechmatics.voice._models import BaseModel from speechmatics.voice._models import SpeakerSegment # Skip for CI testing diff --git a/tests/voice/test_12_smart_turn_with_files.py b/tests/voice/test_12_smart_turn_with_files.py index 90aa437..7b90ec4 100644 --- a/tests/voice/test_12_smart_turn_with_files.py +++ b/tests/voice/test_12_smart_turn_with_files.py @@ -71,8 +71,8 @@ async def test_prediction(sample: PredictionTest): # Run an inference result = await detector.predict(bytes_array, language=sample.language, sample_rate=16000, sample_width=2) - # Processing time < 100ms - assert result.processing_time < 0.1 + # Processing time < 200ms + assert result.processing_time < 0.2 # Check result assert result.prediction == sample.expected.prediction From 3f336f4bd1d228bc2374a3e168d1ffacb0260875 Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Thu, 4 Dec 2025 13:13:56 +0000 Subject: [PATCH 06/17] Optional[] fix in test util class. --- tests/voice/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/voice/_utils.py b/tests/voice/_utils.py index 4d2c661..8308e90 100644 --- a/tests/voice/_utils.py +++ b/tests/voice/_utils.py @@ -199,7 +199,7 @@ async def send_silence( pass -def log_client_messages(client: VoiceAgentClient, messages: list[AgentServerMessageType] | None = None) -> None: +def log_client_messages(client: VoiceAgentClient, messages: Optional[list[AgentServerMessageType]] = None) -> None: """Register and log client messages.""" # Start time From eca7e642e5667e85deb67f3a4e1d507a8baf96ab Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Thu, 4 Dec 2025 14:29:01 +0000 Subject: [PATCH 07/17] updated docs and extra params ported from transcript config --- sdk/voice/README.md | 98 +++++++++++++++--------- sdk/voice/speechmatics/voice/__init__.py | 2 + sdk/voice/speechmatics/voice/_client.py | 1 + sdk/voice/speechmatics/voice/_models.py | 31 ++++++-- sdk/voice/speechmatics/voice/_presets.py | 28 ++++++- 5 files changed, 116 insertions(+), 44 deletions(-) diff --git a/sdk/voice/README.md b/sdk/voice/README.md index a876ad1..7737f8b 100644 --- a/sdk/voice/README.md +++ b/sdk/voice/README.md @@ -27,10 +27,9 @@ The Voice SDK is a higher-level abstraction built on top of the Speechmatics Rea **Use Real-Time SDK when:** -- You need raw word-level events -- Building custom segmentation logic +- You only need raw word-level events +- Building custom segmentation / aggregation logic - You want fine-grained control over every event -- Processing batch files or custom workflows ## Installation @@ -38,16 +37,46 @@ The Voice SDK is a higher-level abstraction built on top of the Speechmatics Rea # Standard installation pip install speechmatics-voice -# With SMART_TURN (ML-based turn detection) +# With VAD and SMART_TURN (ML-based turn detection) pip install speechmatics-voice[smart] ``` -> **Note:** `SMART_TURN` requires additional ML dependencies (ONNX runtime, transformers). If not installed, it automatically falls back to `ADAPTIVE` mode. +> **Note:** Some features require additional ML dependencies (ONNX runtime, transformers). 
If not installed, these features will be unavailable and a warning will be shown. + +### Use within Docker + +If you are using a Docker container with the Voice SDK installed and you require the smart features, then you can use the following in your `Dockerfile` to make sure the ML models are included and not downloaded at runtime. + +```python +""" +Download required models +""" + +from speechmatics.voice import SileroVAD, SmartTurnDetector + + +def load_models(): + SileroVAD.download_model() + SmartTurnDetector.download_model() + + +if __name__ == "__main__": + load_models() +``` + +And then include the following in tour `Dockerfile`: + +``` +COPY ./models.py models.py +RUN uv run models.py +``` ## Quick Start ### Basic Example +A simple example that will show complete sentences as they have been finalized. Different speakers will be shown with a different ID. + ```python import asyncio import os @@ -70,7 +99,7 @@ async def main(): print(f"{speaker}: {text}") # Setup microphone - mic = Microphone(sample_rate=16000, chunk_size=320) + mic = Microphone(sample_rate=16000, chunk_size=160) if not mic.start(): print("Error: Microphone not available") return @@ -80,7 +109,7 @@ async def main(): try: while True: - audio_chunk = await mic.read(320) + audio_chunk = await mic.read(160) await client.send_audio(audio_chunk) except KeyboardInterrupt: pass @@ -96,13 +125,7 @@ if __name__ == "__main__": Presets provide optimized configurations for common use cases: ```python -# External end of turn preset - endpointing handled by the client -client = VoiceAgentClient(api_key=api_key, preset="external") - -# Scribe preset - for note-taking -client = VoiceAgentClient(api_key=api_key, preset="scribe") - -# Low latency preset - for fast responses +# Low latency preset - for fast responses (may split speech in to smaller segments) client = VoiceAgentClient(api_key=api_key, preset="fast") # Conversation preset - for natural dialogue @@ -111,6 +134,12 @@ client = VoiceAgentClient(api_key=api_key, preset="adaptive") # Advanced conversation with ML turn detection client = VoiceAgentClient(api_key=api_key, preset="smart_turn") +# External end of turn preset - endpointing handled by the client +client = VoiceAgentClient(api_key=api_key, preset="external") + +# Scribe preset - for note-taking +client = VoiceAgentClient(api_key=api_key, preset="scribe") + # Captions preset - for live captioning client = VoiceAgentClient(api_key=api_key, preset="captions") ``` @@ -146,8 +175,8 @@ Domain-specific model (e.g., `"finance"`, `"medical"`). See [supported languages **`output_locale`** (str, default: `None`) Output locale for formatting (e.g., `"en-GB"`, `"en-US"`). See [supported languages and locales](https://docs.speechmatics.com/speech-to-text/languages). -**`enable_diarization`** (bool, default: `False`) -Enable speaker diarization to identify and label different speakers. +**`max_delay`** (float, default: `0.7`) +Maximum transcription delay for word emission. ### Turn Detection Parameters @@ -156,20 +185,16 @@ Controls how turn endings are detected: - **`FIXED`** - Uses fixed silence threshold. Fast but may split slow speech. - **`ADAPTIVE`** - Adjusts delay based on speech rate, pauses, and disfluencies. Best for natural conversation. -- **`SMART_TURN`** - Uses ML model to detect acoustic turn-taking cues. Requires `[smart]` extras. - **`EXTERNAL`** - Manual control via `client.finalize()`. For custom turn logic. 
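For illustration only, selecting one of the modes above is a matter of setting `end_of_utterance_mode` on the config; the values below are arbitrary examples rather than recommended defaults:

```python
from speechmatics.voice import EndOfUtteranceMode, VoiceAgentConfig, VoiceAgentClient

# Illustrative values only - tune end_of_utterance_silence_trigger for your use case
config = VoiceAgentConfig(
    enable_diarization=True,
    end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
    end_of_utterance_silence_trigger=0.6,
)

client = VoiceAgentClient(api_key="YOUR_API_KEY", config=config)
```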
**`end_of_utterance_silence_trigger`** (float, default: `0.2`) -Silence duration in seconds to trigger turn end. - -**`end_of_utterance_max_delay`** (float, default: `10.0`) -Maximum delay before forcing turn end. - -**`max_delay`** (float, default: `0.7`) -Maximum transcription delay for word emission. +Silence duration in seconds to trigger turn end (also used for the basis of adaptive delay). ### Speaker Configuration +**`enable_diarization`** (bool, default: `False`) +Enable speaker diarization to identify and label different speakers. + **`speaker_sensitivity`** (float, default: `0.5`) Diarization sensitivity between 0.0 and 1.0. Higher values detect more speakers. @@ -185,7 +210,7 @@ Configure speaker focus/ignore rules. ```python from speechmatics.voice import SpeakerFocusConfig, SpeakerFocusMode -# Focus only on specific speakers +# Focus only on specific speakers, but keep words from other speakers config = VoiceAgentConfig( enable_diarization=True, speaker_config=SpeakerFocusConfig( @@ -198,8 +223,7 @@ config = VoiceAgentConfig( config = VoiceAgentConfig( enable_diarization=True, speaker_config=SpeakerFocusConfig( - ignore_speakers=["S3"], - focus_mode=SpeakerFocusMode.IGNORE + ignore_speakers=["S3"] ) ) ``` @@ -265,7 +289,7 @@ Configure SMART_TURN behavior (buffer length, threshold). Include word-level timing data in segments. **`include_partials`** (bool, default: `True`) -Emit partial segments. Set to `False` for final-only output. +Include interim (lower confidence) words in the emitted segments. Set to `False` for final-only output. ### Configuration with Overlays @@ -284,7 +308,7 @@ config = VoiceAgentConfigPreset.SCRIBE( # Available presets presets = VoiceAgentConfigPreset.list_presets() -# ['fast', 'adaptive', 'smart_turn', 'scribe', 'captions'] +# ['fast', 'adaptive', 'smart_turn', 'scribe', 'captions', '...'] ``` ### Configuration Serialization @@ -502,23 +526,27 @@ def on_speaker_end(message): #### SPEAKERS_RESULT -Emitted when speaker enrollment completes. +Emitted when speaker enrolment completes. ```python -# Request speaker IDs at end of session -await client.send_message({"message": "GetSpeakers", "final": True}) - +# Listen for the result @client.on(AgentServerMessageType.SPEAKERS_RESULT) def on_speakers(message): for speaker in message["speakers"]: print(f"Speaker {speaker['label']}: {speaker['speaker_identifiers']}") + +# Request speaker IDs at end of session +await client.send_message({"message": AgentClientMessageType.GET_SPEAKERS, "final": True}) + +# Request speaker IDs now +await client.send_message({"message": AgentClientMessageType.GET_SPEAKERS}) ``` ### Additional Events **`START_OF_TURN`** - Emitted at the beginning of a new turn. -**`END_OF_TURN_PREDICTION`** - Emitted during `ADAPTIVE` or `SMART_TURN` mode to predict turn completion (fires before `END_OF_TURN`). +**`END_OF_TURN_PREDICTION`** - Emitted during `ADAPTIVE` mode to predict turn completion (fires before `END_OF_TURN`). **`END_OF_UTTERANCE`** - Low-level STT engine event (fires when silence threshold is reached). 
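As a sketch of how these additional events can be observed, mirroring the `client.on(...)` registration style used elsewhere in this patch (the handlers and preset choice here are illustrative, not prescribed):

```python
import os

from speechmatics.voice import AgentServerMessageType, VoiceAgentClient

client = VoiceAgentClient(api_key=os.environ["SPEECHMATICS_API_KEY"], preset="adaptive")

# Log the turn lifecycle events described above as they arrive
for event in (
    AgentServerMessageType.START_OF_TURN,
    AgentServerMessageType.END_OF_TURN_PREDICTION,
    AgentServerMessageType.END_OF_TURN,
    AgentServerMessageType.END_OF_UTTERANCE,
):
    client.on(event, lambda message, name=event: print(name, message))
```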
@@ -655,7 +683,7 @@ See the `examples/voice/` directory for complete working examples: - **`scribe/`** - Note-taking with custom vocabulary - **`cli/`** - Full-featured CLI with all options -## API Reference +## SDK Class Reference ### VoiceAgentClient diff --git a/sdk/voice/speechmatics/voice/__init__.py b/sdk/voice/speechmatics/voice/__init__.py index 5d84048..21f517c 100644 --- a/sdk/voice/speechmatics/voice/__init__.py +++ b/sdk/voice/speechmatics/voice/__init__.py @@ -26,6 +26,7 @@ from ._models import EndOfTurnConfig from ._models import EndOfTurnPenaltyItem from ._models import EndOfUtteranceMode +from ._models import MaxDelayMode from ._models import SegmentMessage from ._models import SessionMetricsMessage from ._models import SmartTurnConfig @@ -54,6 +55,7 @@ "EndOfTurnConfig", "EndOfTurnPenaltyItem", "EndOfUtteranceMode", + "MaxDelayMode", "OperatingPoint", "SpeakerDiarizationConfig", "SpeakerFocusConfig", diff --git a/sdk/voice/speechmatics/voice/_client.py b/sdk/voice/speechmatics/voice/_client.py index 2b1d838..fef7d02 100644 --- a/sdk/voice/speechmatics/voice/_client.py +++ b/sdk/voice/speechmatics/voice/_client.py @@ -400,6 +400,7 @@ def _prepare_config( operating_point=config.operating_point, diarization="speaker" if config.enable_diarization else None, enable_partials=True, + enable_entities=config.enable_entities, max_delay=config.max_delay, max_delay_mode="fixed", audio_filtering_config={ diff --git a/sdk/voice/speechmatics/voice/_models.py b/sdk/voice/speechmatics/voice/_models.py index dd72cd5..5b12dfa 100644 --- a/sdk/voice/speechmatics/voice/_models.py +++ b/sdk/voice/speechmatics/voice/_models.py @@ -67,6 +67,13 @@ class EndOfUtteranceMode(str, Enum): ADAPTIVE = "adaptive" +class MaxDelayMode(str, Enum): + """Max delay mode options for transcription.""" + + FIXED = "fixed" + FLEXIBLE = "flexible" + + class TranscriptionUpdatePreset(str, Enum): """Filter options for when to emit changes to transcription. @@ -512,11 +519,18 @@ class VoiceAgentConfig(BaseModel): than English. See documentation for more information. Defaults to `None`. - enable_diarization: Enable speaker diarization. When enabled, the STT engine will - determine and attribute words to unique speakers. The speaker_sensitivity - parameter can be used to adjust the sensitivity of diarization. + enable_entities: Enable entity detection. When enabled, the STT engine will + detect and attribute words to entities. This is useful for languages that use + different entities than English. See documentation for more information. Defaults to `False`. + max_delay_mode: Determines whether the threshold specified in max_delay can be exceeded + if a potential entity is detected. Flexible means if a potential entity + is detected, then the max_delay can be overriden until the end of that + entity. Fixed means that max_delay specified ignores any potential + entity that would not be completed within that threshold. + Defaults to `MaxDelayMode.FLEXIBLE`. + include_partials: Include partial segment fragments (words) in the output of AddPartialSegment messages. Partial fragments from the STT will always be used for speaker activity detection. If `include_results` is enabled, then partials will @@ -524,6 +538,11 @@ class VoiceAgentConfig(BaseModel): the formatted text output of individual segments. Defaults to `True`. + enable_diarization: Enable speaker diarization. When enabled, the STT engine will + determine and attribute words to unique speakers. 
The speaker_sensitivity + parameter can be used to adjust the sensitivity of diarization. + Defaults to `False`. + speaker_sensitivity: Diarization sensitivity. A higher value increases the sensitivity of diarization and helps when two or more speakers have similar voices. Defaults to `0.5`. @@ -654,10 +673,12 @@ class VoiceAgentConfig(BaseModel): end_of_utterance_mode: EndOfUtteranceMode = EndOfUtteranceMode.FIXED additional_vocab: list[AdditionalVocabEntry] = Field(default_factory=list) punctuation_overrides: Optional[dict] = None + enable_entities: bool = False + max_delay_mode: MaxDelayMode = MaxDelayMode.FLEXIBLE + include_partials: bool = True # Diarization enable_diarization: bool = False - include_partials: bool = True speaker_sensitivity: float = 0.5 max_speakers: Optional[int] = None prefer_current_speaker: bool = False @@ -1224,7 +1245,7 @@ class TurnPredictionMetadata(BaseModel): """ ttl: float - reasons: list[str] + reasons: list[str] = Field(default_factory=list, exclude=True) model_config = ConfigDict(extra="ignore") diff --git a/sdk/voice/speechmatics/voice/_presets.py b/sdk/voice/speechmatics/voice/_presets.py index 37c9705..22c2734 100644 --- a/sdk/voice/speechmatics/voice/_presets.py +++ b/sdk/voice/speechmatics/voice/_presets.py @@ -26,7 +26,7 @@ def FAST(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noq delay to finalizing the spoken sentences. It is not recommended for conversation, as it will not account for pauses, slow speech or disfluencies. - Use of this will requite `pip install speechmatics-voice[smart]` and may not + Use of this will require `pip install speechmatics-voice[smart]` and may not be suited to low-power devices. """ return VoiceAgentConfigPreset._merge_configs( @@ -71,7 +71,7 @@ def ADAPTIVE(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # finalizing the spoken sentences will be adjusted based on the words and whether there are any pauses, slow speech or disfluencies. - Use of this will requite `pip install speechmatics-voice[smart]` and may not + Use of this will require `pip install speechmatics-voice[smart]` and may not be suited to low-power devices. """ return VoiceAgentConfigPreset._merge_configs( @@ -100,7 +100,7 @@ def SMART_TURN(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: This preset will use a model to detect for acoustic indicators from the speaker to determine when a turn has ended. - Use of this will requite `pip install speechmatics-voice[smart]` and may not + Use of this will require `pip install speechmatics-voice[smart]` and may not be suited to low-power devices. """ return VoiceAgentConfigPreset._merge_configs( @@ -127,7 +127,7 @@ def SCRIBE(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # n This mode will emit partial and final segments as they become available. The end of utterance is set to fixed. End of turn is not required for note-taking. - Use of this will requite `pip install speechmatics-voice[smart]` and may not + Use of this will require `pip install speechmatics-voice[smart]` and may not be suited to low-power devices. """ return VoiceAgentConfigPreset._merge_configs( @@ -147,6 +147,26 @@ def SCRIBE(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # n overlay, ) + @staticmethod + def CAPTIONS(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802 + """Best suited for captioning. + + This mode will emit final segments as they become available. The end of + utterance is set to fixed. 
End of turn is not required for captioning. + """ + return VoiceAgentConfigPreset._merge_configs( + VoiceAgentConfig( + operating_point=OperatingPoint.ENHANCED, + enable_diarization=True, + max_delay=0.7, + end_of_utterance_silence_trigger=0.5, + end_of_utterance_mode=EndOfUtteranceMode.FIXED, + speech_segment_config=SpeechSegmentConfig(emit_sentences=True), + include_partials=False, + ), + overlay, + ) + @staticmethod def EXTERNAL(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802 """Best suited for external turn control. From ccbf8e41899ceeba11c239e16dc89a4d6b093603 Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Thu, 4 Dec 2025 14:34:37 +0000 Subject: [PATCH 08/17] docker --- sdk/voice/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sdk/voice/README.md b/sdk/voice/README.md index 7737f8b..13849e7 100644 --- a/sdk/voice/README.md +++ b/sdk/voice/README.md @@ -43,6 +43,10 @@ pip install speechmatics-voice[smart] > **Note:** Some features require additional ML dependencies (ONNX runtime, transformers). If not installed, these features will be unavailable and a warning will be shown. +
+ +👉 Click to see how to install with Docker. + ### Use within Docker If you are using a Docker container with the Voice SDK installed and you require the smart features, then you can use the following in your `Dockerfile` to make sure the ML models are included and not downloaded at runtime. @@ -71,6 +75,8 @@ COPY ./models.py models.py RUN uv run models.py ``` +
+ ## Quick Start ### Basic Example From c57acbdb9b40c85afc5fddffa1685e4c8bb1d42d Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Thu, 4 Dec 2025 14:35:19 +0000 Subject: [PATCH 09/17] doc fix --- sdk/voice/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/voice/README.md b/sdk/voice/README.md index 13849e7..d7b4de9 100644 --- a/sdk/voice/README.md +++ b/sdk/voice/README.md @@ -45,7 +45,7 @@ pip install speechmatics-voice[smart]
-👉 Click to see how to install with Docker. +👉 Click to see how to install with Docker. ### Use within Docker From a152277a00020e8c8fece89eab6121e5617910ea Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Fri, 5 Dec 2025 10:51:59 +0000 Subject: [PATCH 10/17] Squashed commit of the following: commit c119e3f8c6f0dfc44ac0d8bdf7c552463265b2ea Author: Lorna Armstrong Date: Fri Dec 5 10:05:59 2025 +0000 Updates to Voice SDK README (#67) * Trial Updates to README * Add Note Re No Provided Preset / Config * Update dependency on RT * Add Code Comment --------- Co-authored-by: Sam Sykes commit f4da9cf9aad1a59e88354544fde20ddcd78f2c41 Author: Sam Sykes Date: Thu Dec 4 22:52:24 2025 +0000 Update dependency on RT --- sdk/voice/README.md | 295 +++++++++++++++++++++------------------ sdk/voice/pyproject.toml | 2 +- 2 files changed, 157 insertions(+), 140 deletions(-) diff --git a/sdk/voice/README.md b/sdk/voice/README.md index d7b4de9..39e6e95 100644 --- a/sdk/voice/README.md +++ b/sdk/voice/README.md @@ -4,11 +4,26 @@ [![PyPI](https://img.shields.io/pypi/v/speechmatics-voice)](https://pypi.org/project/speechmatics-voice/) [![PythonSupport](https://img.shields.io/badge/Python-3.9%2B-green)](https://www.python.org/) -Python SDK for building voice-enabled applications with the Speechmatics Real-Time API. Optimized for conversational AI, voice agents, transcription services, and real-time captioning. +Python SDK for building voice-enabled applications using Speechmatics Real-Time API. Optimized for specific use cases: conversational AI, voice agents, transcription services, and real-time captioning. + +## Table of Contents +- [What is the Voice SDK?](#what-is-the-voice-sdk) +- [Installation](#installation) +- [Quick Start](#quick-start) +- [Configuration](#configuration) +- [Event Messages](#event-messages) +- [Common Usage Patterns](#common-usage-patterns) +- [Environment Variables](#environment-variables) +- [Examples](#examples) +- [SDK Class Reference](#sdk-class-reference) +- [Requirements](#requirements) +- [Documentation](#documentation) +- [License](#license) + ## What is the Voice SDK? -The Voice SDK is a higher-level abstraction built on top of the Speechmatics Real-Time API (`speechmatics-rt`). While the Real-Time SDK provides raw transcription events (words and utterances), the Voice SDK adds: +The Voice SDK is a higher-level abstraction built on top of the Speechmatics Real-Time API (`speechmatics-rt`). 
While the Real-Time API provides raw transcription events (words and utterances), the Voice SDK adds: - **Intelligent Segmentation** - Groups words into meaningful speech segments per speaker - **Turn Detection** - Automatically detects when speakers finish their turns using adaptive or ML-based methods @@ -16,19 +31,19 @@ The Voice SDK is a higher-level abstraction built on top of the Speechmatics Rea - **Preset Configurations** - Ready-to-use configs for common use cases (conversation, note-taking, captions) - **Simplified Event Handling** - Receive clean, structured segments instead of raw word-level events -### When to Use Voice SDK vs Real-Time SDK +### When to Use Voice SDK vs Real-Time API **Use Voice SDK when:** -- Building conversational AI or voice agents +- You are building conversational AI or voice agents - You need automatic turn detection - You want speaker-focused transcription - You need ready-to-use presets for common scenarios -**Use Real-Time SDK when:** +**Use Real-Time API when:** -- You only need raw word-level events -- Building custom segmentation / aggregation logic +- You only need raw, word-level events +- You are building custom segmentation / aggregation logic - You want fine-grained control over every event ## Installation @@ -45,15 +60,15 @@ pip install speechmatics-voice[smart]
-👉 Click to see how to install with Docker. +👉 Using Docker? Click to see how to install the required models. ### Use within Docker -If you are using a Docker container with the Voice SDK installed and you require the smart features, then you can use the following in your `Dockerfile` to make sure the ML models are included and not downloaded at runtime. +If you are using a Docker container with the Voice SDK installed and you require the smart features (`SMART_TURN`), then you can use the following in your `Dockerfile` to make sure the ML models are included and not downloaded at runtime. ```python """ -Download required models +Download the Voice SDK required models during the build process. """ from speechmatics.voice import SileroVAD, SmartTurnDetector @@ -68,12 +83,13 @@ if __name__ == "__main__": load_models() ``` -And then include the following in tour `Dockerfile`: +Then, in your `Dockerfile`, include the following: ``` COPY ./models.py models.py RUN uv run models.py ``` +This copies the script and runs it as part of the build.
@@ -81,7 +97,7 @@ RUN uv run models.py ### Basic Example -A simple example that will show complete sentences as they have been finalized. Different speakers will be shown with a different ID. +A simple example that shows complete sentences as they have been finalized, with different speakers shown with different IDs. ```python import asyncio @@ -90,13 +106,20 @@ from speechmatics.rt import Microphone from speechmatics.voice import VoiceAgentClient, AgentServerMessageType async def main(): + """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset""" + + # Audio configuration + SAMPLE_RATE = 16000 # Hz + CHUNK_SIZE = 160 # Samples per read + PRESET = "scribe" # Configuration preset + # Create client with preset client = VoiceAgentClient( api_key=os.getenv("SPEECHMATICS_API_KEY"), - preset="scribe" + preset=PRESET ) - # Handle final segments + # Print finalised segments of speech with speaker ID @client.on(AgentServerMessageType.ADD_SEGMENT) def on_segment(message): for segment in message["segments"]: @@ -105,17 +128,20 @@ async def main(): print(f"{speaker}: {text}") # Setup microphone - mic = Microphone(sample_rate=16000, chunk_size=160) + mic = Microphone(SAMPLE_RATE, CHUNK_SIZE) if not mic.start(): print("Error: Microphone not available") return - # Connect and stream + # Connect to the Voice Agent await client.connect() + # Stream microphone audio (interruptable using keyboard) try: while True: - audio_chunk = await mic.read(160) + audio_chunk = await mic.read(CHUNK_SIZE) + if not audio_chunk: + break # Microphone stopped producing data await client.send_audio(audio_chunk) except KeyboardInterrupt: pass @@ -126,10 +152,10 @@ if __name__ == "__main__": asyncio.run(main()) ``` -### Using Presets - -Presets provide optimized configurations for common use cases: +### Configuring a Voice Agent Client +When creating a VoiceAgentClient, there are several ways to configure it: +1. **Presets** - optimised configurations for common use cases. These require no further configuration to be set. ```python # Low latency preset - for fast responses (may split speech in to smaller segments) client = VoiceAgentClient(api_key=api_key, preset="fast") @@ -148,13 +174,18 @@ client = VoiceAgentClient(api_key=api_key, preset="scribe") # Captions preset - for live captioning client = VoiceAgentClient(api_key=api_key, preset="captions") + +# To view all available presets, use: +presets = VoiceAgentConfigPreset.list_presets() ``` -### Custom Configuration + +2. **Custom Configuration** - for more control, you can also specify custom configuration in a `VoiceAgentConfig` object. ```python from speechmatics.voice import VoiceAgentClient, VoiceAgentConfig, EndOfUtteranceMode +# Define your custom configuration config = VoiceAgentConfig( language="en", enable_diarization=True, @@ -165,53 +196,70 @@ config = VoiceAgentConfig( client = VoiceAgentClient(api_key=api_key, config=config) ``` -## Configuration +3. **Custom Configuration with Overlays** - you can use presets as a starting point, and then customize with overlays. -### Basic Parameters +```python +from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig -**`language`** (str, default: `"en"`) -Language code for transcription (e.g., `"en"`, `"es"`, `"fr"`). See [supported languages](https://docs.speechmatics.com/speech-to-text/languages). 
+# Use preset with custom overrides +config = VoiceAgentConfigPreset.SCRIBE( + VoiceAgentConfig( + language="es", + max_delay=0.8 + ) +) +``` -**`operating_point`** (OperatingPoint, default: `ENHANCED`) -Balance accuracy vs latency. Options: `STANDARD` or `ENHANCED`. +> **Note:** If no config or preset is provided, the client will default to the `external` preset. -**`domain`** (str, default: `None`) -Domain-specific model (e.g., `"finance"`, `"medical"`). See [supported languages and domains](https://docs.speechmatics.com/speech-to-text/languages). +### Configuration Serialization +It can also be useful to export and import configuration as JSON: -**`output_locale`** (str, default: `None`) -Output locale for formatting (e.g., `"en-GB"`, `"en-US"`). See [supported languages and locales](https://docs.speechmatics.com/speech-to-text/languages). +```python +from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig -**`max_delay`** (float, default: `0.7`) -Maximum transcription delay for word emission. +# Export preset to JSON +config_json = VoiceAgentConfigPreset.SCRIBE().to_json() -### Turn Detection Parameters +# Load from JSON +config = VoiceAgentConfig.from_json(config_json) -**`end_of_utterance_mode`** (EndOfUtteranceMode, default: `FIXED`) -Controls how turn endings are detected: +# Or create from JSON string +config = VoiceAgentConfig.from_json('{"language": "en", "enable_diarization": true}') +``` -- **`FIXED`** - Uses fixed silence threshold. Fast but may split slow speech. -- **`ADAPTIVE`** - Adjusts delay based on speech rate, pauses, and disfluencies. Best for natural conversation. -- **`EXTERNAL`** - Manual control via `client.finalize()`. For custom turn logic. +## Configuration -**`end_of_utterance_silence_trigger`** (float, default: `0.2`) -Silence duration in seconds to trigger turn end (also used for the basis of adaptive delay). +### Basic Parameters -### Speaker Configuration +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `language` | str | `"en"` | Language code for transcription (e.g., `"en"`, `"es"`, `"fr"`).
See [supported languages](https://docs.speechmatics.com/speech-to-text/languages). | +| `operating_point` | OperatingPoint | `ENHANCED` | Balance accuracy vs latency. Options: `STANDARD` or `ENHANCED`. | +| `domain` | str | `None` | Domain-specific model (e.g., `"finance"`, `"medical"`).
See [supported languages and domains](https://docs.speechmatics.com/speech-to-text/languages). | +| `output_locale` | str | `None` | Output locale for formatting (e.g., `"en-GB"`, `"en-US"`).
See [supported languages and locales](https://docs.speechmatics.com/speech-to-text/languages). | +| `max_delay` | float | `0.7` | Maximum transcription delay for word emission. | -**`enable_diarization`** (bool, default: `False`) -Enable speaker diarization to identify and label different speakers. +### Turn Detection Parameters -**`speaker_sensitivity`** (float, default: `0.5`) -Diarization sensitivity between 0.0 and 1.0. Higher values detect more speakers. +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `end_of_utterance_mode` | EndOfUtteranceMode | `FIXED` | Controls how turn endings are detected. Options:
- `FIXED` - Uses fixed silence threshold. Fast but may split slow speech.
- `ADAPTIVE` - Adjusts delay based on speech rate, pauses, and disfluencies. Best for natural conversation.
- `EXTERNAL` - Manual control via `client.finalize()`. For custom turn logic. | +| `end_of_utterance_silence_trigger` | float | `0.2` | Silence duration in seconds to trigger turn end (also used for the basis of adaptive delay). | -**`max_speakers`** (int, default: `None`) -Limit maximum number of speakers to detect. +### Speaker Configuration -**`prefer_current_speaker`** (bool, default: `False`) -Give extra weight to current speaker for word grouping. +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `enable_diarization` | bool | `False` | Enable speaker diarization to identify and label different speakers. | +| `speaker_sensitivity` | float | `0.5` | Diarization sensitivity between 0.0 and 1.0. Higher values detect more speakers. | +| `max_speakers` | int | `None` | Limit maximum number of speakers to detect. | +| `prefer_current_speaker` | bool | `False` | Give extra weight to current speaker for word grouping. | +| `speaker_config` | SpeakerFocusConfig | `SpeakerFocusConfig()` | Configure speaker focus/ignore rules. | +| `known_speakers` | list[SpeakerIdentifier] | `[]` | Pre-enrolled speaker identifiers for speaker identification. | -**`speaker_config`** (SpeakerFocusConfig, default: `SpeakerFocusConfig()`) -Configure speaker focus/ignore rules. +#### Usage Examples +Using `speaker_config`, you can focus on only specific speakers but keep words from others, or ignore specific speakers. ```python from speechmatics.voice import SpeakerFocusConfig, SpeakerFocusMode @@ -234,12 +282,12 @@ config = VoiceAgentConfig( ) ``` -**`known_speakers`** (list[SpeakerIdentifier], default: `[]`) -Pre-enrolled speaker identifiers for speaker identification. +Using `known_speakers`, you can use pre-enrolled speaker identifiers to identify specific speakers. ```python from speechmatics.voice import SpeakerIdentifier +# Use known speakers from previous session config = VoiceAgentConfig( enable_diarization=True, known_speakers=[ @@ -251,8 +299,14 @@ config = VoiceAgentConfig( ### Language & Vocabulary -**`additional_vocab`** (list[AdditionalVocabEntry], default: `[]`) -Custom vocabulary for domain-specific terms. +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `additional_vocab` | list[AdditionalVocabEntry] | `[]` | Custom vocabulary for domain-specific terms. | +| `punctuation_overrides` | dict | `None` | Custom punctuation rules. | + +#### Usage Examples + +Using `additional_vocab`, you can specify a dictionary of domain-specific terms. ```python from speechmatics.voice import AdditionalVocabEntry @@ -269,83 +323,68 @@ config = VoiceAgentConfig( ) ``` -**`punctuation_overrides`** (dict, default: `None`) -Custom punctuation rules. - ### Audio Parameters -**`sample_rate`** (int, default: `16000`) -Audio sample rate in Hz. - -**`audio_encoding`** (AudioEncoding, default: `PCM_S16LE`) -Audio encoding format. +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `sample_rate` | int | `16000` | Audio sample rate in Hz. | +| `audio_encoding` | AudioEncoding | `PCM_S16LE` | Audio encoding format. | ### Advanced Parameters -**`transcription_update_preset`** (TranscriptionUpdatePreset, default: `COMPLETE`) -Controls when to emit updates: `COMPLETE`, `COMPLETE_PLUS_TIMING`, `WORDS`, `WORDS_PLUS_TIMING`, or `TIMING`. - -**`speech_segment_config`** (SpeechSegmentConfig, default: `SpeechSegmentConfig()`) -Fine-tune segment generation and post-processing. 
- -**`smart_turn_config`** (SmartTurnConfig, default: `None`) -Configure SMART_TURN behavior (buffer length, threshold). +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `transcription_update_preset` | TranscriptionUpdatePreset | `COMPLETE` | Controls when to emit updates: `COMPLETE`, `COMPLETE_PLUS_TIMING`, `WORDS`, `WORDS_PLUS_TIMING`, or `TIMING`. | +| `speech_segment_config` | SpeechSegmentConfig | `SpeechSegmentConfig()` | Fine-tune segment generation and post-processing. | +| `smart_turn_config` | SmartTurnConfig | `None` | Configure SMART_TURN behavior (buffer length, threshold). | +| `include_results` | bool | `False` | Include word-level timing data in segments. | +| `include_partials` | bool | `True` | Include interim (lower confidence) words in emitted segments. Set to `False` for final-only output. | -**`include_results`** (bool, default: `False`) -Include word-level timing data in segments. - -**`include_partials`** (bool, default: `True`) -Include interim (lower confidence) words in the emitted segments. Set to `False` for final-only output. - -### Configuration with Overlays - -Use presets as a starting point and customize with overlays: - -```python -from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig - -# Use preset with custom overrides -config = VoiceAgentConfigPreset.SCRIBE( - VoiceAgentConfig( - language="es", - max_delay=0.8 - ) -) +## Event Messages -# Available presets -presets = VoiceAgentConfigPreset.list_presets() -# ['fast', 'adaptive', 'smart_turn', 'scribe', 'captions', '...'] -``` +The Voice SDK emits real-time, structured events as a session progresses via `AgentServerMessageType`. -### Configuration Serialization +These events fall into three main categories: +1. **Core Events** - high-level session and transcription updates. +2. **Speaker Events** - detected speech activity. +3. **Additional** - detailed, low-level events. -Export and import configurations as JSON: +To handle events, register a callback using `@client.on()` decorator or `client.on()` method. -```python -from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig +> **Note:** The payloads shown below are the actual message payloads from the Voice SDK. When using the CLI example with `--output-file`, messages also include a `ts` timestamp field (e.g., `"ts": "2025-11-11 23:18:35.909"`), which is added by the CLI for logging purposes and is not part of the SDK payload. 
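As a quick sketch, the two registration styles look like this (handler bodies are placeholders; the method form is assumed to take the event type followed by the callback, per the note above):

```python
from speechmatics.voice import AgentServerMessageType

# Decorator style
@client.on(AgentServerMessageType.ADD_SEGMENT)
def on_segment(message):
    for segment in message["segments"]:
        print(segment["speaker_id"], segment["text"])

# Method style, useful when wiring handlers up dynamically
def on_partial(message):
    print("partial:", [s["text"] for s in message["segments"]])

client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, on_partial)
```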
-# Export preset to JSON -config_json = VoiceAgentConfigPreset.SCRIBE().to_json() +### High Level Overview -# Load from JSON -config = VoiceAgentConfig.from_json(config_json) +#### Core Events -# Or create from JSON string -config = VoiceAgentConfig.from_json('{"language": "en", "enable_diarization": true}') -``` +| Event | Description | Notes / Purpose | +| ----------------------- | ----------------------------------------- | ------------------------------------------------------------ | +| `RECOGNITION_STARTED` | Fired when a transcription session starts | Contains session ID, language pack info | +| `ADD_PARTIAL_SEGMENT` | Emitted continuously during speech | Provides interim, real-time transcription text | +| `ADD_SEGMENT` | Fired when a segment is finalized | Provides stable, final transcription text | +| `END_OF_TURN` | Fired when a speaker’s turn ends | Depends on `end_of_utterance_mode`; useful for turn tracking | -## Event Messages +#### Speaker Events +| Event | When it fires | Purpose | +| --------------- | -------------------- | ------------------------------- | +| `SPEAKER_STARTED` | Voice detected | Marks start of speech | +| `SPEAKER_ENDED` | Silence detected | Marks end of speech | +| `SPEAKERS_RESULT` | Enrollment completes | Provides speaker IDs and labels | -The Voice SDK emits structured events via `AgentServerMessageType`. Register handlers using the `@client.on()` decorator or `client.on()` method. +#### Additional Events +| Event | When it fires | Purpose | +| ---------------------- | ----------------------------- | ------------------------------------------- | +| `START_OF_TURN` | New turn begins | Optional, low-level event for turn tracking | +| `END_OF_TURN_PREDICTION` | Predicts turn completion | Fires before END_OF_TURN in adaptive mode | +| `END_OF_UTTERANCE` | Silence threshold reached | Low-level STT engine trigger | +| `ADD_PARTIAL_TRANSCRIPT` | Word-level partial transcript | Legacy; use ADD_PARTIAL_SEGMENT instead | +| `ADD_TRANSCRIPT` | Word-level final transcript | Legacy; use ADD_SEGMENT instead | -> **Note:** The payloads shown below are the actual message payloads from the Voice SDK. When using the CLI example with `--output-file`, messages also include a `ts` timestamp field (e.g., `"ts": "2025-11-11 23:18:35.909"`), which is added by the CLI for logging purposes and is not part of the SDK payload. -### Core Events +### Core Events - Examples and Payloads #### RECOGNITION_STARTED -Emitted when transcription session starts. Contains session ID and language pack info. - ```python @client.on(AgentServerMessageType.RECOGNITION_STARTED) def on_started(message): @@ -373,8 +412,6 @@ def on_started(message): #### ADD_PARTIAL_SEGMENT -Emitted continuously as speech is being processed. Contains interim text that updates in real-time. - ```python @client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT) def on_partial(message): @@ -420,8 +457,6 @@ Top-level `metadata` contains the same timing plus `processing_time`. #### ADD_SEGMENT -Emitted when a segment is finalized. Contains stable, final transcription text. - ```python @client.on(AgentServerMessageType.ADD_SEGMENT) def on_segment(message): @@ -460,8 +495,6 @@ def on_segment(message): #### END_OF_TURN -Emitted when a speaker's turn is complete. Timing depends on `end_of_utterance_mode`. 
- ```python @client.on(AgentServerMessageType.END_OF_TURN) def on_turn_end(message): @@ -482,12 +515,10 @@ def on_turn_end(message): } ``` -### Speaker Events +### Speaker Events - Examples and Payloads #### SPEAKER_STARTED -Emitted when a speaker starts speaking (voice activity detected). - ```python @client.on(AgentServerMessageType.SPEAKER_STARTED) def on_speaker_start(message): @@ -509,8 +540,6 @@ def on_speaker_start(message): #### SPEAKER_ENDED -Emitted when a speaker stops speaking (silence detected). - ```python @client.on(AgentServerMessageType.SPEAKER_ENDED) def on_speaker_end(message): @@ -532,8 +561,6 @@ def on_speaker_end(message): #### SPEAKERS_RESULT -Emitted when speaker enrolment completes. - ```python # Listen for the result @client.on(AgentServerMessageType.SPEAKERS_RESULT) @@ -548,16 +575,6 @@ await client.send_message({"message": AgentClientMessageType.GET_SPEAKERS, "fina await client.send_message({"message": AgentClientMessageType.GET_SPEAKERS}) ``` -### Additional Events - -**`START_OF_TURN`** - Emitted at the beginning of a new turn. - -**`END_OF_TURN_PREDICTION`** - Emitted during `ADAPTIVE` mode to predict turn completion (fires before `END_OF_TURN`). - -**`END_OF_UTTERANCE`** - Low-level STT engine event (fires when silence threshold is reached). - -**`ADD_PARTIAL_TRANSCRIPT` / `ADD_TRANSCRIPT`** - Legacy word-level events from underlying Real-Time API (not typically needed with Voice SDK). - ## Common Usage Patterns ### Simple Transcription @@ -784,13 +801,13 @@ class VoiceAgentClient: ## Requirements - Python 3.9+ -- Speechmatics API key ([Get one here](https://portal.speechmatics.com/)) +- Speechmatics API key (Get one through: [Speechmatics Portal](https://portal.speechmatics.com/)) ## Documentation -- [Speechmatics Documentation](https://docs.speechmatics.com/) +- [Speechmatics Documentation Homepage](https://docs.speechmatics.com/) - [Real-Time Quickstart](https://docs.speechmatics.com/speech-to-text/realtime/quickstart) -- [Authentication](https://docs.speechmatics.com/get-started/authentication) +- [Getting Started with Authentication](https://docs.speechmatics.com/get-started/authentication) ## License diff --git a/sdk/voice/pyproject.toml b/sdk/voice/pyproject.toml index 239d395..a339b22 100644 --- a/sdk/voice/pyproject.toml +++ b/sdk/voice/pyproject.toml @@ -11,7 +11,7 @@ authors = [{ name = "Speechmatics", email = "support@speechmatics.com" }] license = "MIT" requires-python = ">=3.9" dependencies = [ - "speechmatics-rt>=0.5.1", + "speechmatics-rt>=0.5.2", "pydantic>=2.10.6,<3", "numpy>=1.26.4,<3" ] From b253faf452b15e7fdc17c15940addfd3264408dc Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Fri, 5 Dec 2025 17:03:17 +0000 Subject: [PATCH 11/17] change to adaptive for scribe --- sdk/voice/speechmatics/voice/_presets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/voice/speechmatics/voice/_presets.py b/sdk/voice/speechmatics/voice/_presets.py index 22c2734..01104e1 100644 --- a/sdk/voice/speechmatics/voice/_presets.py +++ b/sdk/voice/speechmatics/voice/_presets.py @@ -136,7 +136,7 @@ def SCRIBE(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # n enable_diarization=True, max_delay=2.0, end_of_utterance_silence_trigger=1.0, - end_of_utterance_mode=EndOfUtteranceMode.FIXED, + end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, speech_segment_config=SpeechSegmentConfig(emit_sentences=True), smart_turn_config=SmartTurnConfig( enabled=True, From 2335d8762edc748127f50b94d3cc8e219e23bb53 Mon Sep 17 00:00:00 2001 From: Sam 
Sykes Date: Fri, 5 Dec 2025 17:52:01 +0000 Subject: [PATCH 12/17] updated `fast` preset to use fixed end of utterance at a low timeout. --- sdk/voice/speechmatics/voice/_presets.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sdk/voice/speechmatics/voice/_presets.py b/sdk/voice/speechmatics/voice/_presets.py index 01104e1..7243f65 100644 --- a/sdk/voice/speechmatics/voice/_presets.py +++ b/sdk/voice/speechmatics/voice/_presets.py @@ -33,11 +33,10 @@ def FAST(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noq VoiceAgentConfig( operating_point=OperatingPoint.STANDARD, enable_diarization=True, - max_delay=0.7, + max_delay=2.0, + end_of_utterance_silence_trigger=0.25, end_of_utterance_mode=EndOfUtteranceMode.FIXED, - speech_segment_config=SpeechSegmentConfig(emit_sentences=True), - vad_config=VoiceActivityConfig(enabled=True, silence_duration=0.18), - end_of_turn_config=EndOfTurnConfig(use_forced_eou=True), + speech_segment_config=SpeechSegmentConfig(emit_sentences=False), ), overlay, ) From 0b89531f59aa3a638d418a5571868fa127279eb3 Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Fri, 5 Dec 2025 17:54:38 +0000 Subject: [PATCH 13/17] updated doc for `fast` preset. --- sdk/voice/speechmatics/voice/_presets.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sdk/voice/speechmatics/voice/_presets.py b/sdk/voice/speechmatics/voice/_presets.py index 7243f65..2e4df55 100644 --- a/sdk/voice/speechmatics/voice/_presets.py +++ b/sdk/voice/speechmatics/voice/_presets.py @@ -26,6 +26,9 @@ def FAST(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noq delay to finalizing the spoken sentences. It is not recommended for conversation, as it will not account for pauses, slow speech or disfluencies. + Note that this uses our standard operating point so will have marginally lower + accuracy that the enhanced operating point. + Use of this will require `pip install speechmatics-voice[smart]` and may not be suited to low-power devices. """ From 524f7bacb31ad05c515d2b3ba70e2e9d4df95d4f Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Fri, 5 Dec 2025 18:38:02 +0000 Subject: [PATCH 14/17] updated to smart turn 3.1 --- sdk/voice/speechmatics/voice/_smart_turn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/voice/speechmatics/voice/_smart_turn.py b/sdk/voice/speechmatics/voice/_smart_turn.py index c4667b4..9ce44a0 100644 --- a/sdk/voice/speechmatics/voice/_smart_turn.py +++ b/sdk/voice/speechmatics/voice/_smart_turn.py @@ -45,9 +45,9 @@ def _create_ssl_context(*args: Any, **kwargs: Any) -> ssl.SSLContext: # Base model from HuggingFace SMART_TURN_MODEL_URL = os.getenv( - "SMART_TURN_HF_URL", "https://huggingface.co/pipecat-ai/smart-turn-v3/resolve/main/smart-turn-v3.0.onnx" + "SMART_TURN_HF_URL", "https://huggingface.co/pipecat-ai/smart-turn-v3/resolve/main/smart-turn-v3.1-cpu.onnx" ) -SMART_TURN_MODEL_LOCAL_PATH = os.getenv("SMART_TURN_MODEL_PATH", ".models/smart-turn-v3.0.onnx") +SMART_TURN_MODEL_LOCAL_PATH = os.getenv("SMART_TURN_MODEL_PATH", ".models/smart-turn-v3.1-cpu.onnx") # Hint for when dependencies are not available SMART_TURN_INSTALL_HINT = "SMART_TURN mode unavailable. Install `speechmatics-voice[smart]` to enable SMART_TURN mode." 
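For reference, a minimal sketch of overriding the model location through the environment variables read above; the local path is illustrative and should point at a pre-downloaded copy of the ONNX model:

```python
import os

# SMART_TURN_HF_URL / SMART_TURN_MODEL_PATH are read when the module above is
# imported, so set them before importing speechmatics.voice.
os.environ["SMART_TURN_MODEL_PATH"] = "/opt/models/smart-turn-v3.1-cpu.onnx"

from speechmatics.voice import VoiceAgentClient  # noqa: E402 - import after env setup
```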
From 3ed9fbd10b6fc9114e088fc711532cb58209bd79 Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Fri, 5 Dec 2025 18:57:39 +0000 Subject: [PATCH 15/17] updated `fast` preset --- sdk/voice/speechmatics/voice/_presets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/voice/speechmatics/voice/_presets.py b/sdk/voice/speechmatics/voice/_presets.py index 2e4df55..dfe6f5d 100644 --- a/sdk/voice/speechmatics/voice/_presets.py +++ b/sdk/voice/speechmatics/voice/_presets.py @@ -39,7 +39,7 @@ def FAST(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noq max_delay=2.0, end_of_utterance_silence_trigger=0.25, end_of_utterance_mode=EndOfUtteranceMode.FIXED, - speech_segment_config=SpeechSegmentConfig(emit_sentences=False), + speech_segment_config=SpeechSegmentConfig(emit_sentences=True), ), overlay, ) From b686dacbe698c47864df70dede3c42ad5e15e8b5 Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Fri, 12 Dec 2025 11:09:10 +0000 Subject: [PATCH 16/17] Squashed commit of the following: commit ef55f54980e38f6a5a0067678476f7c7c21d8168 Author: Sam Sykes Date: Sun Dec 7 22:34:46 2025 +0000 updated VAD events for annotations --- sdk/voice/speechmatics/voice/_client.py | 80 ++++++++++++++++-------- sdk/voice/speechmatics/voice/_models.py | 29 ++++++--- sdk/voice/speechmatics/voice/_presets.py | 9 +-- 3 files changed, 75 insertions(+), 43 deletions(-) diff --git a/sdk/voice/speechmatics/voice/_client.py b/sdk/voice/speechmatics/voice/_client.py index fef7d02..978d130 100644 --- a/sdk/voice/speechmatics/voice/_client.py +++ b/sdk/voice/speechmatics/voice/_client.py @@ -1220,7 +1220,7 @@ async def _process_speech_fragments(self, change_filter: Optional[list[Annotatio async def fn() -> None: ttl = await self._calculate_finalize_delay() - if ttl: + if ttl is not None: self._turn_handler.update_timer(ttl) self._run_background_eot_calculation(fn, "speech_fragments") @@ -1474,7 +1474,7 @@ async def _calculate_fixed_finalize_delay(self) -> Optional[float]: async def _calculate_finalize_delay( self, - smart_turn_prediction: Optional[SmartTurnPredictionResult] = None, + annotation: Optional[AnnotationResult] = None, ) -> Optional[float]: """Calculate the delay before finalizing / end of turn. @@ -1483,7 +1483,7 @@ async def _calculate_finalize_delay( and smart turn predictions to calculate appropriate delay. Args: - smart_turn_prediction: The smart turn prediction result to use for evaluation. + annotations: The annotations to include for evaluation. Returns: Optional[float]: The delay before finalizing / end of turn. 
@@ -1510,24 +1510,33 @@ async def _calculate_finalize_delay( # Track penalty multipliers and reasons reasons: list[tuple[float, str]] = [] + annotation = annotation or AnnotationResult() - # Apply penalties based on last active segment annotations + # VAD enabled + if self._silero_detector: + annotation.add(AnnotationFlags.VAD_ACTIVE) + else: + annotation.add(AnnotationFlags.VAD_INACTIVE) + + # Smart Turn enabled + if self._smart_turn_detector: + annotation.add(AnnotationFlags.SMART_TURN_ACTIVE) + else: + annotation.add(AnnotationFlags.SMART_TURN_INACTIVE) + + # Result to validate against if last_active_segment: + annotation.add(*[AnnotationFlags(flag) for flag in last_active_segment.annotation]) + + # Apply penalties based on last active segment annotations + if len(annotation) > 0: for p in self._config.end_of_turn_config.penalties: description = "__".join(p.annotation) - has_annotation = last_active_segment.annotation.has(*p.annotation) - + has_annotation = annotation.has(*p.annotation) if (not p.is_not and has_annotation) or (p.is_not and not has_annotation): reason = f"not__{description}" if p.is_not else description reasons.append((p.penalty, reason)) - # Apply smart turn prediction penalty - if smart_turn_prediction and self._config.smart_turn_config: - if smart_turn_prediction.prediction: - reasons.append((self._config.smart_turn_config.positive_penalty, "smart_turn_true")) - else: - reasons.append((self._config.smart_turn_config.negative_penalty, "smart_turn_false")) - # Calculate final multiplier (compound multiplication) multiplier = self._config.end_of_turn_config.base_multiplier for penalty, _ in reasons: @@ -1558,17 +1567,27 @@ async def _calculate_finalize_delay( # Return the calculated delay return finalize_delay - async def _eot_prediction(self, end_time: Optional[float] = None, speaker: Optional[str] = None) -> float: + async def _eot_prediction( + self, + end_time: Optional[float] = None, + speaker: Optional[str] = None, + annotation: Optional[AnnotationResult] = None, + ) -> float: """Handle end of turn prediction.""" + # Initialize the annotation + annotation = annotation or AnnotationResult() + # Wait for Smart Turn result if self._smart_turn_detector and end_time is not None: result = await self._smart_turn_prediction(end_time, self._config.language, speaker=speaker) - else: - result = None + if result.prediction: + annotation.add(AnnotationFlags.SMART_TURN_TRUE) + else: + annotation.add(AnnotationFlags.SMART_TURN_FALSE) # Create a new task to evaluate the finalize delay - delay = await self._calculate_finalize_delay(smart_turn_prediction=result) + delay = await self._calculate_finalize_delay(annotation=annotation) # Return the result return max(delay or 0, self._config.end_of_turn_config.min_end_of_turn_delay) @@ -1797,22 +1816,29 @@ def _handle_silero_vad_result(self, result: SileroVADResult) -> None: # Emit VAD status message self._emit_message(message) + # Create the annotation + annotation = AnnotationResult() + + # VAD annotation + if result.speech_ended: + annotation.add(AnnotationFlags.VAD_STOPPED) + else: + annotation.add(AnnotationFlags.VAD_STARTED) + # If speech has ended, we need to predict the end of turn if result.speech_ended and self._uses_eot_prediction: """VAD-based end of turn prediction.""" - # Only proceed if there are fragments to finalize - has_fragments = bool(self._speech_fragments) + # Set cutoff to prevent late transcripts from cancelling finalization + self._smart_turn_pending_cutoff = event_time - if has_fragments: - # Set cutoff to 
prevent late transcripts from cancelling finalization - self._smart_turn_pending_cutoff = event_time - - async def fn() -> None: - ttl = await self._eot_prediction(end_time=event_time, speaker=self._current_speaker) - self._turn_handler.update_timer(ttl) + async def fn() -> None: + ttl = await self._eot_prediction( + end_time=event_time, speaker=self._current_speaker, annotation=annotation + ) + self._turn_handler.update_timer(ttl) - self._run_background_eot_calculation(fn, "silero_vad") + self._run_background_eot_calculation(fn, "silero_vad") async def _handle_speaker_started(self, speaker: Optional[str], event_time: float) -> None: """Reset timers when a new speaker starts speaking after silence.""" diff --git a/sdk/voice/speechmatics/voice/_models.py b/sdk/voice/speechmatics/voice/_models.py index 5b12dfa..5f819b9 100644 --- a/sdk/voice/speechmatics/voice/_models.py +++ b/sdk/voice/speechmatics/voice/_models.py @@ -250,6 +250,18 @@ class AnnotationFlags(str, Enum): # End of utterance detection END_OF_UTTERANCE = "end_of_utterance" + # VAD + VAD_ACTIVE = "vad_active" + VAD_INACTIVE = "vad_inactive" + VAD_STARTED = "vad_started" + VAD_STOPPED = "vad_stopped" + + # Smart Turn + SMART_TURN_ACTIVE = "smart_turn_active" + SMART_TURN_INACTIVE = "smart_turn_inactive" + SMART_TURN_TRUE = "smart_turn_true" + SMART_TURN_FALSE = "smart_turn_false" + # ============================================================================== # CONFIGURATION MODELS @@ -419,6 +431,11 @@ class EndOfTurnConfig(BaseModel): EndOfTurnPenaltyItem( penalty=0.5, annotation=[AnnotationFlags.ENDS_WITH_FINAL, AnnotationFlags.ENDS_WITH_EOS] ), + # Smart Turn + VAD + EndOfTurnPenaltyItem(penalty=0.2, annotation=[AnnotationFlags.SMART_TURN_TRUE]), + EndOfTurnPenaltyItem( + penalty=0.2, annotation=[AnnotationFlags.VAD_STOPPED, AnnotationFlags.SMART_TURN_INACTIVE] + ), ] ) use_forced_eou: bool = False @@ -446,15 +463,9 @@ class SmartTurnConfig(BaseModel): Parameters: enabled: Whether smart turn is enabled. - smart_turn_threshold: Smart turn threshold. Defaults to 0.5. - max_audio_length: Maximum length of audio to analyze in seconds. Defaults to 8.0. - positive_penalty: Positive penalty for smart turn. Defaults to -1.0. - - negative_penalty: Negative penalty for smart turn. Defaults to 2.5. - Examples: >>> config = SmartTurnConfig( ... 
audio_buffer_length=15.0, @@ -466,8 +477,6 @@ class SmartTurnConfig(BaseModel): enabled: bool = False smart_turn_threshold: float = 0.5 max_audio_length: float = 8.0 - positive_penalty: float = 0.0 - negative_penalty: float = 1.0 class VoiceAgentConfig(BaseModel): @@ -1245,7 +1254,7 @@ class TurnPredictionMetadata(BaseModel): """ ttl: float - reasons: list[str] = Field(default_factory=list, exclude=True) + reasons: list[str] = Field(default_factory=list, exclude=False) model_config = ConfigDict(extra="ignore") @@ -1313,7 +1322,7 @@ class SegmentMessageSegment(BaseModel): language: Optional[str] = None text: Optional[str] = None fragments: Optional[list[SegmentMessageSegmentFragment]] = None - annotation: list[AnnotationFlags] = Field(default_factory=list, exclude=True) + annotation: list[AnnotationFlags] = Field(default_factory=list, exclude=False) metadata: MessageTimeMetadata model_config = ConfigDict(extra="ignore") diff --git a/sdk/voice/speechmatics/voice/_presets.py b/sdk/voice/speechmatics/voice/_presets.py index dfe6f5d..18b1183 100644 --- a/sdk/voice/speechmatics/voice/_presets.py +++ b/sdk/voice/speechmatics/voice/_presets.py @@ -28,9 +28,6 @@ def FAST(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noq Note that this uses our standard operating point so will have marginally lower accuracy that the enhanced operating point. - - Use of this will require `pip install speechmatics-voice[smart]` and may not - be suited to low-power devices. """ return VoiceAgentConfigPreset._merge_configs( VoiceAgentConfig( @@ -39,7 +36,7 @@ def FAST(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noq max_delay=2.0, end_of_utterance_silence_trigger=0.25, end_of_utterance_mode=EndOfUtteranceMode.FIXED, - speech_segment_config=SpeechSegmentConfig(emit_sentences=True), + speech_segment_config=SpeechSegmentConfig(emit_sentences=False), ), overlay, ) @@ -80,8 +77,8 @@ def ADAPTIVE(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # VoiceAgentConfig( operating_point=OperatingPoint.ENHANCED, enable_diarization=True, - max_delay=0.7, - end_of_utterance_silence_trigger=0.6, + max_delay=2.0, + end_of_utterance_silence_trigger=0.7, end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, speech_segment_config=SpeechSegmentConfig(emit_sentences=False), vad_config=VoiceActivityConfig(enabled=True), From 796b38c82342b775e473439eafc1247234c9e36f Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Fri, 12 Dec 2025 11:12:23 +0000 Subject: [PATCH 17/17] test fix --- tests/voice/test_14_presets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/voice/test_14_presets.py b/tests/voice/test_14_presets.py index 2f51da0..a5cc898 100644 --- a/tests/voice/test_14_presets.py +++ b/tests/voice/test_14_presets.py @@ -13,7 +13,7 @@ async def test_presets(): # Create a preset preset: VoiceAgentConfig = VoiceAgentConfigPreset.FAST() assert preset is not None - assert preset.speech_segment_config.emit_sentences is True + assert preset.speech_segment_config.emit_sentences is False # Overlay #1 preset: VoiceAgentConfig = VoiceAgentConfigPreset.FAST(VoiceAgentConfig(max_delay=12.34, enable_diarization=False))
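For completeness, a sketch of how a caller can restore sentence emission on top of the updated FAST preset using the overlay pattern exercised above (this assumes `SpeechSegmentConfig` is exported from `speechmatics.voice`, as the presets themselves use it):

```python
from speechmatics.voice import SpeechSegmentConfig, VoiceAgentConfig, VoiceAgentConfigPreset

# Overlay re-enables sentence emission while keeping the rest of the FAST preset
config = VoiceAgentConfigPreset.FAST(
    VoiceAgentConfig(speech_segment_config=SpeechSegmentConfig(emit_sentences=True))
)
# config.speech_segment_config.emit_sentences should now be True, mirroring the
# overlay semantics shown in the test above.
```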