diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index d899f655..00000000
--- a/.gitattributes
+++ /dev/null
@@ -1 +0,0 @@
-*.wav filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/RELEASE.md b/.github/RELEASE.md
index 9b6ef51f..34095833 100644
--- a/.github/RELEASE.md
+++ b/.github/RELEASE.md
@@ -8,8 +8,8 @@ The Speechmatics Python SDK repository contains two separate packages:
- `speechmatics-rt` - Real-Time API Client
- `speechmatics-batch` - Batch API Client
-- `speechmatics-flow` - Flow API Client
- `speechmatics-voice` - Voice Agent API Client
+- `speechmatics-tts` - TTS API Client
Each package is released independently with its own versioning and release workflow.
@@ -91,55 +91,55 @@ To release a new version of the Batch SDK:
- Update GitHub release notes
- Announce the release
-### 3. Flow SDK Release
+### 3. Voice Agent SDK Release
-To release a new version of the Flow SDK:
+To release a new version of the Voice Agent SDK:
1. **Create a Release Tag**
```bash
- git tag flow/v1.0.0
- git push origin flow/v1.0.0
+ git tag voice/v1.0.0
+ git push origin voice/v1.0.0
```
2. **Automated Workflow**
- The `release-flow.yaml` workflow will automatically:
+ The `release-voice.yaml` workflow will automatically:
- - Extract version from tag (e.g., `flow/v1.0.0` → `1.0.0`)
+ - Extract version from tag (e.g., `voice/v1.0.0` → `1.0.0`)
- Run comprehensive tests across Python versions
- - Update version in `sdk/flow/speechmatics/flow/__init__.py`
+ - Update version in `sdk/voice/speechmatics/voice/__init__.py`
- Build the package
- Publish to PyPI
3. **Manual Steps After Release**
- Verify the package is available on PyPI
- - Test installation: `pip install speechmatics-flow==1.0.0`
+ - Test installation: `pip install speechmatics-voice==1.0.0`
- Update GitHub release notes
- Announce the release
-### 4. Voice Agent SDK Release
+### 4. TTS SDK Release
-To release a new version of the Voice Agent SDK:
+To release a new version of the TTS SDK:
1. **Create a Release Tag**
```bash
- git tag voice/v1.0.0
- git push origin voice/v1.0.0
+ git tag tts/v1.0.0
+ git push origin tts/v1.0.0
```
2. **Automated Workflow**
- The `release-voice.yaml` workflow will automatically:
+ The `release-tts.yaml` workflow will automatically:
- - Extract version from tag (e.g., `voice/v1.0.0` → `1.0.0`)
+ - Extract version from tag (e.g., `tts/v1.0.0` → `1.0.0`)
- Run comprehensive tests across Python versions
- - Update version in `sdk/voice/speechmatics/voice/__init__.py`
+ - Update version in `sdk/tts/speechmatics/tts/__init__.py`
- Build the package
- Publish to PyPI
3. **Manual Steps After Release**
- Verify the package is available on PyPI
- - Test installation: `pip install speechmatics-voice==1.0.0`
+ - Test installation: `pip install speechmatics-tts==1.0.0`
- Update GitHub release notes
- Announce the release
@@ -162,8 +162,8 @@ Both packages follow semantic versioning (SemVer):
- RT SDK: `rt/v{version}` (e.g., `rt/v1.0.0`)
- Batch SDK: `batch/v{version}` (e.g., `batch/v1.0.0`)
-- Flow SDK: `flow/v{version}` (e.g., `flow/v1.0.0`)
- Voice Agent SDK: `voice/v{version}` (e.g., `voice/v1.0.0`)
+- TTS SDK: `tts/v{version}` (e.g., `tts/v1.0.0`)
## Environment Setup
@@ -173,8 +173,8 @@ Both packages are published to PyPI using GitHub Actions with OpenID Connect (OI
- RT SDK: Uses `pypi-rt` environment
- Batch SDK: Uses `pypi-batch` environment
-- Flow SDK: Uses `pypi-flow` environment
- Voice Agent SDK: Uses `pypi-voice` environment
+- TTS SDK: Uses `pypi-tts` environment
### Required Secrets
diff --git a/.gitignore b/.gitignore
index d69fb176..14e50121 100644
--- a/.gitignore
+++ b/.gitignore
@@ -153,6 +153,7 @@ cython_debug/
# Ruff stuff:
.ruff_cache/
+**/output.wav
# PyPI configuration file
.pypirc
diff --git a/README.md b/README.md
index 5b866f51..c66cf2d0 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
# Speechmatics Python SDK
[](https://github.com/speechmatics/speechmatics-python-sdk/blob/master/LICENSE)
+[](https://www.python.org/)
A collection of Python clients for Speechmatics APIs packaged as separate installable packages. These packages replace the old [speechmatics-python](https://pypi.org/project/speechmatics-python) package, which will be deprecated soon.
@@ -10,7 +11,7 @@ Each client targets a specific Speechmatics API (e.g. real-time, batch transcrip
This repository contains the following packages:
-### (Beta) Real-Time Client (`speechmatics-rt`)
+### Real-Time Client (`speechmatics-rt`)
A Python client for Speechmatics Real-Time API.
@@ -18,7 +19,7 @@ A Python client for Speechmatics Real-Time API.
pip install speechmatics-rt
```
-### (Beta) Batch Client (`speechmatics-batch`)
+### Batch Client (`speechmatics-batch`)
An async Python client for Speechmatics Batch API.
@@ -26,15 +27,7 @@ An async Python client for Speechmatics Batch API.
pip install speechmatics-batch
```
-### (Beta) Flow Client (`speechmatics-flow`)
-
-An async Python client for Speechmatics Flow API.
-
-```bash
-pip install speechmatics-flow
-```
-
-### (Beta) Voice Agent Client (`speechmatics-voice`)
+### Voice Agent Client (`speechmatics-voice`)
A Voice Agent Python client for Speechmatics Real-Time API.
@@ -46,7 +39,7 @@ pip install speechmatics-voice
pip install speechmatics-voice[smart]
```
-### (Beta) TTS Client (`speechmatics-tts`)
+### TTS Client (`speechmatics-tts`)
An async Python client for Speechmatics TTS API.
@@ -69,10 +62,6 @@ speechmatics-python-sdk/
│ │ ├── pyproject.toml
│ │ └── README.md
│ │
-│ ├── flow/
-│ │ ├── pyproject.toml
-│ │ └── README.md
-│ │
│ ├── voice/
│ │ ├── pyproject.toml
│ │ └── README.md
@@ -84,7 +73,6 @@ speechmatics-python-sdk/
├── tests/
│ ├── batch/
│ ├── rt/
-│ ├── flow/
│ ├── voice/
│ └── tts/
│
@@ -126,7 +114,6 @@ Each package can be installed separately:
```bash
pip install speechmatics-rt
pip install speechmatics-batch
-pip install speechmatics-flow
pip install speechmatics-voice[smart]
pip install speechmatics-tts
```
diff --git a/examples/tts/tts_autoplay/README.md b/examples/tts/tts_autoplay/README.md
new file mode 100644
index 00000000..5d540fb9
--- /dev/null
+++ b/examples/tts/tts_autoplay/README.md
@@ -0,0 +1,43 @@
+# Speechmatics TTS Async Streaming API Client
+
+This example shows how to use the Speechmatics TTS API to generate audio from text and play it back automatically with sounddevice through the system's default audio output device.
+You must have an audio output device configured on your system for this example to work.
+## How it Works
+
+There are two main components in this example: an audio generator and an audio player. They are run concurrently as asyncio tasks, orchestrated by the main() function, to generate and play audio in real time.
+### audio_generator()
+
+This producer function connects to the Speechmatics TTS API using the AsyncClient. It calls client.generate() with your text, the voice you want to use, and the output format - RAW_PCM_16000 in this example.
+The code iterates over the audio data as it is streamed in chunks (iter_chunked) and accumulates it in a bytearray buffer.
+The while len(buffer) >= 2 loop reads one 2-byte sample at a time from the buffer, converts it to a signed 16-bit integer, and puts it into the audio_queue.
+The processed 2-byte sample is then removed from the front of the buffer.
+END_OF_STREAM is a sentinel value used to signal the end of the audio stream, i.e. that there is no more audio data to process.
+If an error occurs during audio generation, the END_OF_STREAM sentinel is still put into the queue so that the consumer, audio_player(), does not wait on the queue forever, and the exception is re-raised.
+### audio_player()
+
+This consumer function initialises a sounddevice OutputStream, which streams the audio data to the default audio output device. Within the OutputStream, a while True loop continuously processes the incoming audio data.
+sample = await asyncio.wait_for(play_queue.get(), timeout=0.1) fetches the next sample from the queue, waiting up to 0.1 seconds if the queue is empty; on a timeout, any buffered samples are flushed to the output device.
+If the sample is END_OF_STREAM, the while loop breaks and the audio player exits.
+Otherwise the sample is appended to a local buffer; once the buffer holds CHUNK_SIZE samples, it is converted to a numpy array of int16 values and written to the audio output device through the OutputStream.
+play_queue.task_done() is called to signal that the sample has been processed.
+If an error occurs during audio playback, the exception is re-raised and the output stream is stopped.
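+
+The pattern reduces to a minimal sketch of the queue hand-off (simplified; the full implementation, including PCM decoding and sounddevice playback, is in `tts_stream_example.py`):
+
+```python
+import asyncio
+
+END_OF_STREAM = None  # sentinel marking the end of the stream
+
+
+async def producer(queue: asyncio.Queue) -> None:
+    for sample in (1, 2, 3):  # stand-in for decoded audio samples
+        await queue.put(sample)
+    await queue.put(END_OF_STREAM)  # always signal completion
+
+
+async def consumer(queue: asyncio.Queue) -> None:
+    while True:
+        sample = await queue.get()
+        if sample is END_OF_STREAM:
+            break  # exit cleanly instead of waiting forever
+        print(f"play {sample}")  # stand-in for writing to the audio device
+
+
+async def main() -> None:
+    queue: asyncio.Queue = asyncio.Queue()
+    await asyncio.gather(producer(queue), consumer(queue))
+
+
+asyncio.run(main())
+```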
+
+## Installation
+
+```bash
+pip install -r requirements.txt
+```
+
+## Usage
+
+To run the example, use the following command:
+
+```bash
+python tts_stream_example.py
+```
+
+## Environment Variables
+
+The client supports the following environment variables:
+
+- `SPEECHMATICS_API_KEY`: Your Speechmatics API key
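+
+For example, in a POSIX shell you can set it before running the script:
+
+```bash
+export SPEECHMATICS_API_KEY="your-api-key"
+```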
diff --git a/examples/tts/tts_autoplay/requirements.txt b/examples/tts/tts_autoplay/requirements.txt
new file mode 100644
index 00000000..d4ef65fc
--- /dev/null
+++ b/examples/tts/tts_autoplay/requirements.txt
@@ -0,0 +1,3 @@
+sounddevice>=0.4.6
+numpy>=1.24.3
+speechmatics-tts>=0.1.0
diff --git a/examples/tts/tts_autoplay/tts_stream_example.py b/examples/tts/tts_autoplay/tts_stream_example.py
new file mode 100644
index 00000000..ec4e860e
--- /dev/null
+++ b/examples/tts/tts_autoplay/tts_stream_example.py
@@ -0,0 +1,119 @@
+import asyncio
+import sounddevice as sd
+import numpy as np
+from speechmatics.tts import AsyncClient, Voice, OutputFormat
+
+# Configuration
+TEXT = "Welcome to the future of audio generation from text! This audio is a demo of Speechmatics' async streaming text-to-speech API."
+VOICE = Voice.JACK
+OUTPUT_FORMAT = OutputFormat.RAW_PCM_16000
+
+# Audio Parameters
+SAMPLE_RATE = 16000  # Hz
+SAMPLE_WIDTH = 2  # Bytes per sample (16-bit audio)
+CHANNELS = 1  # Mono audio
+CHUNK_SIZE = 2048  # Samples written to the output device per block
+BUFFER_SIZE = 4096  # Bytes read from the response per chunk
+
+# Sentinel value to signal end of stream
+END_OF_STREAM = None
+
+
+# Core Async Functions
+
+# 1. Producer: Generates audio and puts chunks into the queue:
+
+async def audio_generator(audio_queue: asyncio.Queue, text: str, voice: Voice, output_format: OutputFormat) -> None:
+ try:
+ async with AsyncClient() as client, await client.generate(
+ text=text,
+ voice=voice,
+ output_format=output_format
+ ) as response:
+ buffer=bytearray()
+ async for chunk in response.content.iter_chunked(BUFFER_SIZE):
+ if not chunk:
+ continue
+ buffer.extend(chunk)
+
+ # Process complete frames (2 bytes per sample for 16-bit audio)
+                # Convert each little-endian 16-bit signed pair of bytes to an int sample
+ while len(buffer) >= 2:
+ sample = int.from_bytes(buffer[:2], byteorder='little', signed=True)
+ await audio_queue.put(sample)
+ buffer = buffer[2:]
+
+ await audio_queue.put(END_OF_STREAM)
+ print("Audio generated and put into queue.")
+
+ except Exception as e:
+ print(f"[{'Generator'}] An error occurred in the audio generator: {e}")
+ await audio_queue.put(END_OF_STREAM)
+ raise
+
+# 2. Consumer: Read audio data from queue and play it in real-time using sounddevice.
+async def audio_player(play_queue: asyncio.Queue) -> None:
+ try:
+ with sd.OutputStream(
+ samplerate=SAMPLE_RATE,
+ channels=CHANNELS,
+ dtype='int16', # 16-bit PCM
+ blocksize=CHUNK_SIZE,
+ latency='high',
+ ) as stream:
+ buffer=[]
+ while True:
+ try:
+ sample = await asyncio.wait_for(play_queue.get(), timeout=0.1)
+ if sample is END_OF_STREAM:
+ if buffer:
+ audio_data=np.array(buffer, dtype=np.int16)
+ stream.write(audio_data)
+ buffer=[]
+ break
+
+ buffer.append(sample)
+ if len(buffer) >= CHUNK_SIZE:
+ audio_data=np.array(buffer[:CHUNK_SIZE], dtype=np.int16)
+ stream.write(audio_data)
+ buffer=buffer[CHUNK_SIZE:]
+
+ play_queue.task_done()
+
+ except asyncio.TimeoutError:
+ if buffer:
+ audio_data=np.array(buffer, dtype=np.int16)
+ stream.write(audio_data)
+ buffer=[]
+ continue
+
+ except Exception as e:
+ print(f"[{'Player'}] An error occurred playing audio chunk {e}")
+ raise
+
+ except Exception as e:
+ print(f"[{'Player'}] An error occurred in the audio player: {e}")
+ raise
+ finally:
+ sd.stop()
+
+# 3. Main Function: Orchestrate audio generation and audio stream
+async def main() -> None:
+ play_queue = asyncio.Queue()
+
+ # Create tasks
+ tasks = [
+ asyncio.create_task(audio_generator(play_queue, TEXT, VOICE, OUTPUT_FORMAT)),
+ asyncio.create_task(audio_player(play_queue))
+ ]
+
+ try:
+ await asyncio.gather(*tasks)
+
+    except Exception as e:
+        print(f"[Main] An error occurred: {e}")
+        for task in tasks:
+ task.cancel()
+ await asyncio.gather(*tasks, return_exceptions=True)
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/examples/voice/cli/README.md b/examples/voice/cli/README.md
index ffc89f42..7da08bcb 100644
--- a/examples/voice/cli/README.md
+++ b/examples/voice/cli/README.md
@@ -11,7 +11,7 @@ Real-time transcription tool using the Speechmatics Voice SDK. Supports micropho
python cli.py -k YOUR_API_KEY -p
# Example that saves the output in verbose mode using a preset
-python cli.py -k YOUR_API_KEY -vvvvvpDSr -P conversation_smart_turn
+python cli.py -k YOUR_API_KEY -vvvvvpDSr -P smart_turn
```
Output saved to `./output/YYYYMMDD_HHMMSS/log.jsonl`
@@ -38,9 +38,8 @@ Press `CTRL+C` to stop.
Common short codes:
- `-k` API key | `-i` input file | `-o` output dir | `-p` pretty print | `-v` verbose
-- `-r` record | `-S` save slices | `-P` preset | `-W` show config
-- `-l` language | `-m` mode | `-d` max delay | `-t` silence trigger
-- `-f` focus speakers | `-s` known speakers | `-E` enrol
+- `-r` record | `-P` preset | `-w` show compact config | `-W` show complete config
+- `-s` known speakers | `-E` enrol
### Core
@@ -55,9 +54,7 @@ Common short codes:
- Inside session directory:
- `log.jsonl` - All events with timestamps
- `recording.wav` - Microphone recording (if `-r` is used)
- - `slice_*.wav` and `slice_*.json` - Audio slices (if `-S` is used)
- `-r, --record` - Record microphone audio to recording.wav (microphone input only)
-- `-S, --save-slices` - Save audio slices on SPEAKER_ENDED events (SMART_TURN mode only)
- `-p, --pretty` - Formatted console output with colors
- `-v, --verbose` - Increase verbosity (can repeat: `-v`, `-vv`, `-vvv`, `-vvvv`, `-vvvvv`)
- `-v` - Add speaker VAD events
@@ -67,48 +64,35 @@ Common short codes:
- `-vvvvv` - Add STT events
- `-L, --legacy` - Show only legacy transcript messages
- `-D, --default-device` - Use default audio device (skip selection)
-- `-w, --results` - Include word-level results in segments
+- `--results` - Include word-level results in segments
### Audio
-- `-R, --sample-rate` - Sample rate in Hz (default: 16000)
-- `-C, --chunk-size` - Chunk size in bytes (default: 320)
+- `--sample-rate` - Sample rate in Hz (default: 16000)
+- `--chunk-size` - Chunk size in bytes (default: 320)
- `-M, --mute` - Mute audio playback for file input
### Voice Agent Config
-**Configuration Priority:**
+**Configuration (Required):**
-1. Use `--preset` to start with a preset configuration (recommended)
-2. Use `-c/--config` to provide a complete JSON configuration
-3. Use individual parameters (`-l`, `-d`, `-t`, `-m`) to override preset settings or create custom config
-
-**Preset Options:**
-
-- `-P, --preset` - Use preset configuration: `scribe`, `low_latency`, `conversation_adaptive`, `conversation_smart_turn`, or `captions`
-- `--list-presets` - List available presets and exit
-- `-W, --show` - Display the final configuration as JSON and exit (after applying preset/config and overrides)
-
-**Configuration Options:**
+You must provide either a preset or a config file:
+- `-P, --preset` - Use preset configuration: `scribe`, `fast`, `adaptive`, `smart_turn`, or `captions`
- `-c, --config` - JSON config string or file path (complete configuration)
-- `-l, --language` - Language code (overrides preset if used together)
-- `-d, --max-delay` - Max transcription delay in seconds (overrides preset if used together)
-- `-t, --end-of-utterance-silence-trigger` - Silence duration for turn end in seconds (overrides preset if used together)
-- `-m, --end-of-utterance-mode` - Turn detection mode: `FIXED`, `ADAPTIVE`, `SMART_TURN`, or `EXTERNAL` (overrides preset if used together)
+- `--list-presets` - List available presets and exit
-**Note:** When using `-c/--config`, you cannot use `-l`, `-d`, `-t`, `-m`, `-f`, `-I`, `-x`, or `-s` as the config JSON should contain all settings.
+**Note:** `--preset` and `--config` are mutually exclusive. You cannot use both together.
-### Speaker Management
+**Display Configuration:**
-- `-f, --focus-speakers` - Speakers to focus on (e.g., `S1 S2`)
-- `-I, --ignore-speakers` - Speakers to ignore (e.g., `S1 S2`)
-- `-x, --ignore-mode` - Use ignore mode (instead of retain) for focus speakers
+- `-w, --show-compact` - Display compact configuration as JSON and exit (excludes unset and None values)
+- `-W, --show-complete` - Display complete configuration as JSON and exit (includes all defaults)
### Speaker Identification
- `-E, --enrol` - Enrol speakers and output identifiers at end
-- `-s, --speakers` - Known speakers JSON string or file path
+- `-s, --speakers` - Known speakers JSON string or file path (can be used with preset or config)
## Examples
@@ -118,16 +102,16 @@ Common short codes:
python cli.py --list-presets
```
-**Show config (from preset):**
+**Show compact config (from preset):**
```bash
-python cli.py -P scribe -W
+python cli.py -P scribe -w
```
-**Show config (with overrides):**
+**Show complete config (from preset):**
```bash
-python cli.py -P scribe -l fr -d 1.0 -W
+python cli.py -P scribe -W
```
**Use preset:**
@@ -136,16 +120,10 @@ python cli.py -P scribe -l fr -d 1.0 -W
python cli.py -k YOUR_KEY -P scribe -p
```
-**Use preset with overrides:**
+**Basic microphone (requires preset or config):**
```bash
-python cli.py -k YOUR_KEY -P scribe -l fr -d 1.0 -p
-```
-
-**Basic microphone:**
-
-```bash
-python cli.py -k YOUR_KEY -p
+python cli.py -k YOUR_KEY -P adaptive -p
```
Output saved to `./output/YYYYMMDD_HHMMSS/log.jsonl`
@@ -153,7 +131,7 @@ Output saved to `./output/YYYYMMDD_HHMMSS/log.jsonl`
**Record microphone audio:**
```bash
-python cli.py -k YOUR_KEY -r -p
+python cli.py -k YOUR_KEY -P adaptive -r -p
```
Recording saved to `./output/YYYYMMDD_HHMMSS/recording.wav`
@@ -161,57 +139,35 @@ Recording saved to `./output/YYYYMMDD_HHMMSS/recording.wav`
**Custom output directory:**
```bash
-python cli.py -k YOUR_KEY -o ./my_sessions -p
+python cli.py -k YOUR_KEY -P adaptive -o ./my_sessions -p
```
Output saved to `./my_sessions/YYYYMMDD_HHMMSS/log.jsonl`
-**EXTERNAL mode with manual turn control:**
-
-```bash
-python cli.py -k YOUR_KEY -m EXTERNAL -p
-```
-
-Press 't' or 'T' to manually signal end of turn.
-
-**Save audio slices (SMART_TURN mode):**
-
-```bash
-python cli.py -k YOUR_KEY -P conversation_smart_turn -S -p
-```
-
-Audio slices (~8 seconds) saved to `./output/YYYYMMDD_HHMMSS/slice_*.wav` with matching `.json` metadata files on each SPEAKER_ENDED event.
-
**Audio file:**
```bash
-python cli.py -k YOUR_KEY -i audio.wav -p
+python cli.py -k YOUR_KEY -P scribe -i audio.wav -p
```
**Audio file (muted):**
```bash
-python cli.py -k YOUR_KEY -i audio.wav -Mp
+python cli.py -k YOUR_KEY -P scribe -i audio.wav -Mp
```
**Verbose logging:**
```bash
-python cli.py -k YOUR_KEY -vv -p
+python cli.py -k YOUR_KEY -P adaptive -vv -p
```
Shows additional events (speaker VAD, turn predictions, etc.)
-**Focus on speakers:**
-
-```bash
-python cli.py -k YOUR_KEY -f S1 S2 -p
-```
-
**Enrol speakers:**
```bash
-python cli.py -k YOUR_KEY -Ep
+python cli.py -k YOUR_KEY -P adaptive -Ep
```
Press `CTRL+C` when done to see speaker identifiers.
@@ -219,7 +175,7 @@ Press `CTRL+C` when done to see speaker identifiers.
**Use known speakers:**
```bash
-python cli.py -k YOUR_KEY -s speakers.json -p
+python cli.py -k YOUR_KEY -P adaptive -s speakers.json -p
```
Example `speakers.json`:
@@ -231,12 +187,18 @@ Example `speakers.json`:
]
```
-**Custom config:**
+**Custom config file:**
```bash
python cli.py -k YOUR_KEY -c config.json -p
```
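+
+A minimal illustrative `config.json` might look like this (field names follow the `VoiceAgentConfig` parameters documented in the Voice SDK README; adjust values to your use case):
+
+```json
+{
+  "language": "en",
+  "enable_diarization": true,
+  "max_delay": 0.7
+}
+```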
+**Custom config with known speakers:**
+
+```bash
+python cli.py -k YOUR_KEY -c config.json -s speakers.json -p
+```
+
## Notes
- Output directory (`-o`) defaults to `./output`
@@ -244,10 +206,7 @@ python cli.py -k YOUR_KEY -c config.json -p
- Session directory contains:
- `log.jsonl` - All events with timestamps
- `recording.wav` - Microphone recording (if `-r` is used)
- - `slice_*.wav` and `slice_*.json` - Audio slices (if `--save-slices` is used in SMART_TURN mode)
- Session subdirectories prevent accidental data loss from multiple runs
-- Audio slices are ~8 seconds and saved on each SPEAKER_ENDED event
-- JSON metadata includes event details, speaker ID, timing, and slice duration
- Speaker identifiers are encrypted and unique to your API key
- Allow speakers to say at least 20 words before enrolling
- Avoid labels `S1`, `S2` (reserved by engine)
diff --git a/examples/voice/cli/cli.py b/examples/voice/cli/cli.py
index 67d79583..5dc53ab2 100644
--- a/examples/voice/cli/cli.py
+++ b/examples/voice/cli/cli.py
@@ -24,8 +24,6 @@
from speechmatics.voice import AdditionalVocabEntry
from speechmatics.voice import AgentServerMessageType
from speechmatics.voice import EndOfUtteranceMode
-from speechmatics.voice import SpeakerFocusConfig
-from speechmatics.voice import SpeakerFocusMode
from speechmatics.voice import SpeakerIdentifier
from speechmatics.voice import VoiceAgentClient
from speechmatics.voice import VoiceAgentConfig
@@ -35,8 +33,6 @@
# CONSTANTS
# ==============================================================================
-# Audio slice duration (seconds of audio to capture before speaker ends)
-AUDIO_SLICE_DURATION = 8.0
# Default output directory
DEFAULT_OUTPUT_DIR = "./output"
@@ -48,6 +44,7 @@
# Console colors for message types
COLORS = {
# Segments
+ "Diagnostics": "\033[90m",
"AddPartialSegment": "\033[93m",
"AddSegment": "\033[1;92m",
# Speaker events
@@ -59,6 +56,8 @@
"StartOfTurn": "\033[91m",
"EndOfTurnPrediction": "\033[95m",
"EndOfTurn": "\033[1;91m",
+ # VAD status
+ "VadStatus": "\033[41;97m",
# Transcript events
"AddPartialTranscript": "\033[90m",
"AddTranscript": "\033[90m",
@@ -94,7 +93,7 @@ async def main() -> None:
return
# Setup audio source (microphone or file) - skip if just showing config
- if not args.show:
+ if not args.show_compact and not args.show_complete:
audio_source = setup_audio_source(args)
if not audio_source:
return
@@ -125,15 +124,10 @@ async def main() -> None:
# Setup file paths
log_file = output_dir / LOG_FILENAME
record_file = output_dir / RECORDING_FILENAME if args.record else None
- slices_dir = output_dir if args.save_slices else None
# Store in args for easy access
args.log_file = str(log_file)
args.record_file = str(record_file) if record_file else None
- args.slices_dir = str(slices_dir) if slices_dir else None
-
- # Create speaker configuration
- speaker_config = create_speaker_config(args)
# Known speakers
known_speakers: list[SpeakerIdentifier] = [SpeakerIdentifier(**s) for s in args.speakers] if args.speakers else []
@@ -141,7 +135,7 @@ async def main() -> None:
# Use JSON config
if args.config is not None:
try:
- config = VoiceAgentConfig.model_validate(args.config)
+ config = VoiceAgentConfig.from_dict(args.config)
except Exception as e:
print(f"Error validating config: {e}")
return
@@ -162,27 +156,22 @@ async def main() -> None:
]
)
- # Copy in overrides
- if args.language:
- config.language = args.language
- if args.end_of_utterance_silence_trigger:
- config.end_of_utterance_silence_trigger = args.end_of_utterance_silence_trigger
- if args.max_delay:
- config.max_delay = args.max_delay
- if args.end_of_utterance_mode:
- config.end_of_utterance_mode = args.end_of_utterance_mode
-
- # Copy speaker settings
- config.speaker_config = speaker_config
+ # Copy speaker settings (only known_speakers can be overridden)
config.known_speakers = known_speakers
config.include_results = args.results
+ # Set chunk size
+ config.chunk_size = args.chunk_size
+
# Set common items
config.enable_diarization = True
# Handle config display
- if args.show:
- print(config.model_dump_json(indent=2, exclude_unset=True, exclude_none=True))
+ if args.show_compact:
+ print(config.to_json(indent=2, exclude_unset=True, exclude_none=True))
+ return
+ if args.show_complete:
+ print(config.to_json(indent=2, exclude_unset=False, exclude_none=False))
return
# Set the audio sample rate
@@ -362,31 +351,6 @@ def setup_audio_output(audio_source: dict, args) -> AudioPlayer | None:
return audio_player
-# ==============================================================================
-# SPEAKER CONFIGURATION
-# ==============================================================================
-
-
-def create_speaker_config(args) -> SpeakerFocusConfig:
- """Create speaker diarization configuration from arguments.
-
- Args:
- args: Command-line arguments
-
- Returns:
- SpeakerFocusConfig instance.
- """
- if args.focus_speakers or args.ignore_speakers:
- focus_mode = SpeakerFocusMode.IGNORE if args.ignore_mode else SpeakerFocusMode.RETAIN
- return SpeakerFocusConfig(
- focus_speakers=args.focus_speakers or [],
- ignore_speakers=args.ignore_speakers or [],
- focus_mode=focus_mode,
- )
- else:
- return SpeakerFocusConfig()
-
-
# ==============================================================================
# EVENT HANDLERS
# ==============================================================================
@@ -401,70 +365,6 @@ def register_event_handlers(client: VoiceAgentClient, args, start_time: datetime
start_time: Start time for timestamp calculation
"""
- # Audio slice counter
- slice_counter = {"count": 0}
-
- async def async_save_audio_slice(message: dict) -> None:
- """Save audio slice when speaker ends (SMART_TURN mode only)."""
- if not args.slices_dir:
- return
-
- # Only save slices in SMART_TURN mode
- if client._config.end_of_utterance_mode != "smart_turn":
- return
-
- # Get time from message
- event_time = message.get("time")
- if not event_time:
- return
-
- speaker_id = message.get("speaker_id", "unknown")
-
- # Get audio slice from buffer
- # Capture audio leading up to the speaker ending
- start_time = event_time - AUDIO_SLICE_DURATION
- end_time = event_time
-
- try:
- audio_data = await client._audio_buffer.get_frames(
- start_time=start_time,
- end_time=end_time,
- )
-
- if audio_data:
- # Generate filenames
- slice_counter["count"] += 1
- base_filename = f"slice_{slice_counter['count']:04d}_{speaker_id}_{event_time:.2f}"
- wav_filepath = Path(args.slices_dir) / f"{base_filename}.wav"
- json_filepath = Path(args.slices_dir) / f"{base_filename}.json"
-
- # Save audio file
- async with AudioFileWriter(
- str(wav_filepath), client._audio_sample_rate, client._audio_sample_width
- ) as writer:
- await writer.write(audio_data)
-
- # Save JSON metadata
- metadata = {
- "message": message,
- "speaker_id": speaker_id,
- "is_active": message.get("is_active"),
- "time": event_time,
- "slice_start_time": start_time,
- "slice_end_time": end_time,
- "slice_duration": end_time - start_time,
- "audio_file": f"{base_filename}.wav",
- }
- with open(json_filepath, "w") as f:
- json.dump(metadata, f, indent=2)
-
- except Exception as e:
- print(f"Error saving audio slice: {e}")
-
- def save_audio_slice(message: dict) -> None:
- """Save audio slice when speaker ends (SMART_TURN mode only)."""
- asyncio.create_task(async_save_audio_slice(message))
-
def console_print(ts: datetime.datetime, message: dict) -> None:
"""Print message to console with optional formatting."""
if not args.pretty:
@@ -482,14 +382,11 @@ def console_print(ts: datetime.datetime, message: dict) -> None:
_segs = []
for segment in message["segments"]:
suffix = "" if segment["is_active"] else " (background)"
- if args.verbose >= 3:
- _segs.append(f"@{segment['speaker_id']}{suffix}: `{segment['text']}` {segment['annotation']}")
- else:
- _segs.append(f"@{segment['speaker_id']}{suffix}: `{segment['text']}`")
+ _segs.append(f"@{segment['speaker_id']}{suffix}: `{segment['text']}` {segment.get('annotation', '')}")
payload = {"segments": _segs}
# Print to console
- print(f"{color}{ts_str} {msg_type:<24} {json.dumps(payload)}\033[0m")
+ print(f"{color}{ts_str} {client._total_time:>7.3f} {msg_type:<24} {json.dumps(payload)}\033[0m")
def log_message(message: dict[str, Any]) -> None:
"""Log message to console and optional JSONL file."""
@@ -502,8 +399,8 @@ def log_message(message: dict[str, Any]) -> None:
# Register standard handlers
client.on(AgentServerMessageType.INFO, log_message)
- client.on(AgentServerMessageType.RECOGNITION_STARTED, log_message)
- client.on(AgentServerMessageType.END_OF_TRANSCRIPT, log_message)
+ client.once(AgentServerMessageType.RECOGNITION_STARTED, log_message)
+ client.once(AgentServerMessageType.END_OF_TRANSCRIPT, log_message)
# Voice SDK messages
if not args.legacy:
@@ -518,24 +415,22 @@ def log_message(message: dict[str, Any]) -> None:
if args.verbose >= 1:
client.on(AgentServerMessageType.SPEAKER_STARTED, log_message)
client.on(AgentServerMessageType.SPEAKER_ENDED, log_message)
-
- # Save audio slices on SPEAKER_ENDED (SMART_TURN mode only)
- if args.slices_dir:
- client.on(AgentServerMessageType.SPEAKER_ENDED, save_audio_slice)
+ client.on(AgentServerMessageType.VAD_STATUS, log_message)
+ client.on(AgentServerMessageType.DIAGNOSTICS, log_message)
# Verbose turn prediction
if args.verbose >= 2:
client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, log_message)
+ client.on(AgentServerMessageType.SMART_TURN_RESULT, log_message)
# Metrics
- if args.verbose >= 4:
+ if args.verbose >= 3:
client.on(AgentServerMessageType.SESSION_METRICS, log_message)
client.on(AgentServerMessageType.SPEAKER_METRICS, log_message)
# Verbose STT events
- if args.verbose >= 5:
+ if args.verbose >= 4:
client.on(AgentServerMessageType.END_OF_UTTERANCE, log_message)
- client.on("ForcedEndOfUtterance", log_message)
client.on(AgentServerMessageType.ADD_PARTIAL_TRANSCRIPT, log_message)
client.on(AgentServerMessageType.ADD_TRANSCRIPT, log_message)
@@ -550,7 +445,7 @@ def log_message(message: dict[str, Any]) -> None:
log_message(
{
"message": "VoiceAgentClientConfig",
- "config": client._config.model_dump(exclude_none=True, exclude_unset=True),
+ "config": client._config.to_dict(exclude_none=True, exclude_unset=True),
}
)
@@ -751,18 +646,24 @@ def parse_args():
"-P",
"--preset",
type=str,
- help="Preset configuration name (e.g., scribe, low_latency, conversation_adaptive)",
+ help="Preset configuration name (e.g., scribe, fast, adaptive)",
)
parser.add_argument(
"--list-presets",
action="store_true",
help="List available preset configurations and exit",
)
+ parser.add_argument(
+ "-w",
+ "--show-compact",
+ action="store_true",
+ help="Display the compact configuration as JSON and exit (excludes unset and None values)",
+ )
parser.add_argument(
"-W",
- "--show",
+ "--show-complete",
action="store_true",
- help="Display the final configuration as JSON and exit (after applying preset/config and overrides)",
+ help="Display the complete configuration as JSON and exit (includes all defaults)",
)
parser.add_argument(
"-c",
@@ -800,18 +701,16 @@ def parse_args():
# ==============================================================================
parser.add_argument(
- "-R",
"--sample-rate",
type=int,
default=16000,
help="Audio sample rate in Hz (default: 16000)",
)
parser.add_argument(
- "-C",
"--chunk-size",
type=int,
- default=320,
- help="Audio chunk size in bytes (default: 320)",
+ default=160,
+ help="Audio chunk size in bytes (default: 160)",
)
parser.add_argument(
"-M",
@@ -824,12 +723,6 @@ def parse_args():
# Output options
# ==============================================================================
- parser.add_argument(
- "-S",
- "--save-slices",
- action="store_true",
- help="Save audio slices to output directory on SPEAKER_ENDED events (SMART_TURN mode only)",
- )
parser.add_argument(
"-p",
"--pretty",
@@ -856,65 +749,11 @@ def parse_args():
help="Use default device (default: False)",
)
parser.add_argument(
- "-w",
"--results",
action="store_true",
help="Include word-level transcription results in output (default: False)",
)
- # ==============================================================================
- # Voice Agent configuration overrides
- # ==============================================================================
-
- parser.add_argument(
- "-l",
- "--language",
- type=str,
- help="Language code (default: en)",
- )
- parser.add_argument(
- "-d",
- "--max-delay",
- type=float,
- help="Maximum delay for transcription results in seconds (default: 0.7)",
- )
- parser.add_argument(
- "-t",
- "--end-of-utterance-silence-trigger",
- type=float,
- help="Silence duration to trigger end of utterance in seconds (default: 0.5)",
- )
- parser.add_argument(
- "-m",
- "--end-of-utterance-mode",
- type=lambda s: s.upper(),
- choices=["FIXED", "ADAPTIVE", "EXTERNAL", "SMART_TURN"],
- help="End of utterance detection mode (default: ADAPTIVE)",
- )
-
- # ==============================================================================
- # Speaker management
- # ==============================================================================
-
- parser.add_argument(
- "-f",
- "--focus-speakers",
- nargs="*",
- help="Speakers to focus on (e.g., S1 S2). Use with --ignore-mode to ignore these speakers instead",
- )
- parser.add_argument(
- "-I",
- "--ignore-speakers",
- nargs="*",
- help="Specific speakers to ignore (e.g., S1 S2)",
- )
- parser.add_argument(
- "-x",
- "--ignore-mode",
- action="store_true",
- help="Use IGNORE mode instead of RETAIN mode for non-focus speakers",
- )
-
# ==============================================================================
# Speaker identification
# ==============================================================================
@@ -938,26 +777,21 @@ def parse_args():
args = parser.parse_args()
- mutually_excludive = [
- "preset",
- "end-of-utterance-mode",
- "end-of-utterance-silence-trigger",
- "focus-speakers",
- "ignore-mode",
- "ignore-speakers",
- "language",
- "max-delay",
- "speakers",
- ]
-
- if args.config is not None:
- conflicts: list[str] = []
- for arg in mutually_excludive:
- if getattr(args, arg.replace("-", "_")):
- conflicts.append(arg)
- if conflicts:
- print(f"**ERROR** -> You cannot use {[f'--{arg}' for arg in conflicts]} in combination with -c/--config")
- exit(1)
+ # Either preset or config must be provided
+ if (
+ args.config is None
+ and args.preset is None
+ and not args.list_presets
+ and not args.show_compact
+ and not args.show_complete
+ ):
+ print("**ERROR** -> You must provide either --preset or --config")
+ exit(1)
+
+ # Preset and config are mutually exclusive
+ if args.config is not None and args.preset is not None:
+ print("**ERROR** -> You cannot use both --preset and --config")
+ exit(1)
# Return the parsed arguments
return args
diff --git a/examples/voice/simple/simple.py b/examples/voice/simple/simple.py
index aa0afe93..00312cab 100644
--- a/examples/voice/simple/simple.py
+++ b/examples/voice/simple/simple.py
@@ -10,7 +10,10 @@
from speechmatics.rt import Microphone
from speechmatics.voice import AgentServerMessageType
+from speechmatics.voice import SpeechSegmentConfig
from speechmatics.voice import VoiceAgentClient
+from speechmatics.voice import VoiceAgentConfig
+from speechmatics.voice import VoiceAgentConfigPreset
async def main() -> None:
@@ -28,15 +31,20 @@ async def main() -> None:
print("Error: PyAudio not available - install with: pip install pyaudio")
return
+ # Config
+ config = VoiceAgentConfigPreset.FAST(
+ VoiceAgentConfig(speech_segment_config=SpeechSegmentConfig(emit_sentences=False))
+ )
+
# Create client
- client = VoiceAgentClient(api_key=api_key, preset="scribe")
+ client = VoiceAgentClient(api_key=api_key, config=config)
# Handle partial segments (interim results)
- @client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT)
- def on_partial_segment(message):
- segments = message.get("segments", [])
- for segment in segments:
- print(f"[PARTIAL] {segment['speaker_id']}: {segment['text']}")
+ # @client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT)
+ # def on_partial_segment(message):
+ # segments = message.get("segments", [])
+ # for segment in segments:
+ # print(f"[PARTIAL] {segment['speaker_id']}: {segment['text']}")
# Handle final segments
@client.on(AgentServerMessageType.ADD_SEGMENT)
diff --git a/sdk/rt/speechmatics/rt/_auth.py b/sdk/rt/speechmatics/rt/_auth.py
index 6968b367..ee75bcac 100644
--- a/sdk/rt/speechmatics/rt/_auth.py
+++ b/sdk/rt/speechmatics/rt/_auth.py
@@ -44,9 +44,6 @@ class StaticKeyAuth(AuthBase):
def __init__(self, api_key: Optional[str] = None):
self._api_key = api_key or os.environ.get("SPEECHMATICS_API_KEY")
- if not self._api_key:
- raise ValueError("API key required: provide api_key or set SPEECHMATICS_API_KEY")
-
async def get_auth_headers(self) -> dict[str, str]:
return {"Authorization": f"Bearer {self._api_key}"}
diff --git a/sdk/rt/speechmatics/rt/_models.py b/sdk/rt/speechmatics/rt/_models.py
index 63680b74..84e57204 100644
--- a/sdk/rt/speechmatics/rt/_models.py
+++ b/sdk/rt/speechmatics/rt/_models.py
@@ -1,5 +1,6 @@
from __future__ import annotations
+import ssl
from dataclasses import asdict
from dataclasses import dataclass
from dataclasses import field
@@ -462,9 +463,9 @@ class ConnectionConfig:
close_timeout: Timeout for closing WebSocket connection.
max_size: Maximum message size in bytes.
max_queue: Maximum number of messages in receive queue.
- read_limit: Maximum number of bytes to read from WebSocket.
- write_limit: Maximum number of bytes to write to WebSocket.
-
+ read_limit: Maximum number of bytes to read from WebSocket (legacy websockets only).
+ write_limit: Maximum number of bytes to write to WebSocket (legacy websockets only).
+ ssl_context: SSL context for the WebSocket connection.
Returns:
Websocket connection configuration as a dict while excluding None values.
"""
@@ -477,9 +478,29 @@ class ConnectionConfig:
max_queue: Optional[int] = None
read_limit: Optional[int] = None
write_limit: Optional[int] = None
+ ssl_context: ssl.SSLContext = field(default_factory=ssl.create_default_context)
def to_dict(self) -> dict[str, Any]:
- return asdict(self, dict_factory=lambda x: {k: v for (k, v) in x if v is not None})
+ """Convert to dict, excluding ssl field to avoid pickle errors."""
+ result = {}
+ if self.open_timeout is not None:
+ result["open_timeout"] = self.open_timeout
+ if self.ping_interval is not None:
+ result["ping_interval"] = self.ping_interval
+ if self.ping_timeout is not None:
+ result["ping_timeout"] = self.ping_timeout
+ if self.close_timeout is not None:
+ result["close_timeout"] = self.close_timeout
+ if self.max_size is not None:
+ result["max_size"] = self.max_size
+ if self.max_queue is not None:
+ result["max_queue"] = self.max_queue
+ if self.read_limit is not None:
+ result["read_limit"] = self.read_limit
+ if self.write_limit is not None:
+ result["write_limit"] = self.write_limit
+
+ return result
@dataclass
diff --git a/sdk/rt/speechmatics/rt/_transport.py b/sdk/rt/speechmatics/rt/_transport.py
index 4501dbcc..e33a0203 100644
--- a/sdk/rt/speechmatics/rt/_transport.py
+++ b/sdk/rt/speechmatics/rt/_transport.py
@@ -24,12 +24,14 @@
from websockets.asyncio.client import connect
WS_HEADERS_KEY = "additional_headers"
+ IS_LEGACY_WEBSOCKETS = False
except ImportError:
# Fall back to legacy websockets
from websockets.legacy.client import WebSocketClientProtocol
from websockets.legacy.client import connect # type: ignore
WS_HEADERS_KEY = "extra_headers"
+ IS_LEGACY_WEBSOCKETS = True
class Transport:
@@ -116,8 +118,14 @@ async def connect(self, ws_headers: Optional[dict] = None) -> None:
ws_kwargs: dict = {
WS_HEADERS_KEY: ws_headers,
**self._conn_config.to_dict(),
+ "ssl": self._conn_config.ssl_context,
}
+ # Filter out parameters not supported by new websockets >=13.0
+ if not IS_LEGACY_WEBSOCKETS:
+ ws_kwargs.pop("read_limit", None)
+ ws_kwargs.pop("write_limit", None)
+
self._websocket = await connect(
url_with_params,
**ws_kwargs,
diff --git a/sdk/tts/speechmatics/tts/_models.py b/sdk/tts/speechmatics/tts/_models.py
index fdbca0e6..572e598c 100644
--- a/sdk/tts/speechmatics/tts/_models.py
+++ b/sdk/tts/speechmatics/tts/_models.py
@@ -50,8 +50,10 @@ class Voice(str, Enum):
sarah: English (UK) female voice.
theo: English (UK) male voice.
megan: English (UK) female voice.
+ jack: English (US) male voice.
"""
SARAH = "sarah"
THEO = "theo"
MEGAN = "megan"
+ JACK = "jack"
diff --git a/sdk/voice/README.md b/sdk/voice/README.md
index 9ff5d7d2..39e6e953 100644
--- a/sdk/voice/README.md
+++ b/sdk/voice/README.md
@@ -4,11 +4,26 @@
[](https://pypi.org/project/speechmatics-voice/)
[](https://www.python.org/)
-Python SDK for building voice-enabled applications with the Speechmatics Real-Time API. Optimized for conversational AI, voice agents, transcription services, and real-time captioning.
+Python SDK for building voice-enabled applications using Speechmatics Real-Time API. Optimized for specific use cases: conversational AI, voice agents, transcription services, and real-time captioning.
+
+## Table of Contents
+- [What is the Voice SDK?](#what-is-the-voice-sdk)
+- [Installation](#installation)
+- [Quick Start](#quick-start)
+- [Configuration](#configuration)
+- [Event Messages](#event-messages)
+- [Common Usage Patterns](#common-usage-patterns)
+- [Environment Variables](#environment-variables)
+- [Examples](#examples)
+- [SDK Class Reference](#sdk-class-reference)
+- [Requirements](#requirements)
+- [Documentation](#documentation)
+- [License](#license)
+
## What is the Voice SDK?
-The Voice SDK is a higher-level abstraction built on top of the Speechmatics Real-Time API (`speechmatics-rt`). While the Real-Time SDK provides raw transcription events (words and utterances), the Voice SDK adds:
+The Voice SDK is a higher-level abstraction built on top of the Speechmatics Real-Time API (`speechmatics-rt`). While the Real-Time API provides raw transcription events (words and utterances), the Voice SDK adds:
- **Intelligent Segmentation** - Groups words into meaningful speech segments per speaker
- **Turn Detection** - Automatically detects when speakers finish their turns using adaptive or ML-based methods
@@ -16,21 +31,20 @@ The Voice SDK is a higher-level abstraction built on top of the Speechmatics Rea
- **Preset Configurations** - Ready-to-use configs for common use cases (conversation, note-taking, captions)
- **Simplified Event Handling** - Receive clean, structured segments instead of raw word-level events
-### When to Use Voice SDK vs Real-Time SDK
+### When to Use Voice SDK vs Real-Time API
**Use Voice SDK when:**
-- Building conversational AI or voice agents
+- You are building conversational AI or voice agents
- You need automatic turn detection
- You want speaker-focused transcription
- You need ready-to-use presets for common scenarios
-**Use Real-Time SDK when:**
+**Use Real-Time API when:**
-- You need raw word-level events
-- Building custom segmentation logic
+- You only need raw, word-level events
+- You are building custom segmentation / aggregation logic
- You want fine-grained control over every event
-- Processing batch files or custom workflows
## Installation
@@ -38,16 +52,53 @@ The Voice SDK is a higher-level abstraction built on top of the Speechmatics Rea
# Standard installation
pip install speechmatics-voice
-# With SMART_TURN (ML-based turn detection)
+# With VAD and SMART_TURN (ML-based turn detection)
pip install speechmatics-voice[smart]
```
-> **Note:** `SMART_TURN` requires additional ML dependencies (ONNX runtime, transformers). If not installed, it automatically falls back to `ADAPTIVE` mode.
+> **Note:** Some features require additional ML dependencies (ONNX runtime, transformers). If not installed, these features will be unavailable and a warning will be shown.
+
+
+
+👉 Using Docker? Click to see how to install the required models.
+
+### Use within Docker
+
+If you are running the Voice SDK inside a Docker container and you require the smart features (`SMART_TURN`), you can use the following approach to ensure the ML models are included in the image rather than downloaded at runtime. First, create a script (e.g. `models.py`) that downloads the required models:
+
+```python
+"""
+Download the Voice SDK required models during the build process.
+"""
+
+from speechmatics.voice import SileroVAD, SmartTurnDetector
+
+
+def load_models():
+ SileroVAD.download_model()
+ SmartTurnDetector.download_model()
+
+
+if __name__ == "__main__":
+ load_models()
+```
+
+Then, in your `Dockerfile`, include the following:
+
+```
+COPY ./models.py models.py
+RUN uv run models.py
+```
+This copies the script and runs it as part of the build.
+
+
## Quick Start
### Basic Example
+A simple example that shows complete sentences as they have been finalized, with different speakers shown with different IDs.
+
```python
import asyncio
import os
@@ -55,13 +106,20 @@ from speechmatics.rt import Microphone
from speechmatics.voice import VoiceAgentClient, AgentServerMessageType
async def main():
+ """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset"""
+
+ # Audio configuration
+ SAMPLE_RATE = 16000 # Hz
+ CHUNK_SIZE = 160 # Samples per read
+ PRESET = "scribe" # Configuration preset
+
# Create client with preset
client = VoiceAgentClient(
api_key=os.getenv("SPEECHMATICS_API_KEY"),
- preset="scribe"
+ preset=PRESET
)
- # Handle final segments
+ # Print finalised segments of speech with speaker ID
@client.on(AgentServerMessageType.ADD_SEGMENT)
def on_segment(message):
for segment in message["segments"]:
@@ -70,17 +128,20 @@ async def main():
print(f"{speaker}: {text}")
# Setup microphone
- mic = Microphone(sample_rate=16000, chunk_size=320)
+ mic = Microphone(SAMPLE_RATE, CHUNK_SIZE)
if not mic.start():
print("Error: Microphone not available")
return
- # Connect and stream
+ # Connect to the Voice Agent
await client.connect()
+ # Stream microphone audio (interruptable using keyboard)
try:
while True:
- audio_chunk = await mic.read(320)
+ audio_chunk = await mic.read(CHUNK_SIZE)
+ if not audio_chunk:
+ break # Microphone stopped producing data
await client.send_audio(audio_chunk)
except KeyboardInterrupt:
pass
@@ -91,35 +152,40 @@ if __name__ == "__main__":
asyncio.run(main())
```
-### Using Presets
-
-Presets provide optimized configurations for common use cases:
+### Configuring a Voice Agent Client
+When creating a `VoiceAgentClient`, there are several ways to configure it:
+
+1. **Presets** - optimised configurations for common use cases. These require no further configuration.
```python
+# Low latency preset - for fast responses (may split speech into smaller segments)
+client = VoiceAgentClient(api_key=api_key, preset="fast")
+
+# Conversation preset - for natural dialogue
+client = VoiceAgentClient(api_key=api_key, preset="adaptive")
+
+# Advanced conversation with ML turn detection
+client = VoiceAgentClient(api_key=api_key, preset="smart_turn")
+
# External end of turn preset - endpointing handled by the client
client = VoiceAgentClient(api_key=api_key, preset="external")
# Scribe preset - for note-taking
client = VoiceAgentClient(api_key=api_key, preset="scribe")
-# Low latency preset - for fast responses
-client = VoiceAgentClient(api_key=api_key, preset="low_latency")
-
-# Conversation preset - for natural dialogue
-client = VoiceAgentClient(api_key=api_key, preset="conversation_adaptive")
-
-# Advanced conversation with ML turn detection
-client = VoiceAgentClient(api_key=api_key, preset="conversation_smart_turn")
-
# Captions preset - for live captioning
client = VoiceAgentClient(api_key=api_key, preset="captions")
+
+# To view all available presets, use:
+presets = VoiceAgentConfigPreset.list_presets()
```
-### Custom Configuration
+
+2. **Custom Configuration** - for more control, you can specify a custom configuration in a `VoiceAgentConfig` object.
```python
from speechmatics.voice import VoiceAgentClient, VoiceAgentConfig, EndOfUtteranceMode
+# Define your custom configuration
config = VoiceAgentConfig(
language="en",
enable_diarization=True,
@@ -130,62 +196,75 @@ config = VoiceAgentConfig(
client = VoiceAgentClient(api_key=api_key, config=config)
```
-## Configuration
+3. **Custom Configuration with Overlays** - you can use presets as a starting point, and then customize with overlays.
-### Basic Parameters
-
-**`language`** (str, default: `"en"`)
-Language code for transcription (e.g., `"en"`, `"es"`, `"fr"`). See [supported languages](https://docs.speechmatics.com/speech-to-text/languages).
+```python
+from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig
-**`operating_point`** (OperatingPoint, default: `ENHANCED`)
-Balance accuracy vs latency. Options: `STANDARD` or `ENHANCED`.
+# Use preset with custom overrides
+config = VoiceAgentConfigPreset.SCRIBE(
+ VoiceAgentConfig(
+ language="es",
+ max_delay=0.8
+ )
+)
+```
-**`domain`** (str, default: `None`)
-Domain-specific model (e.g., `"finance"`, `"medical"`). See [supported languages and domains](https://docs.speechmatics.com/speech-to-text/languages).
+> **Note:** If no config or preset is provided, the client will default to the `external` preset.
-**`output_locale`** (str, default: `None`)
-Output locale for formatting (e.g., `"en-GB"`, `"en-US"`). See [supported languages and locales](https://docs.speechmatics.com/speech-to-text/languages).
+### Configuration Serialization
+It can also be useful to export and import configuration as JSON:
-**`enable_diarization`** (bool, default: `False`)
-Enable speaker diarization to identify and label different speakers.
+```python
+from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig
-### Turn Detection Parameters
+# Export preset to JSON
+config_json = VoiceAgentConfigPreset.SCRIBE().to_json()
-**`end_of_utterance_mode`** (EndOfUtteranceMode, default: `FIXED`)
-Controls how turn endings are detected:
+# Load from JSON
+config = VoiceAgentConfig.from_json(config_json)
-- **`FIXED`** - Uses fixed silence threshold. Fast but may split slow speech.
-- **`ADAPTIVE`** - Adjusts delay based on speech rate, pauses, and disfluencies. Best for natural conversation.
-- **`SMART_TURN`** - Uses ML model to detect acoustic turn-taking cues. Requires `[smart]` extras.
-- **`EXTERNAL`** - Manual control via `client.finalize()`. For custom turn logic.
+# Or create from JSON string
+config = VoiceAgentConfig.from_json('{"language": "en", "enable_diarization": true}')
+```
-**`end_of_utterance_silence_trigger`** (float, default: `0.2`)
-Silence duration in seconds to trigger turn end.
+## Configuration
-**`end_of_utterance_max_delay`** (float, default: `10.0`)
-Maximum delay before forcing turn end.
+### Basic Parameters
-**`max_delay`** (float, default: `0.7`)
-Maximum transcription delay for word emission.
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `language` | str | `"en"` | Language code for transcription (e.g., `"en"`, `"es"`, `"fr"`). See [supported languages](https://docs.speechmatics.com/speech-to-text/languages). |
+| `operating_point` | OperatingPoint | `ENHANCED` | Balance accuracy vs latency. Options: `STANDARD` or `ENHANCED`. |
+| `domain` | str | `None` | Domain-specific model (e.g., `"finance"`, `"medical"`). See [supported languages and domains](https://docs.speechmatics.com/speech-to-text/languages). |
+| `output_locale` | str | `None` | Output locale for formatting (e.g., `"en-GB"`, `"en-US"`). See [supported languages and locales](https://docs.speechmatics.com/speech-to-text/languages). |
+| `max_delay` | float | `0.7` | Maximum transcription delay for word emission. |
-### Speaker Configuration
+### Turn Detection Parameters
-**`speaker_sensitivity`** (float, default: `0.5`)
-Diarization sensitivity between 0.0 and 1.0. Higher values detect more speakers.
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `end_of_utterance_mode` | EndOfUtteranceMode | `FIXED` | Controls how turn endings are detected. Options:<br>- `FIXED` - Uses fixed silence threshold. Fast but may split slow speech.<br>- `ADAPTIVE` - Adjusts delay based on speech rate, pauses, and disfluencies. Best for natural conversation.<br>- `EXTERNAL` - Manual control via `client.finalize()`. For custom turn logic. |
+| `end_of_utterance_silence_trigger` | float | `0.2` | Silence duration in seconds to trigger turn end (also used as the basis for the adaptive delay). |
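+
+#### Usage Examples
+
+For example, adaptive turn detection can be selected directly on the config (a sketch; the silence trigger value is illustrative):
+
+```python
+from speechmatics.voice import EndOfUtteranceMode, VoiceAgentConfig
+
+config = VoiceAgentConfig(
+    enable_diarization=True,
+    end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
+    end_of_utterance_silence_trigger=0.4,  # illustrative value in seconds
+)
+```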
-**`max_speakers`** (int, default: `None`)
-Limit maximum number of speakers to detect.
+### Speaker Configuration
-**`prefer_current_speaker`** (bool, default: `False`)
-Give extra weight to current speaker for word grouping.
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `enable_diarization` | bool | `False` | Enable speaker diarization to identify and label different speakers. |
+| `speaker_sensitivity` | float | `0.5` | Diarization sensitivity between 0.0 and 1.0. Higher values detect more speakers. |
+| `max_speakers` | int | `None` | Limit maximum number of speakers to detect. |
+| `prefer_current_speaker` | bool | `False` | Give extra weight to current speaker for word grouping. |
+| `speaker_config` | SpeakerFocusConfig | `SpeakerFocusConfig()` | Configure speaker focus/ignore rules. |
+| `known_speakers` | list[SpeakerIdentifier] | `[]` | Pre-enrolled speaker identifiers for speaker identification. |
-**`speaker_config`** (SpeakerFocusConfig, default: `SpeakerFocusConfig()`)
-Configure speaker focus/ignore rules.
+#### Usage Examples
+Using `speaker_config`, you can focus on only specific speakers but keep words from others, or ignore specific speakers.
```python
from speechmatics.voice import SpeakerFocusConfig, SpeakerFocusMode
-# Focus only on specific speakers
+# Focus only on specific speakers, but keep words from other speakers
config = VoiceAgentConfig(
enable_diarization=True,
speaker_config=SpeakerFocusConfig(
@@ -198,18 +277,17 @@ config = VoiceAgentConfig(
config = VoiceAgentConfig(
enable_diarization=True,
speaker_config=SpeakerFocusConfig(
- ignore_speakers=["S3"],
- focus_mode=SpeakerFocusMode.IGNORE
+ ignore_speakers=["S3"]
)
)
```
-**`known_speakers`** (list[SpeakerIdentifier], default: `[]`)
-Pre-enrolled speaker identifiers for speaker identification.
+Using `known_speakers`, you can use pre-enrolled speaker identifiers to identify specific speakers.
```python
from speechmatics.voice import SpeakerIdentifier
+# Use known speakers from previous session
config = VoiceAgentConfig(
enable_diarization=True,
known_speakers=[
@@ -221,8 +299,14 @@ config = VoiceAgentConfig(
### Language & Vocabulary
-**`additional_vocab`** (list[AdditionalVocabEntry], default: `[]`)
-Custom vocabulary for domain-specific terms.
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `additional_vocab` | list[AdditionalVocabEntry] | `[]` | Custom vocabulary for domain-specific terms. |
+| `punctuation_overrides` | dict | `None` | Custom punctuation rules. |
+
+#### Usage Examples
+
+Using `additional_vocab`, you can specify a dictionary of domain-specific terms.
```python
from speechmatics.voice import AdditionalVocabEntry
@@ -239,83 +323,68 @@ config = VoiceAgentConfig(
)
```
-**`punctuation_overrides`** (dict, default: `None`)
-Custom punctuation rules.
-
### Audio Parameters
-**`sample_rate`** (int, default: `16000`)
-Audio sample rate in Hz.
-
-**`audio_encoding`** (AudioEncoding, default: `PCM_S16LE`)
-Audio encoding format.
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `sample_rate` | int | `16000` | Audio sample rate in Hz. |
+| `audio_encoding` | AudioEncoding | `PCM_S16LE` | Audio encoding format. |
### Advanced Parameters
-**`transcription_update_preset`** (TranscriptionUpdatePreset, default: `COMPLETE`)
-Controls when to emit updates: `COMPLETE`, `COMPLETE_PLUS_TIMING`, `WORDS`, `WORDS_PLUS_TIMING`, or `TIMING`.
-
-**`speech_segment_config`** (SpeechSegmentConfig, default: `SpeechSegmentConfig()`)
-Fine-tune segment generation and post-processing.
-
-**`smart_turn_config`** (SmartTurnConfig, default: `None`)
-Configure SMART_TURN behavior (buffer length, threshold).
-
-**`include_results`** (bool, default: `False`)
-Include word-level timing data in segments.
-
-**`include_partials`** (bool, default: `True`)
-Emit partial segments. Set to `False` for final-only output.
-
-### Configuration with Overlays
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `transcription_update_preset` | TranscriptionUpdatePreset | `COMPLETE` | Controls when to emit updates: `COMPLETE`, `COMPLETE_PLUS_TIMING`, `WORDS`, `WORDS_PLUS_TIMING`, or `TIMING`. |
+| `speech_segment_config` | SpeechSegmentConfig | `SpeechSegmentConfig()` | Fine-tune segment generation and post-processing. |
+| `smart_turn_config` | SmartTurnConfig | `None` | Configure SMART_TURN behavior (buffer length, threshold). |
+| `include_results` | bool | `False` | Include word-level timing data in segments. |
+| `include_partials` | bool | `True` | Include interim (lower confidence) words in emitted segments. Set to `False` for final-only output. |
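+
+#### Usage Examples
+
+A note-taking style sketch that combines parameters from the table above (assuming `TranscriptionUpdatePreset` is importable from `speechmatics.voice`; it is defined in the SDK's models module):
+
+```python
+from speechmatics.voice import TranscriptionUpdatePreset
+
+# Emit on text or timing changes, include word-level data, final segments only
+config = VoiceAgentConfig(
+    language="en",
+    transcription_update_preset=TranscriptionUpdatePreset.COMPLETE_PLUS_TIMING,
+    include_results=True,
+    include_partials=False,
+)
+```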
-Use presets as a starting point and customize with overlays:
-
-```python
-from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig
-
-# Use preset with custom overrides
-config = VoiceAgentConfigPreset.SCRIBE(
- VoiceAgentConfig(
- language="es",
- max_delay=0.8
- )
-)
+## Event Messages
-# Available presets
-presets = VoiceAgentConfigPreset.list_presets()
-# ['low_latency', 'conversation_adaptive', 'conversation_smart_turn', 'scribe', 'captions']
-```
+The Voice SDK emits structured, real-time events (identified by `AgentServerMessageType`) as a session progresses.
-### Configuration Serialization
+These events fall into three main categories:
+1. **Core Events** - high-level session and transcription updates.
+2. **Speaker Events** - speaker activity detection and enrollment results.
+3. **Additional Events** - detailed, low-level events.
-Export and import configurations as JSON:
+To handle events, register a callback using the `@client.on()` decorator or the `client.on()` method.
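+
+Both registration styles behave the same way; a minimal sketch (the handler names are illustrative, and the method form is assumed to take the event type followed by the callback):
+
+```python
+from speechmatics.voice import AgentServerMessageType
+
+# Decorator style
+@client.on(AgentServerMessageType.ADD_SEGMENT)
+def on_segment(message):
+    print(message)
+
+# Method style
+def on_turn_end(message):
+    print(message)
+
+client.on(AgentServerMessageType.END_OF_TURN, on_turn_end)
+```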
-```python
-from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig
+> **Note:** The payloads shown below are the actual message payloads from the Voice SDK. When using the CLI example with `--output-file`, messages also include a `ts` timestamp field (e.g., `"ts": "2025-11-11 23:18:35.909"`), which is added by the CLI for logging purposes and is not part of the SDK payload.
-# Export preset to JSON
-config_json = VoiceAgentConfigPreset.SCRIBE().to_json()
+### High Level Overview
-# Load from JSON
-config = VoiceAgentConfig.from_json(config_json)
+#### Core Events
-# Or create from JSON string
-config = VoiceAgentConfig.from_json('{"language": "en", "enable_diarization": true}')
-```
+| Event | Description | Notes / Purpose |
+| ----------------------- | ----------------------------------------- | ------------------------------------------------------------ |
+| `RECOGNITION_STARTED` | Fired when a transcription session starts | Contains session ID, language pack info |
+| `ADD_PARTIAL_SEGMENT` | Emitted continuously during speech | Provides interim, real-time transcription text |
+| `ADD_SEGMENT` | Fired when a segment is finalized | Provides stable, final transcription text |
+| `END_OF_TURN` | Fired when a speaker’s turn ends | Depends on `end_of_utterance_mode`; useful for turn tracking |
-## Event Messages
+#### Speaker Events
+| Event | When it fires | Purpose |
+| --------------- | -------------------- | ------------------------------- |
+| `SPEAKER_STARTED` | Voice detected | Marks start of speech |
+| `SPEAKER_ENDED` | Silence detected | Marks end of speech |
+| `SPEAKERS_RESULT` | Enrollment completes | Provides speaker IDs and labels |
-The Voice SDK emits structured events via `AgentServerMessageType`. Register handlers using the `@client.on()` decorator or `client.on()` method.
+#### Additional Events
+| Event | When it fires | Purpose |
+| ---------------------- | ----------------------------- | ------------------------------------------- |
+| `START_OF_TURN` | New turn begins | Optional, low-level event for turn tracking |
+| `END_OF_TURN_PREDICTION` | Predicts turn completion      | Fires before `END_OF_TURN` in adaptive mode |
+| `END_OF_UTTERANCE` | Silence threshold reached | Low-level STT engine trigger |
+| `ADD_PARTIAL_TRANSCRIPT` | Word-level partial transcript | Legacy; use `ADD_PARTIAL_SEGMENT` instead   |
+| `ADD_TRANSCRIPT`         | Word-level final transcript   | Legacy; use `ADD_SEGMENT` instead           |
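+
+As an example of consuming a low-level event, a sketch of subscribing to `END_OF_TURN_PREDICTION` (field names follow the SDK's `TurnPredictionMessage` model; treat the exact payload shape as indicative):
+
+```python
+@client.on(AgentServerMessageType.END_OF_TURN_PREDICTION)
+def on_turn_prediction(message):
+    # metadata.ttl is the predicted delay (in seconds) before END_OF_TURN is emitted
+    meta = message.get("metadata", {})
+    print(f"End of turn expected in {meta.get('ttl')}s (reasons: {meta.get('reasons')})")
+```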
-> **Note:** The payloads shown below are the actual message payloads from the Voice SDK. When using the CLI example with `--output-file`, messages also include a `ts` timestamp field (e.g., `"ts": "2025-11-11 23:18:35.909"`), which is added by the CLI for logging purposes and is not part of the SDK payload.
-### Core Events
+### Core Events - Examples and Payloads
#### RECOGNITION_STARTED
-Emitted when transcription session starts. Contains session ID and language pack info.
-
```python
@client.on(AgentServerMessageType.RECOGNITION_STARTED)
def on_started(message):
@@ -343,8 +412,6 @@ def on_started(message):
#### ADD_PARTIAL_SEGMENT
-Emitted continuously as speech is being processed. Contains interim text that updates in real-time.
-
```python
@client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT)
def on_partial(message):
@@ -364,7 +431,6 @@ def on_partial(message):
"timestamp": "2025-11-11T23:18:37.189+00:00",
"language": "en",
"text": "Welcome to",
- "annotation": ["has_partial"],
"metadata": {
"start_time": 1.28,
"end_time": 1.6
@@ -384,7 +450,6 @@ def on_partial(message):
- `speaker_id` - Speaker label (e.g., `"S1"`, `"S2"`)
- `is_active` - `true` if speaker is in focus (based on `speaker_config`)
- `text` - Current partial transcription text
-- `annotation` - Status flags (see annotation section below)
- `metadata.start_time` - Segment start time (seconds since session start)
- `metadata.end_time` - Segment end time (seconds since session start)
@@ -392,8 +457,6 @@ Top-level `metadata` contains the same timing plus `processing_time`.
#### ADD_SEGMENT
-Emitted when a segment is finalized. Contains stable, final transcription text.
-
```python
@client.on(AgentServerMessageType.ADD_SEGMENT)
def on_segment(message):
@@ -416,13 +479,6 @@ def on_segment(message):
"timestamp": "2025-11-11T23:18:37.189+00:00",
"language": "en",
"text": "Welcome to Speechmatics.",
- "annotation": [
- "has_final",
- "starts_with_final",
- "ends_with_final",
- "ends_with_eos",
- "ends_with_punctuation"
- ],
"metadata": {
"start_time": 1.28,
"end_time": 8.04
@@ -437,21 +493,8 @@ def on_segment(message):
}
```
-**Annotation Flags:**
-
-- `has_final` - Contains finalized words
-- `has_partial` - Contains partial (interim) words
-- `starts_with_final` - First word is finalized
-- `ends_with_final` - Last word is finalized
-- `ends_with_eos` - Ends with end-of-sentence
-- `ends_with_punctuation` - Ends with punctuation
-- `fast_speaker` - Speaker is speaking quickly (may appear in some segments)
-- `has_disfluency` - Contains disfluencies like "um", "er" (may appear in some segments)
-
#### END_OF_TURN
-Emitted when a speaker's turn is complete. Timing depends on `end_of_utterance_mode`.
-
```python
@client.on(AgentServerMessageType.END_OF_TURN)
def on_turn_end(message):
@@ -472,12 +515,10 @@ def on_turn_end(message):
}
```
-### Speaker Events
+### Speaker Events - Examples and Payloads
#### SPEAKER_STARTED
-Emitted when a speaker starts speaking (voice activity detected).
-
```python
@client.on(AgentServerMessageType.SPEAKER_STARTED)
def on_speaker_start(message):
@@ -499,8 +540,6 @@ def on_speaker_start(message):
#### SPEAKER_ENDED
-Emitted when a speaker stops speaking (silence detected).
-
```python
@client.on(AgentServerMessageType.SPEAKER_ENDED)
def on_speaker_end(message):
@@ -522,27 +561,19 @@ def on_speaker_end(message):
#### SPEAKERS_RESULT
-Emitted when speaker enrollment completes.
-
```python
-# Request speaker IDs at end of session
-await client.send_message({"message": "GetSpeakers", "final": True})
-
+# Listen for the result
@client.on(AgentServerMessageType.SPEAKERS_RESULT)
def on_speakers(message):
for speaker in message["speakers"]:
print(f"Speaker {speaker['label']}: {speaker['speaker_identifiers']}")
-```
-
-### Additional Events
-
-**`START_OF_TURN`** - Emitted at the beginning of a new turn.
-**`END_OF_TURN_PREDICTION`** - Emitted during `ADAPTIVE` or `SMART_TURN` mode to predict turn completion (fires before `END_OF_TURN`).
-
-**`END_OF_UTTERANCE`** - Low-level STT engine event (fires when silence threshold is reached).
+# Request speaker IDs at end of session
+await client.send_message({"message": AgentClientMessageType.GET_SPEAKERS, "final": True})
-**`ADD_PARTIAL_TRANSCRIPT` / `ADD_TRANSCRIPT`** - Legacy word-level events from underlying Real-Time API (not typically needed with Voice SDK).
+# Request speaker IDs now
+await client.send_message({"message": AgentClientMessageType.GET_SPEAKERS})
+```
## Common Usage Patterns
@@ -675,7 +706,7 @@ See the `examples/voice/` directory for complete working examples:
- **`scribe/`** - Note-taking with custom vocabulary
- **`cli/`** - Full-featured CLI with all options
-## API Reference
+## SDK Class Reference
### VoiceAgentClient
@@ -698,7 +729,7 @@ class VoiceAgentClient:
url: Custom WebSocket URL (defaults to SPEECHMATICS_RT_URL env var)
app: Optional application name for endpoint URL
config: Voice Agent configuration (optional)
- preset: Preset name ("scribe", "low_latency", etc.) (optional)
+ preset: Preset name ("scribe", "fast", etc.) (optional)
"""
async def connect(self) -> None:
@@ -770,13 +801,13 @@ class VoiceAgentClient:
## Requirements
- Python 3.9+
-- Speechmatics API key ([Get one here](https://portal.speechmatics.com/))
+- Speechmatics API key (get one from the [Speechmatics Portal](https://portal.speechmatics.com/))
## Documentation
-- [Speechmatics Documentation](https://docs.speechmatics.com/)
+- [Speechmatics Documentation Homepage](https://docs.speechmatics.com/)
- [Real-Time Quickstart](https://docs.speechmatics.com/speech-to-text/realtime/quickstart)
-- [Authentication](https://docs.speechmatics.com/get-started/authentication)
+- [Getting Started with Authentication](https://docs.speechmatics.com/get-started/authentication)
## License
diff --git a/sdk/voice/pyproject.toml b/sdk/voice/pyproject.toml
index 239d3958..a339b221 100644
--- a/sdk/voice/pyproject.toml
+++ b/sdk/voice/pyproject.toml
@@ -11,7 +11,7 @@ authors = [{ name = "Speechmatics", email = "support@speechmatics.com" }]
license = "MIT"
requires-python = ">=3.9"
dependencies = [
- "speechmatics-rt>=0.5.1",
+ "speechmatics-rt>=0.5.2",
"pydantic>=2.10.6,<3",
"numpy>=1.26.4,<3"
]
diff --git a/sdk/voice/speechmatics/voice/__init__.py b/sdk/voice/speechmatics/voice/__init__.py
index 6cb66fd6..21f517c0 100644
--- a/sdk/voice/speechmatics/voice/__init__.py
+++ b/sdk/voice/speechmatics/voice/__init__.py
@@ -26,18 +26,23 @@
from ._models import EndOfTurnConfig
from ._models import EndOfTurnPenaltyItem
from ._models import EndOfUtteranceMode
+from ._models import MaxDelayMode
from ._models import SegmentMessage
from ._models import SessionMetricsMessage
from ._models import SmartTurnConfig
from ._models import SpeakerFocusConfig
from ._models import SpeakerFocusMode
from ._models import SpeakerMetricsMessage
+from ._models import SpeakerStatusMessage
from ._models import SpeechSegmentConfig
from ._models import TurnPredictionMessage
from ._models import TurnStartEndResetMessage
from ._models import VADStatusMessage
+from ._models import VoiceActivityConfig
from ._models import VoiceAgentConfig
from ._presets import VoiceAgentConfigPreset
+from ._smart_turn import SmartTurnDetector
+from ._vad import SileroVAD
__all__ = [
"__version__",
@@ -50,6 +55,7 @@
"EndOfTurnConfig",
"EndOfTurnPenaltyItem",
"EndOfUtteranceMode",
+ "MaxDelayMode",
"OperatingPoint",
"SpeakerDiarizationConfig",
"SpeakerFocusConfig",
@@ -57,8 +63,12 @@
"SpeakerIdentifier",
"SmartTurnConfig",
"SpeechSegmentConfig",
+ "VoiceActivityConfig",
"VoiceAgentConfig",
"VoiceAgentConfigPreset",
+ # Models
+ "SmartTurnDetector",
+ "SileroVAD",
# Client messages
"AgentClientMessageType",
# Server messages
@@ -66,6 +76,7 @@
"SegmentMessage",
"SessionMetricsMessage",
"SpeakerMetricsMessage",
+ "SpeakerStatusMessage",
"TurnPredictionMessage",
"TurnStartEndResetMessage",
"VADStatusMessage",
diff --git a/sdk/voice/speechmatics/voice/_audio.py b/sdk/voice/speechmatics/voice/_audio.py
index 486c18e8..6653db9f 100644
--- a/sdk/voice/speechmatics/voice/_audio.py
+++ b/sdk/voice/speechmatics/voice/_audio.py
@@ -42,6 +42,7 @@ def __init__(self, sample_rate: int, frame_size: int, sample_width: int = 2, tot
self._sample_width: int = sample_width
self._frame_size: int = frame_size
self._frame_bytes: int = frame_size * sample_width
+ self._frame_duration: float = round(frame_size / sample_rate, 3)
# Queue
self._frames: list[bytes] = []
@@ -63,7 +64,7 @@ def _get_time_from_frame(self, frame_index: int) -> float:
Returns:
The time in seconds.
"""
- return frame_index / (self._sample_rate / self._frame_size)
+ return frame_index * self._frame_duration
def _get_frame_from_time(self, time: float) -> int:
"""Get the frame index from a time.
@@ -77,7 +78,7 @@ def _get_frame_from_time(self, time: float) -> int:
Returns:
The frame index.
"""
- return int(time * (self._sample_rate / self._frame_size) + 1e-9)
+ return int(time / self._frame_duration + 1e-9)  # epsilon guards against float truncation (e.g. int(0.3 / 0.1) == 2)
async def put_bytes(self, data: bytes) -> None:
"""Add data to the buffer.
@@ -230,7 +231,7 @@ def total_frames(self) -> int:
@property
def total_time(self) -> float:
"""Get the total time added to the buffer."""
- return self._get_time_from_frame(self._total_frames)
+ return self._total_frames * self._frame_duration
@property
def size(self) -> int:
diff --git a/sdk/voice/speechmatics/voice/_client.py b/sdk/voice/speechmatics/voice/_client.py
index 32770907..978d1306 100644
--- a/sdk/voice/speechmatics/voice/_client.py
+++ b/sdk/voice/speechmatics/voice/_client.py
@@ -49,6 +49,7 @@
from ._models import SpeakerMetricsMessage
from ._models import SpeakerSegment
from ._models import SpeakerSegmentView
+from ._models import SpeakerStatusMessage
from ._models import SpeechFragment
from ._models import TranscriptionUpdatePreset
from ._models import TurnPredictionMessage
@@ -62,6 +63,9 @@
from ._smart_turn import SmartTurnPredictionResult
from ._turn import TurnTaskProcessor
from ._utils import FragmentUtils
+from ._vad import SILERO_INSTALL_HINT
+from ._vad import SileroVAD
+from ._vad import SileroVADResult
class VoiceAgentClient(AsyncClient):
@@ -114,12 +118,12 @@ def __init__(
>>> client = VoiceAgentClient(
... api_key="your_api_key",
... url="wss://custom.endpoint.com/v2",
- ... preset="conversation_adaptive"
+ ... preset="adaptive"
... )
Using a preset (utility class):
>>> from speechmatics.voice import VoiceAgentClient, VoiceAgentConfigPreset
- >>> config=VoiceAgentConfigPreset.CONVERSATION_ADAPTIVE()
+ >>> config=VoiceAgentConfigPreset.ADAPTIVE()
>>> client = VoiceAgentClient(
... api_key="your_api_key",
... url="wss://custom.endpoint.com/v2",
@@ -182,7 +186,7 @@ def __init__(
config=self._config,
session_id="NOT_SET",
base_time=datetime.datetime.now(datetime.timezone.utc),
- language_pack_info=LanguagePackInfo.model_validate({}),
+ language_pack_info=LanguagePackInfo.from_dict({}),
)
# -------------------------------------
@@ -238,49 +242,89 @@ def __init__(
self._current_view: Optional[SpeakerSegmentView] = None
self._previous_view: Optional[SpeakerSegmentView] = None
+ # -------------------------------------
+ # VAD
+ # -------------------------------------
+
+ # Handlers
+ self._uses_silero_vad: bool = False
+ self._silero_detector: Optional[SileroVAD] = None
+
+ # Silero VAD detector
+ if self._config.vad_config and self._config.vad_config.enabled:
+ if not SileroVAD.dependencies_available():
+ self._logger.warning(SILERO_INSTALL_HINT)
+ else:
+ silero_detector = SileroVAD(
+ silence_duration=self._config.vad_config.silence_duration,
+ threshold=self._config.vad_config.threshold,
+ auto_init=True,
+ on_state_change=self._handle_silero_vad_result,
+ )
+ if silero_detector.model_exists():
+ self._silero_detector = silero_detector
+ self._uses_silero_vad = True
+ if not self._uses_silero_vad:
+ self._logger.warning("Silero model not available and VAD will be disabled.")
+
# -------------------------------------
# EOU / EOT
# -------------------------------------
# Handlers
- self._turn_handler: TurnTaskProcessor = TurnTaskProcessor(name="turn_handler", done_callback=self.finalize)
+ self._uses_smart_turn: bool = False
self._smart_turn_detector: Optional[SmartTurnDetector] = None
- self._eot_calculation_task: Optional[asyncio.Task] = None
# Current turn
self._turn_start_time: Optional[float] = None
self._turn_active: bool = False
+ # Smart turn cutoff time - filters late transcripts during finalization
+ self._smart_turn_pending_cutoff: Optional[float] = None
+
# Start turn detector if SMART_TURN requested
- if self._config.end_of_utterance_mode == EndOfUtteranceMode.SMART_TURN:
- eou_mode_ok: bool = False
+ if self._config.smart_turn_config and self._config.smart_turn_config.enabled:
if not SmartTurnDetector.dependencies_available():
self._logger.warning(SMART_TURN_INSTALL_HINT)
else:
- detector = SmartTurnDetector(
- auto_init=True,
- threshold=self._config.smart_turn_config.smart_turn_threshold,
+ smart_turn_detector = SmartTurnDetector(
+ auto_init=True, threshold=self._config.smart_turn_config.smart_turn_threshold
)
- if detector.model_exists():
- self._smart_turn_detector = detector
- self._config.smart_turn_config.audio_buffer_length = 10.0
- eou_mode_ok = True
- if not eou_mode_ok:
+ if smart_turn_detector.model_exists():
+ self._smart_turn_detector = smart_turn_detector
+ self._uses_smart_turn = True
+ if not self._uses_smart_turn:
self._logger.warning("Smart Turn model not available. Falling back to ADAPTIVE.")
self._config.end_of_utterance_mode = EndOfUtteranceMode.ADAPTIVE
+ # -------------------------------------
+ # Turn / End of Utterance Handling
+ # -------------------------------------
+
# EOU mode
self._eou_mode: EndOfUtteranceMode = self._config.end_of_utterance_mode
- # Uses fixed EndOfUtterance message
- self._uses_fixed_eou: bool = self._eou_mode == EndOfUtteranceMode.FIXED
+ # Handlers
+ self._turn_handler: TurnTaskProcessor = TurnTaskProcessor(name="turn_handler", done_callback=self.finalize)
+ self._eot_calculation_task: Optional[asyncio.Task] = None
+
+ # Uses fixed EndOfUtterance message from STT
+ self._uses_fixed_eou: bool = (
+ self._eou_mode == EndOfUtteranceMode.FIXED
+ and not self._silero_detector
+ and not self._config.end_of_turn_config.use_forced_eou
+ )
# Uses ForceEndOfUtterance message
- self._uses_forced_eou: bool = self._eou_mode in [
- EndOfUtteranceMode.ADAPTIVE,
- EndOfUtteranceMode.SMART_TURN,
- ]
+ self._uses_forced_eou: bool = not self._uses_fixed_eou
self._forced_eou_active: bool = False
+ self._last_forced_eou_latency: float = 0.0
+
+ # Emit EOT prediction (uses _uses_forced_eou)
+ self._uses_eot_prediction: bool = self._eou_mode not in [
+ EndOfUtteranceMode.FIXED,
+ EndOfUtteranceMode.EXTERNAL,
+ ]
# -------------------------------------
# Diarization / Speakers
@@ -291,6 +335,9 @@ def __init__(
self._current_speaker: Optional[str] = None
self._dz_enabled: bool = self._config.enable_diarization
self._dz_config = self._config.speaker_config
+ self._last_speak_start_time: Optional[float] = None
+ self._last_speak_end_time: Optional[float] = None
+ self._last_speak_end_latency: float = 0
# -------------------------------------
# Metrics
@@ -310,12 +357,17 @@ def __init__(
AudioEncoding.PCM_S16LE: 2,
}.get(self._audio_format.encoding, 1)
+ # Default audio buffer
+ if not self._config.audio_buffer_length and (self._uses_smart_turn or self._uses_silero_vad):
+ self._config.audio_buffer_length = 15.0
+
# Audio buffer
- if self._config.smart_turn_config.audio_buffer_length > 0:
+ if self._config.audio_buffer_length > 0:
self._audio_buffer: AudioBuffer = AudioBuffer(
sample_rate=self._audio_format.sample_rate,
frame_size=self._audio_format.chunk_size,
- total_seconds=self._config.smart_turn_config.audio_buffer_length,
+ sample_width=self._audio_sample_width,
+ total_seconds=self._config.audio_buffer_length,
)
# Register handlers
@@ -348,6 +400,7 @@ def _prepare_config(
operating_point=config.operating_point,
diarization="speaker" if config.enable_diarization else None,
enable_partials=True,
+ enable_entities=config.enable_entities,
max_delay=config.max_delay,
max_delay_mode="fixed",
audio_filtering_config={
@@ -390,8 +443,10 @@ def _prepare_config(
speakers=dz_speakers or None,
)
- # End of Utterance (for fixed)
- if config.end_of_utterance_silence_trigger and config.end_of_utterance_mode == EndOfUtteranceMode.FIXED:
+ # Fixed end of Utterance
+ if bool(
+ config.end_of_utterance_mode == EndOfUtteranceMode.FIXED and not config.end_of_turn_config.use_forced_eou
+ ):
transcription_config.conversation_config = ConversationConfig(
end_of_utterance_silence_trigger=config.end_of_utterance_silence_trigger,
)
@@ -404,7 +459,7 @@ def _prepare_config(
audio_format = AudioFormat(
encoding=config.audio_encoding,
sample_rate=config.sample_rate,
- chunk_size=320,
+ chunk_size=config.chunk_size,
)
# Return the config objects
@@ -602,8 +657,12 @@ async def send_audio(self, payload: bytes) -> None:
await self.disconnect()
return
+ # Process with Silero VAD
+ if self._silero_detector:
+ asyncio.create_task(self._silero_detector.process_audio(payload))
+
# Add to audio buffer (use put_bytes to handle variable chunk sizes)
- if self._config.smart_turn_config.audio_buffer_length > 0:
+ if self._config.audio_buffer_length > 0:
await self._audio_buffer.put_bytes(payload)
# Calculate the time (in seconds) for the payload
@@ -645,6 +704,12 @@ def update_diarization_config(self, config: SpeakerFocusConfig) -> None:
... SpeakerFocusConfig(focus_speakers=["main_speaker"])
... )
"""
+
+ # Only allow updates if diarization is enabled
+ if not self._config.enable_diarization:
+ raise ValueError("Diarization is not enabled")
+
+ # Update the diarization config
self._dz_config = config
# ============================================================================
@@ -661,6 +726,9 @@ def finalize(self, end_of_turn: bool = False) -> None:
end_of_turn: Whether to emit an end of turn message.
"""
+ # Clear smart turn cutoff
+ self._smart_turn_pending_cutoff = None
+
# Current turn
_turn_id = self._turn_handler.handler_id
@@ -669,11 +737,7 @@ async def emit() -> None:
"""Wait for EndOfUtterance if needed, then emit segments."""
# Forced end of utterance message (only when no speaker is detected)
- if (
- self._config.use_forced_eou_message
- and self._current_view
- and (self._eou_mode == EndOfUtteranceMode.EXTERNAL or not self._is_speaking)
- ) and not (self._current_view.fragments[-1].is_eos and self._current_view.fragments[-1].is_final):
+ if self._config.end_of_turn_config.use_forced_eou:
await self._await_forced_eou()
# Check if the turn has changed
@@ -683,8 +747,9 @@ async def emit() -> None:
# Emit the segments
self._stt_message_queue.put_nowait(lambda: self._emit_segments(finalize=True))
- # Call async task
- asyncio.create_task(emit())
+ # Call async task (only if not already waiting for forced EOU)
+ if not (self._config.end_of_turn_config.use_forced_eou and self._forced_eou_active):
+ asyncio.create_task(emit())
# ============================================================================
# EVENT REGISTRATION & HANDLERS
@@ -705,7 +770,7 @@ def _evt_on_recognition_started(message: dict[str, Any]) -> None:
config=self._config,
session_id=message.get("id", "UNKNOWN"),
base_time=datetime.datetime.now(datetime.timezone.utc),
- language_pack_info=LanguagePackInfo.model_validate(message.get("language_pack_info", {})),
+ language_pack_info=LanguagePackInfo.from_dict(message.get("language_pack_info", {})),
)
# Partial transcript event
@@ -746,13 +811,13 @@ def _emit_message(self, message: BaseMessage) -> None:
"""
# Forward to the emit() method
- self.emit(message.message, message.model_dump())
+ self.emit(message.message, message.to_dict())
- def _emit_info_message(self, message: Union[str, dict[str, Any]]) -> None:
- """Emit an info message to the client."""
+ def _emit_diagnostic_message(self, message: Union[str, dict[str, Any]]) -> None:
+ """Emit a diagnostic message to the client."""
if isinstance(message, str):
message = {"msg": message}
- self.emit(AgentServerMessageType.INFO, {"message": AgentServerMessageType.INFO.value, **message})
+ self.emit(AgentServerMessageType.DIAGNOSTICS, {"message": AgentServerMessageType.DIAGNOSTICS.value, **message})
# ============================================================================
# QUEUE PROCESSING
@@ -1140,17 +1205,28 @@ async def _process_speech_fragments(self, change_filter: Optional[list[Annotatio
if change_filter and not changes.any(*change_filter):
return
+ # Skip re-evaluation if transcripts are older than smart turn cutoff
+ if self._smart_turn_pending_cutoff is not None and self._current_view:
+ latest_end_time = max(
+ (f.end_time for f in self._current_view.fragments if f.end_time is not None), default=0.0
+ )
+
+ # If all fragments end before or at the cutoff, skip re-evaluation
+ if latest_end_time <= self._smart_turn_pending_cutoff:
+ return
+
# Turn prediction
- if self._uses_forced_eou:
+ if self._uses_eot_prediction and self._uses_forced_eou and not self._forced_eou_active:
async def fn() -> None:
ttl = await self._calculate_finalize_delay()
- if ttl:
+ if ttl is not None:
self._turn_handler.update_timer(ttl)
- self._run_background_eot_calculation(fn)
+ self._run_background_eot_calculation(fn, "speech_fragments")
# Check for gaps
+ # TODO - implement gap-filling
# FragmentUtils.find_segment_pauses(self._client_session, self._current_view)
# Emit the segments
@@ -1363,7 +1439,7 @@ async def _emit_end_of_turn(self) -> None:
# TURN DETECTION & FINALIZATION
# ============================================================================
- def _run_background_eot_calculation(self, fn: Callable) -> None:
+ def _run_background_eot_calculation(self, fn: Callable, source: Optional[str] = None) -> None:
"""Run the calculation async."""
# Existing task takes precedence
@@ -1373,9 +1449,32 @@ def _run_background_eot_calculation(self, fn: Callable) -> None:
# Create new task
self._eot_calculation_task = asyncio.create_task(fn())
+ async def _calculate_fixed_finalize_delay(self) -> Optional[float]:
+ """Will return the end of utterance delay as a default."""
+
+ # Delay defined in config
+ delay = self._config.end_of_utterance_silence_trigger
+
+ # Adjust to compensate for known latencies
+ delay = delay - self._last_forced_eou_latency - self._last_speak_end_latency
+
+ # Emit prediction message
+ self._emit_message(
+ TurnPredictionMessage(
+ turn_id=self._turn_handler.handler_id,
+ metadata=TurnPredictionMetadata(
+ ttl=delay,
+ reasons=["fixed_eou"],
+ ),
+ ),
+ )
+
+ # Return the delay
+ return delay
+
async def _calculate_finalize_delay(
self,
- smart_turn_prediction: Optional[SmartTurnPredictionResult] = None,
+ annotation: Optional[AnnotationResult] = None,
) -> Optional[float]:
"""Calculate the delay before finalizing / end of turn.
@@ -1384,7 +1483,7 @@ async def _calculate_finalize_delay(
and smart turn predictions to calculate appropriate delay.
Args:
- smart_turn_prediction: The smart turn prediction result to use for evaluation.
+ annotations: The annotations to include for evaluation.
Returns:
Optional[float]: The delay before finalizing / end of turn.
@@ -1401,41 +1500,55 @@ async def _calculate_finalize_delay(
if not view:
return None
+ # If FIXED EOU mode, use the fixed EOU delay
+ if self._eou_mode == EndOfUtteranceMode.FIXED:
+ return await self._calculate_fixed_finalize_delay()
+
# Get last active segment
last_active_segment_index = view.last_active_segment_index
last_active_segment = view.segments[last_active_segment_index] if last_active_segment_index > -1 else None
# Track penalty multipliers and reasons
reasons: list[tuple[float, str]] = []
+ annotation = annotation or AnnotationResult()
- # Apply penalties based on last active segment annotations
+ # VAD enabled
+ if self._silero_detector:
+ annotation.add(AnnotationFlags.VAD_ACTIVE)
+ else:
+ annotation.add(AnnotationFlags.VAD_INACTIVE)
+
+ # Smart Turn enabled
+ if self._smart_turn_detector:
+ annotation.add(AnnotationFlags.SMART_TURN_ACTIVE)
+ else:
+ annotation.add(AnnotationFlags.SMART_TURN_INACTIVE)
+
+ # Result to validate against
if last_active_segment:
+ annotation.add(*[AnnotationFlags(flag) for flag in last_active_segment.annotation])
+
+ # Apply penalties based on last active segment annotations
+ if len(annotation) > 0:
for p in self._config.end_of_turn_config.penalties:
description = "__".join(p.annotation)
- has_annotation = last_active_segment.annotation.has(*p.annotation)
-
+ has_annotation = annotation.has(*p.annotation)
if (not p.is_not and has_annotation) or (p.is_not and not has_annotation):
reason = f"not__{description}" if p.is_not else description
reasons.append((p.penalty, reason))
- # Apply smart turn prediction penalty
- if smart_turn_prediction:
- if smart_turn_prediction.prediction:
- reasons.append((self._config.smart_turn_config.positive_penalty, "smart_turn_true"))
- else:
- reasons.append((self._config.smart_turn_config.negative_penalty, "smart_turn_false"))
-
# Calculate final multiplier (compound multiplication)
- multiplier = (
- self._config.end_of_turn_config.base_multiplier
- * self._config.end_of_turn_config.end_of_turn_adjustment_factor
- )
+ multiplier = self._config.end_of_turn_config.base_multiplier
for penalty, _ in reasons:
multiplier *= penalty
# Calculate delay with minimum of 25ms
delay = round(self._config.end_of_utterance_silence_trigger * multiplier, 3)
+ # Trim off the most recent forced EOU delay if we're in forced EOU mode
+ if self._uses_forced_eou:
+ delay -= self._last_forced_eou_latency
+
# Clamp to max delay and adjust for TTFB
clamped_delay = min(delay, self._config.end_of_utterance_max_delay)
finalize_delay = max(clamped_delay - self._last_ttfb, self._config.end_of_turn_config.min_end_of_turn_delay)
@@ -1451,24 +1564,37 @@ async def _calculate_finalize_delay(
),
)
+ # Return the calculated delay
return finalize_delay
- async def _eot_prediction(self, end_time: Optional[float] = None) -> float:
+ async def _eot_prediction(
+ self,
+ end_time: Optional[float] = None,
+ speaker: Optional[str] = None,
+ annotation: Optional[AnnotationResult] = None,
+ ) -> float:
"""Handle end of turn prediction."""
+ # Initialize the annotation
+ annotation = annotation or AnnotationResult()
+
# Wait for Smart Turn result
- if self._eou_mode == EndOfUtteranceMode.SMART_TURN and end_time is not None:
- result = await self._smart_turn_prediction(end_time, self._config.language)
- else:
- result = None
+ if self._smart_turn_detector and end_time is not None:
+ result = await self._smart_turn_prediction(end_time, self._config.language, speaker=speaker)
+ if result.prediction:
+ annotation.add(AnnotationFlags.SMART_TURN_TRUE)
+ else:
+ annotation.add(AnnotationFlags.SMART_TURN_FALSE)
# Create a new task to evaluate the finalize delay
- delay = await self._calculate_finalize_delay(smart_turn_prediction=result)
+ delay = await self._calculate_finalize_delay(annotation=annotation)
# Return the result
- return delay or 0.005
+ return max(delay or 0, self._config.end_of_turn_config.min_end_of_turn_delay)
- async def _smart_turn_prediction(self, end_time: float, language: str) -> SmartTurnPredictionResult:
+ async def _smart_turn_prediction(
+ self, end_time: float, language: str, start_time: float = 0.0, speaker: Optional[str] = None
+ ) -> SmartTurnPredictionResult:
"""Predict when to emit the end of turn.
This will give an acoustic prediction of when the turn has completed using
@@ -1483,14 +1609,28 @@ async def _smart_turn_prediction(self, end_time: float, language: str) -> SmartT
"""
# Check we have smart turn enabled
- if not self._smart_turn_detector:
+ if not self._smart_turn_detector or not self._config.smart_turn_config:
return SmartTurnPredictionResult(error="Smart turn is not enabled")
+ # Calculate the times
+ start_time = max(start_time, end_time - self._config.smart_turn_config.max_audio_length)
+ total_time = self._total_time
+
+ # Find the start / end times for the current speaker for this turn ...
+ if self._current_view:
+ """Extract the audio for this speaker only."""
+
+ # Filter segments that match the current speaker
+ speaker_segments: list[SpeakerSegment] = [
+ seg for seg in self._current_view.segments if seg.speaker_id == speaker
+ ]
+
+ # Get the LAST segment
+ if speaker_segments:
+ start_time = speaker_segments[-1].start_time
+
# Get audio slice (add small margin of 100ms to the end of the audio)
- segment_audio = await self._audio_buffer.get_frames(
- start_time=end_time - self._config.smart_turn_config.audio_buffer_length,
- end_time=end_time + self._config.smart_turn_config.slice_margin,
- )
+ segment_audio = await self._audio_buffer.get_frames(start_time=start_time, end_time=end_time)
# Evaluate
prediction = await self._smart_turn_detector.predict(
@@ -1500,10 +1640,29 @@ async def _smart_turn_prediction(self, end_time: float, language: str) -> SmartT
sample_width=self._audio_sample_width,
)
+ # Metadata
+ metadata = {
+ "start_time": round(start_time, 3),
+ "end_time": round(end_time, 3),
+ "language": language,
+ "speaker_id": speaker,
+ "total_time": round(total_time, 3),
+ }
+
+ # Emit smart turn info
+ self.emit(
+ AgentServerMessageType.SMART_TURN_RESULT,
+ {
+ "message": AgentServerMessageType.SMART_TURN_RESULT.value,
+ "prediction": prediction.to_dict(),
+ "metadata": metadata,
+ },
+ )
+
# Return the prediction
return prediction
- async def _await_forced_eou(self, timeout: float = 2.0) -> None:
+ async def _await_forced_eou(self, timeout: float = 1.0) -> None:
"""Await the forced end of utterance."""
# Received EOU
@@ -1513,13 +1672,22 @@ async def _await_forced_eou(self, timeout: float = 2.0) -> None:
self.once(AgentServerMessageType.END_OF_UTTERANCE, lambda message: eou_received.set())
# Trigger EOU message
- self._emit_info_message("ForceEndOfUtterance sent")
- await self.force_end_of_utterance()
+ self._emit_diagnostic_message("ForceEndOfUtterance sent - waiting for EndOfUtterance")
# Wait for EOU
try:
+ # Track the start time
+ start_time = time.time()
self._forced_eou_active = True
+
+ # Send the force EOU and wait for the response
+ await self.force_end_of_utterance()
await asyncio.wait_for(eou_received.wait(), timeout=timeout)
+
+ # Record the latency
+ self._last_forced_eou_latency = time.time() - start_time
+ self._emit_diagnostic_message(f"EndOfUtterance received after {self._last_forced_eou_latency:.3f}s")
+
except asyncio.TimeoutError:
pass
finally:
@@ -1549,29 +1717,11 @@ async def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
for frag in fragments
if frag.speaker in self._dz_config.focus_speakers and frag.type_ == "word" and not frag.is_final
]
- pre_partials = [
- frag
- for frag in self._speech_fragments
- if frag.speaker in self._dz_config.focus_speakers and frag.type_ == "word" and not frag.is_final
- ]
else:
new_partials = [frag for frag in fragments if frag.type_ == "word" and not frag.is_final]
- pre_partials = [frag for frag in self._speech_fragments if frag.type_ == "word" and not frag.is_final]
-
- # Check if last new partial matches the last pre partial
- if len(pre_partials) > 0 and len(new_partials) > 0:
- has_valid_partial = not all(
- [
- pre_partials[-1].speaker == new_partials[-1].speaker,
- pre_partials[-1].start_time == new_partials[-1].start_time,
- pre_partials[-1].end_time == new_partials[-1].end_time,
- pre_partials[-1].content == new_partials[-1].content,
- ]
- )
- # Evaluate if any valid partial words exist
- else:
- has_valid_partial = len(new_partials) > 0
+ # Check if we have new partials
+ has_valid_partial = len(new_partials) > 0
# Current states
current_is_speaking = self._is_speaking
@@ -1602,7 +1752,7 @@ async def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
# Check if speaker is different to the current speaker
if current_is_speaking and speaker_changed:
self._emit_message(
- VADStatusMessage(
+ SpeakerStatusMessage(
message=AgentServerMessageType.SPEAKER_ENDED,
speaker_id=current_speaker,
is_active=False,
@@ -1610,13 +1760,14 @@ async def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
),
)
self._emit_message(
- VADStatusMessage(
+ SpeakerStatusMessage(
message=AgentServerMessageType.SPEAKER_STARTED,
speaker_id=latest_speaker,
is_active=True,
time=speaker_end_time,
),
)
+ self._last_speak_start_time = speaker_end_time
# Update current speaker
self._current_speaker = latest_speaker
@@ -1641,9 +1792,63 @@ async def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
else:
await self._handle_speaker_stopped(latest_speaker, speaker_end_time)
+ def _handle_silero_vad_result(self, result: SileroVADResult) -> None:
+ """Handle VAD state change events.
+
+ Args:
+ result: VAD result containing state change information.
+ """
+
+ # Time of event
+ event_time = self._total_time
+
+ # Create the message
+ message = VADStatusMessage(
+ is_speech=result.is_speech,
+ probability=result.probability,
+ transition_duration_ms=result.transition_duration_ms,
+ metadata=MessageTimeMetadata(
+ start_time=round(max(0, event_time - 8), 4),
+ end_time=round(event_time, 4),
+ ),
+ )
+
+ # Emit VAD status message
+ self._emit_message(message)
+
+ # Create the annotation
+ annotation = AnnotationResult()
+
+ # VAD annotation
+ if result.speech_ended:
+ annotation.add(AnnotationFlags.VAD_STOPPED)
+ else:
+ annotation.add(AnnotationFlags.VAD_STARTED)
+
+ # If speech has ended, we need to predict the end of turn
+ if result.speech_ended and self._uses_eot_prediction:
+ """VAD-based end of turn prediction."""
+
+ # Set cutoff to prevent late transcripts from cancelling finalization
+ self._smart_turn_pending_cutoff = event_time
+
+ async def fn() -> None:
+ ttl = await self._eot_prediction(
+ end_time=event_time, speaker=self._current_speaker, annotation=annotation
+ )
+ self._turn_handler.update_timer(ttl)
+
+ self._run_background_eot_calculation(fn, "silero_vad")
+
async def _handle_speaker_started(self, speaker: Optional[str], event_time: float) -> None:
"""Reset timers when a new speaker starts speaking after silence."""
+ # Clear smart turn cutoff for new speech
+ self._smart_turn_pending_cutoff = None
+
+ # Update last speak start time
+ self._last_speak_start_time = event_time
+
# Emit start of turn (not when using EXTERNAL)
if self._is_speaking and not self._turn_active:
await self._emit_start_of_turn(event_time)
@@ -1654,7 +1859,7 @@ async def _handle_speaker_started(self, speaker: Optional[str], event_time: floa
# Emit the event
self._emit_message(
- VADStatusMessage(
+ SpeakerStatusMessage(
message=AgentServerMessageType.SPEAKER_STARTED,
speaker_id=speaker,
is_active=True,
@@ -1668,18 +1873,22 @@ async def _handle_speaker_started(self, speaker: Optional[str], event_time: floa
async def _handle_speaker_stopped(self, speaker: Optional[str], event_time: float) -> None:
"""Reset the current speaker and do smart turn detection (if enabled)."""
+ # Update last speak end time
+ self._last_speak_end_time = event_time
+ self._last_speak_end_latency = self._total_time - event_time
+
# Turn prediction
- if self._uses_forced_eou:
+ if self._uses_eot_prediction and not self._forced_eou_active:
async def fn() -> None:
- ttl = await self._eot_prediction(event_time)
+ ttl = await self._eot_prediction(event_time, speaker)
self._turn_handler.update_timer(ttl)
- self._run_background_eot_calculation(fn)
+ self._run_background_eot_calculation(fn, "speaker_stopped")
# Emit the event
self._emit_message(
- VADStatusMessage(
+ SpeakerStatusMessage(
message=AgentServerMessageType.SPEAKER_ENDED,
speaker_id=speaker,
is_active=False,
diff --git a/sdk/voice/speechmatics/voice/_models.py b/sdk/voice/speechmatics/voice/_models.py
index fa9c1ea3..5f819b93 100644
--- a/sdk/voice/speechmatics/voice/_models.py
+++ b/sdk/voice/speechmatics/voice/_models.py
@@ -10,9 +10,11 @@
from typing import Literal
from typing import Optional
-from pydantic import BaseModel
+from pydantic import BaseModel as PydanticBaseModel
from pydantic import ConfigDict
from pydantic import Field
+from pydantic import model_validator
+from typing_extensions import Self
from speechmatics.rt import AudioEncoding
from speechmatics.rt import OperatingPoint
@@ -38,10 +40,6 @@ class EndOfUtteranceMode(str, Enum):
based on the content of what the most recent speaker has said, such as
rate of speech and whether they have any pauses or disfluencies.
- - `SMART_TURN`: Smart turn end of turn delay. The STT engine will use a combination
- of silence detection, adaptive delay and smart turn detection using machine learning
- to determine the end of turn.
-
Examples:
Using fixed mode (default):
>>> config = VoiceAgentConfig(
@@ -55,12 +53,6 @@ class EndOfUtteranceMode(str, Enum):
... end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE
... )
- Using smart turn detection:
- >>> config = VoiceAgentConfig(
- ... language="en",
- ... end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN,
- ... )
-
External control (manual finalization):
>>> config = VoiceAgentConfig(
... language="en",
@@ -73,7 +65,13 @@ class EndOfUtteranceMode(str, Enum):
EXTERNAL = "external"
FIXED = "fixed"
ADAPTIVE = "adaptive"
- SMART_TURN = "smart_turn"
+
+
+class MaxDelayMode(str, Enum):
+ """Max delay mode options for transcription."""
+
+ FIXED = "fixed"
+ FLEXIBLE = "flexible"
class TranscriptionUpdatePreset(str, Enum):
@@ -125,11 +123,15 @@ class AgentServerMessageType(str, Enum):
Speechmatics RT API / Voice Agent SDK can send to the client.
Attributes:
- RecognitionStarted: The recognition session has started.
- EndOfTranscript: The recognition session has ended.
- Info: Informational message.
- Warning: Warning message.
- Error: Error message.
+ RecognitionStarted: Server response to 'StartRecognition',
+ acknowledging that a recognition session has started.
+ EndOfTranscript: Indicates the server has finished sending all messages.
+ Info: Informational messages from the server.
+ Warning: Warning messages that don't stop transcription.
+ Error: Error messages indicating transcription failure.
+ AudioAdded: Server response to 'AddAudio', indicating
+ that audio has been added successfully.
+ Diagnostics: Diagnostic messages for development and troubleshooting.
AddPartialTranscript: Partial transcript has been added.
AddTranscript: Transcript has been added.
EndOfUtterance: End of utterance has been detected (from STT engine).
@@ -140,6 +142,7 @@ class AgentServerMessageType(str, Enum):
StartOfTurn: Start of turn has been detected.
EndOfTurnPrediction: End of turn prediction timing.
EndOfTurn: End of turn has been detected.
+ SmartTurn: Smart turn metadata.
SpeakersResult: Speakers result has been detected.
Metrics: Metrics for the STT engine.
SpeakerMetrics: Metrics relating to speakers.
@@ -171,6 +174,8 @@ class AgentServerMessageType(str, Enum):
INFO = "Info"
WARNING = "Warning"
ERROR = "Error"
+ AUDIO_ADDED = "AudioAdded"
+ DIAGNOSTICS = "Diagnostics"
# Raw transcription messages
ADD_PARTIAL_TRANSCRIPT = "AddPartialTranscript"
@@ -186,10 +191,11 @@ class AgentServerMessageType(str, Enum):
ADD_SEGMENT = "AddSegment"
# Turn messages
+ VAD_STATUS = "VadStatus"
START_OF_TURN = "StartOfTurn"
END_OF_TURN_PREDICTION = "EndOfTurnPrediction"
END_OF_TURN = "EndOfTurn"
- SMART_TURN_AUDIO = "SmartTurnAudio"
+ SMART_TURN_RESULT = "SmartTurnResult"
# Speaker messages
SPEAKERS_RESULT = "SpeakersResult"
@@ -238,23 +244,67 @@ class AnnotationFlags(str, Enum):
ONLY_PUNCTUATION = "only_punctuation"
MULTIPLE_SPEAKERS = "multiple_speakers"
NO_TEXT = "no_text"
+ HAS_PAUSE = "has_pause"
+ ENDS_WITH_PAUSE = "ends_with_pause"
# End of utterance detection
END_OF_UTTERANCE = "end_of_utterance"
+ # VAD
+ VAD_ACTIVE = "vad_active"
+ VAD_INACTIVE = "vad_inactive"
+ VAD_STARTED = "vad_started"
+ VAD_STOPPED = "vad_stopped"
+
+ # Smart Turn
+ SMART_TURN_ACTIVE = "smart_turn_active"
+ SMART_TURN_INACTIVE = "smart_turn_inactive"
+ SMART_TURN_TRUE = "smart_turn_true"
+ SMART_TURN_FALSE = "smart_turn_false"
+
# ==============================================================================
# CONFIGURATION MODELS
# ==============================================================================
-class BaseConfigModel(BaseModel):
+class BaseModel(PydanticBaseModel):
"""Base configuration model."""
model_config = ConfigDict(extra="forbid")
+ @classmethod
+ def from_dict(cls, data: dict, **kwargs: Any) -> Self:
+ """Convert a dictionary to a config object."""
+ return cls.model_validate(data, **kwargs) # type: ignore[no-any-return]
+
+ def to_dict(
+ self, exclude_none: bool = True, exclude_defaults: bool = False, exclude_unset: bool = False, **kwargs: Any
+ ) -> dict[str, Any]:
+ """Convert the model to a dictionary."""
+ return super().model_dump( # type: ignore[no-any-return]
+ mode="json",
+ exclude_none=exclude_none,
+ exclude_defaults=exclude_defaults,
+ exclude_unset=exclude_unset,
+ **kwargs,
+ )
+
+ @classmethod
+ def from_json(cls, json_data: str, **kwargs: Any) -> Self:
+ """Convert a JSON string to a config object."""
+ return cls.model_validate_json(json_data, **kwargs) # type: ignore[no-any-return]
-class AdditionalVocabEntry(BaseConfigModel):
+ def to_json(
+ self, exclude_none: bool = True, exclude_defaults: bool = False, exclude_unset: bool = False, **kwargs: Any
+ ) -> str:
+ """Convert the model to a JSON string."""
+ return self.model_dump_json( # type: ignore[no-any-return]
+ exclude_none=exclude_none, exclude_defaults=exclude_defaults, exclude_unset=exclude_unset, **kwargs
+ )
+
+
+class AdditionalVocabEntry(BaseModel):
"""Additional vocabulary entry.
Parameters:
@@ -280,10 +330,10 @@ class AdditionalVocabEntry(BaseConfigModel):
"""
content: str
- sounds_like: list[str] = Field(default_factory=list)
+ sounds_like: Optional[list[str]] = None
-class SpeakerFocusConfig(BaseConfigModel):
+class SpeakerFocusConfig(BaseModel):
"""Speaker Focus Config.
List of speakers to focus on, ignore and how to deal with speakers that are not
@@ -317,7 +367,7 @@ class SpeakerFocusConfig(BaseConfigModel):
focus_mode: SpeakerFocusMode = SpeakerFocusMode.RETAIN
-class SpeechSegmentConfig(BaseConfigModel):
+class SpeechSegmentConfig(BaseModel):
"""Configuration on how segments are emitted.
Parameters:
@@ -339,7 +389,7 @@ class SpeechSegmentConfig(BaseConfigModel):
pause_mark: Optional[str] = None
-class EndOfTurnPenaltyItem(BaseConfigModel):
+class EndOfTurnPenaltyItem(BaseModel):
"""End of turn penalty item.
Parameters:
@@ -353,19 +403,18 @@ class EndOfTurnPenaltyItem(BaseConfigModel):
is_not: bool = False
-class EndOfTurnConfig(BaseConfigModel):
+class EndOfTurnConfig(BaseModel):
"""Configuration for end of turn.
Parameters:
base_multiplier: Base multiplier for end of turn delay.
min_end_of_turn_delay: Minimum end of turn delay.
- end_of_turn_adjustment_factor: End of turn adjustment factor.
penalties: List of end of turn penalty items.
+ use_forced_eou: Whether to use forced end of utterance detection.
"""
base_multiplier: float = 1.0
- min_end_of_turn_delay: float = 0.3
- end_of_turn_adjustment_factor: float = 1.0
+ min_end_of_turn_delay: float = 0.01
penalties: list[EndOfTurnPenaltyItem] = Field(
default_factory=lambda: [
# Increase delay
@@ -380,48 +429,57 @@ class EndOfTurnConfig(BaseConfigModel):
),
# Decrease delay
EndOfTurnPenaltyItem(
- penalty=0.25, annotation=[AnnotationFlags.ENDS_WITH_FINAL, AnnotationFlags.ENDS_WITH_EOS]
+ penalty=0.5, annotation=[AnnotationFlags.ENDS_WITH_FINAL, AnnotationFlags.ENDS_WITH_EOS]
+ ),
+ # Smart Turn + VAD
+ EndOfTurnPenaltyItem(penalty=0.2, annotation=[AnnotationFlags.SMART_TURN_TRUE]),
+ EndOfTurnPenaltyItem(
+ penalty=0.2, annotation=[AnnotationFlags.VAD_STOPPED, AnnotationFlags.SMART_TURN_INACTIVE]
),
]
)
+ use_forced_eou: bool = False
-class SmartTurnConfig(BaseConfigModel):
- """Smart turn configuration for the Speechmatics Voice Agent.
-
- This configuration is used to determine when a turn has completed. It is used to
- extract slices of recent audio for post-processing by end of thought models.
+class VoiceActivityConfig(BaseModel):
+ """Configuration for voice activity detection.
Parameters:
- audio_buffer_length: Length of audio buffer to extract slices of recent audio for post-processing
- by end of thought models. Defaults to 0.0 seconds.
+ enabled: Whether voice activity detection is enabled.
+ silence_duration: Duration of silence in seconds before considering speech ended.
+ threshold: Threshold for voice activity detection.
+ """
- smart_turn_threshold: Smart turn threshold. This is used to determine when a turn has completed.
- Only used when `end_of_utterance_mode` is `EndOfUtteranceMode.SMART_TURN`. Defaults to 0.5.
+ enabled: bool = False
+ silence_duration: float = 0.18
+ threshold: float = 0.35
- slice_margin: Margin to add to the audio buffer to ensure that the end of thought models have
- enough audio to work with. Defaults to 0.05 seconds.
- positive_penalty: Positive penalty for smart turn. Defaults to -1.0.
+class SmartTurnConfig(BaseModel):
+ """Smart turn configuration for the Speechmatics Voice Agent.
- negative_penalty: Negative penalty for smart turn. Defaults to 2.5.
+ This configuration is used to determine when a turn has completed. It is used to
+ extract slices of recent audio for post-processing by end of thought models.
+
+ Parameters:
+ enabled: Whether smart turn is enabled.
+ smart_turn_threshold: Smart turn threshold. Defaults to 0.5.
+ max_audio_length: Maximum length of audio to analyze in seconds. Defaults to 8.0.
Examples:
>>> config = SmartTurnConfig(
- ... audio_buffer_length=0.5,
+ ... enabled=True,
... smart_turn_threshold=0.5,
- ... slice_margin=0.05
+ ... max_audio_length=8.0
... )
"""
- audio_buffer_length: float = 0.0
+ enabled: bool = False
smart_turn_threshold: float = 0.5
- slice_margin: float = 0.05
- positive_penalty: float = 0.3
- negative_penalty: float = 1.7
+ max_audio_length: float = 8.0
-class VoiceAgentConfig(BaseConfigModel):
+class VoiceAgentConfig(BaseModel):
"""Voice Agent configuration.
A framework-independent configuration object for the Speechmatics Voice Agent. This uses
@@ -470,11 +528,18 @@ class VoiceAgentConfig(BaseConfigModel):
than English. See documentation for more information.
Defaults to `None`.
- enable_diarization: Enable speaker diarization. When enabled, the STT engine will
- determine and attribute words to unique speakers. The speaker_sensitivity
- parameter can be used to adjust the sensitivity of diarization.
+ enable_entities: Enable entity detection. When enabled, the STT engine will
+ detect entities such as numbers, dates and currencies and include additional
+ entity information in the transcription results. See documentation for more information.
Defaults to `False`.
+ max_delay_mode: Determines whether the threshold specified in max_delay can be exceeded
+ when a potential entity is detected. Flexible means that, if a potential entity
+ is detected, max_delay can be overridden until the end of that
+ entity. Fixed means that the specified max_delay ignores any potential
+ entity that would not be completed within that threshold.
+ Defaults to `MaxDelayMode.FLEXIBLE`.
+
include_partials: Include partial segment fragments (words) in the output of
AddPartialSegment messages. Partial fragments from the STT will always be used for
speaker activity detection. If `include_results` is enabled, then partials will
@@ -482,6 +547,11 @@ class VoiceAgentConfig(BaseConfigModel):
the formatted text output of individual segments.
Defaults to `True`.
+ enable_diarization: Enable speaker diarization. When enabled, the STT engine will
+ determine and attribute words to unique speakers. The speaker_sensitivity
+ parameter can be used to adjust the sensitivity of diarization.
+ Defaults to `False`.
+
speaker_sensitivity: Diarization sensitivity. A higher value increases the sensitivity
of diarization and helps when two or more speakers have similar voices.
Defaults to `0.5`.
@@ -510,9 +580,6 @@ class VoiceAgentConfig(BaseConfigModel):
include_results: Include word data in the response. This is useful for debugging and
understanding the STT engine's behavior. Defaults to False.
- use_forced_eou_message: Use forced end of utterance message. This will force the STT engine to emit
- end of utterance messages. Defaults to False.
-
transcription_update_preset: Emit segments when the text content or word timings change.
Options are: `COMPLETE` (emit on changes to text content), `COMPLETE_PLUS_TIMING`
(emit on changes to text content and word timings), `WORDS` (emit on changes to word
@@ -522,14 +589,19 @@ class VoiceAgentConfig(BaseConfigModel):
end_of_turn_config: End of turn configuration for the Speechmatics Voice Agent.
+ vad_config: Voice activity detection configuration for the Speechmatics Voice Agent.
+
smart_turn_config: Smart turn configuration for the Speechmatics Voice Agent.
speech_segment_config: Speech segment configuration for the Speechmatics Voice Agent.
+ audio_buffer_length: Length of internal rolling audio buffer in seconds. Defaults to `0.0`.
+
advanced_engine_control: Internal use only.
sample_rate: Audio sample rate for streaming. Defaults to `16000`.
audio_encoding: Audio encoding format. Defaults to `AudioEncoding.PCM_S16LE`.
+ chunk_size: Audio chunk size in frames. Defaults to `160`.
Examples:
Basic configuration:
@@ -583,9 +655,9 @@ class VoiceAgentConfig(BaseConfigModel):
... enable_diarization=True,
... speaker_sensitivity=0.7,
... max_speakers=3,
- ... end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN,
+ ... end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
... smart_turn_config=SmartTurnConfig(
- ... smart_turn_threshold=0.5
+ ... enabled=True
... ),
... additional_vocab=[
... AdditionalVocabEntry(content="API"),
@@ -604,16 +676,18 @@ class VoiceAgentConfig(BaseConfigModel):
output_locale: Optional[str] = None
# Features
- max_delay: float = 0.7
- end_of_utterance_silence_trigger: float = 0.2
+ max_delay: float = 1.0
+ end_of_utterance_silence_trigger: float = 0.5
end_of_utterance_max_delay: float = 10.0
end_of_utterance_mode: EndOfUtteranceMode = EndOfUtteranceMode.FIXED
additional_vocab: list[AdditionalVocabEntry] = Field(default_factory=list)
punctuation_overrides: Optional[dict] = None
+ enable_entities: bool = False
+ max_delay_mode: MaxDelayMode = MaxDelayMode.FLEXIBLE
+ include_partials: bool = True
# Diarization
enable_diarization: bool = False
- include_partials: bool = True
speaker_sensitivity: float = 0.5
max_speakers: Optional[int] = None
prefer_current_speaker: bool = False
@@ -622,11 +696,12 @@ class VoiceAgentConfig(BaseConfigModel):
# Advanced features
include_results: bool = False
- use_forced_eou_message: bool = False
transcription_update_preset: TranscriptionUpdatePreset = TranscriptionUpdatePreset.COMPLETE
end_of_turn_config: EndOfTurnConfig = Field(default_factory=EndOfTurnConfig)
- smart_turn_config: SmartTurnConfig = Field(default_factory=SmartTurnConfig)
+ vad_config: Optional[VoiceActivityConfig] = None
+ smart_turn_config: Optional[SmartTurnConfig] = None
speech_segment_config: SpeechSegmentConfig = Field(default_factory=SpeechSegmentConfig)
+ audio_buffer_length: float = 0.0
# Advanced engine configuration
advanced_engine_control: Optional[dict[str, Any]] = None
@@ -634,19 +709,54 @@ class VoiceAgentConfig(BaseConfigModel):
# Audio
sample_rate: int = 16000
audio_encoding: AudioEncoding = AudioEncoding.PCM_S16LE
+ chunk_size: int = 160
+
+ # Validation
+ @model_validator(mode="after") # type: ignore[misc]
+ def validate_config(self) -> Self:
+ """Validate the configuration."""
+
+ # Validation errors
+ errors: list[str] = []
+
+ # End of utterance mode cannot be EXTERNAL if smart turn is enabled
+ if self.end_of_utterance_mode == EndOfUtteranceMode.EXTERNAL and self.smart_turn_config:
+ errors.append("EXTERNAL mode cannot be used in conjunction with SmartTurnConfig")
+
+ # Cannot have FIXED and forced end of utterance enabled without VAD being enabled
+ if (self.end_of_utterance_mode == EndOfUtteranceMode.FIXED and self.end_of_turn_config.use_forced_eou) and not (
+ self.vad_config and self.vad_config.enabled
+ ):
+ errors.append("FIXED mode cannot be used in conjunction with forced end of utterance without VAD enabled")
+
+ # Cannot use VAD with external end of utterance mode
+ if self.end_of_utterance_mode == EndOfUtteranceMode.EXTERNAL and (self.vad_config and self.vad_config.enabled):
+ errors.append("EXTERNAL mode cannot be used in conjunction with VAD being enabled")
+
+ # Check end_of_utterance_max_delay is greater than end_of_utterance_silence_trigger
+ if self.end_of_utterance_max_delay < self.end_of_utterance_silence_trigger:
+ errors.append("end_of_utterance_max_delay must be greater than end_of_utterance_silence_trigger")
+
+ # If diarization is not enabled, then max_speakers cannot be set
+ if not self.enable_diarization and self.max_speakers:
+ errors.append("max_speakers cannot be set when enable_diarization is False")
+
+ # If diarization is not enabled, then SpeakerFocusConfig.focus_speakers and SpeakerFocusConfig.ignore_speakers must be empty
+ if not self.enable_diarization and (self.speaker_config.focus_speakers or self.speaker_config.ignore_speakers):
+ errors.append(
+ "SpeakerFocusConfig.focus_speakers and SpeakerFocusConfig.ignore_speakers must be empty when enable_diarization is False"
+ )
- # Parse JSON
- @classmethod
- def from_json(cls, json_data: str) -> VoiceAgentConfig:
- """Convert a JSON string to a VoiceAgentConfig object."""
- cfg: VoiceAgentConfig = cls.model_validate_json(json_data)
- return cfg
+ # Check sample rate
+ if self.sample_rate not in [8000, 16000]:
+ errors.append("sample_rate must be 8000 or 16000")
- # To JSON
- def to_json(self) -> str:
- """Convert the model to a JSON string."""
- config_str: str = self.model_dump_json(exclude_none=True, exclude_defaults=True, exclude_unset=True)
- return config_str
+ # Raise error if any validation errors
+ if errors:
+ raise ValueError(f"{len(errors)} config error(s): {'; '.join(errors)}")
+
+ # Return validated config
+ return self
# ==============================================================================
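The new `validate_config` validator collects every problem before raising, so an invalid configuration reports all of its issues in one go. A minimal sketch of what that looks like from the caller's side, assuming the same top-level exports the test suite uses (the printed error text is illustrative):

```python
from speechmatics.voice import EndOfUtteranceMode
from speechmatics.voice import SmartTurnConfig
from speechmatics.voice import VoiceAgentConfig

try:
    # EXTERNAL end-of-utterance mode plus SmartTurnConfig is rejected, and so
    # is an unsupported sample rate, so both errors are reported together.
    VoiceAgentConfig(
        end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL,
        smart_turn_config=SmartTurnConfig(enabled=True),
        sample_rate=44100,
    )
except ValueError as exc:  # pydantic's ValidationError subclasses ValueError
    print(exc)  # e.g. "2 config error(s): EXTERNAL mode cannot be used ..."
```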
@@ -852,12 +962,28 @@ def end_time(self) -> float:
"""Return the end time of the segment."""
return self.fragments[-1].end_time if self.fragments else 0.0
- def model_dump(self, include_results: bool = False, **kwargs: Any) -> dict[str, Any]:
+ def to_dict(
+ self,
+ exclude_none: bool = True,
+ exclude_defaults: bool = False,
+ exclude_unset: bool = False,
+ include_results: bool = False,
+ **kwargs: Any,
+ ) -> dict[str, Any]:
"""Override model_dump to control fragments/results inclusion."""
# Always exclude fragments from the base dump
- kwargs["exclude"] = {"fragments"}
- data: dict[str, Any] = super().model_dump(**kwargs)
+ exclude = kwargs.get("exclude", set())
+ if isinstance(exclude, set):
+ exclude.add("fragments")
+ else:
+ exclude = {"fragments"}
+ kwargs["exclude"] = exclude
+
+ # Get the base dump
+ data: dict[str, Any] = super().model_dump(
+ exclude_none=exclude_none, exclude_defaults=exclude_defaults, exclude_unset=exclude_unset, **kwargs
+ )
# Add timing information
data["start_time"] = self.start_time
@@ -904,7 +1030,16 @@ def __init__(
annotate_segments=annotate_segments,
)
- super().__init__(session=session, fragments=fragments, segments=segments, focus_speakers=focus_speakers, **data)
+ # Initialize with the computed values
+ data.update(
+ {
+ "session": session,
+ "fragments": fragments,
+ "segments": segments,
+ "focus_speakers": focus_speakers,
+ }
+ )
+ super().__init__(**data)
@property
def start_time(self) -> float:
@@ -998,22 +1133,34 @@ def trim(self, start_time: float, end_time: float, annotate_segments: bool = Tru
# ==============================================================================
-class BaseMessageModel(BaseModel):
+class BaseMessage(BaseModel):
"""Base model for all messages."""
- def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
- """Default to excluding None values."""
- return super().model_dump(*args, **kwargs, exclude_none=True, mode="json") # type: ignore[no-any-return]
+ message: str
- def model_dump_json(self, *args: Any, **kwargs: Any) -> str:
- """Default to excluding None values."""
- return super().model_dump_json(*args, **kwargs, exclude_none=True) # type: ignore[no-any-return]
+ @classmethod
+ def from_message(cls, data: dict, **kwargs: Any) -> Self:
+ """Convert a message dictionary to a message object.
+ Alias for from_dict() for semantic clarity when working with messages.
+ """
+ return cls.from_dict(data, **kwargs)
-class BaseMessage(BaseMessageModel):
- """Base model for all messages."""
- message: AgentServerMessageType
+class MessageTimeMetadata(BaseModel):
+ """Metadata for segment messages.
+
+ Parameters:
+ time: The time of the event.
+ start_time: The start time of the segment.
+ end_time: The end time of the segment.
+ processing_time: The processing time of the segment.
+ """
+
+ time: Optional[float] = None
+ start_time: Optional[float] = None
+ end_time: Optional[float] = None
+ processing_time: Optional[float] = None
class ErrorMessage(BaseMessage):
@@ -1046,7 +1193,7 @@ class SessionMetricsMessage(BaseMessage):
processing_time: float
-class VADStatusMessage(BaseMessage):
+class SpeakerStatusMessage(BaseMessage):
"""Emitted when a speaker starts or ends speaking.
The speaker id is taken from the last word in the segment when
@@ -1065,20 +1212,22 @@ class VADStatusMessage(BaseMessage):
time: Optional[float] = None
-class MessageTimeMetadata(BaseMessageModel):
- """Metadata for segment messages.
+class VADStatusMessage(BaseMessage):
+ """Emitted when voice activity detection status changes.
Parameters:
- time: The time of the event.
- start_time: The start time of the segment.
- end_time: The end time of the segment.
- processing_time: The processing time of the segment.
+ message: The message type.
+ is_speech: Whether speech is detected.
+ probability: The probability of speech.
+ transition_duration_ms: The duration of the transition in milliseconds.
+ metadata: The time metadata.
"""
- time: Optional[float] = None
- start_time: Optional[float] = None
- end_time: Optional[float] = None
- processing_time: Optional[float] = None
+ message: AgentServerMessageType = AgentServerMessageType.VAD_STATUS
+ metadata: MessageTimeMetadata
+ is_speech: bool
+ probability: float
+ transition_duration_ms: float
class TurnStartEndResetMessage(BaseMessage):
@@ -1097,16 +1246,17 @@ class TurnStartEndResetMessage(BaseMessage):
metadata: MessageTimeMetadata
-class TurnPredictionMetadata(BaseMessageModel):
+class TurnPredictionMetadata(BaseModel):
"""Metadata for turn prediction messages.
Parameters:
ttl: The time to live of the prediction in seconds.
- reasons: The reasons for the prediction.
"""
ttl: float
- reasons: list[str]
+ reasons: list[str] = Field(default_factory=list, exclude=False)
+
+ model_config = ConfigDict(extra="ignore")
class TurnPredictionMessage(BaseMessage):
@@ -1128,7 +1278,7 @@ class SpeakerMetricsMessage(BaseMessage):
speakers: list[SessionSpeaker]
-class SegmentMessageSegmentFragment(BaseMessageModel):
+class SegmentMessageSegmentFragment(BaseModel):
"""Speech fragment for segment messages.
Parameters:
@@ -1148,11 +1298,12 @@ class SegmentMessageSegmentFragment(BaseMessageModel):
type: str = Field(default="word", alias="type_")
content: str = ""
attaches_to: str = ""
+ is_eos: bool = False
model_config = ConfigDict(extra="ignore")
-class SegmentMessageSegment(BaseMessageModel):
+class SegmentMessageSegment(BaseModel):
"""Partial or final segment.
Parameters:
@@ -1162,7 +1313,6 @@ class SegmentMessageSegment(BaseMessageModel):
language: The language of the frame.
text: The text of the segment.
fragments: The fragments associated with the segment.
- annotation: The annotation associated with the segment.
metadata: The metadata associated with the segment.
"""
@@ -1172,9 +1322,11 @@ class SegmentMessageSegment(BaseMessageModel):
language: Optional[str] = None
text: Optional[str] = None
fragments: Optional[list[SegmentMessageSegmentFragment]] = None
- annotation: list[AnnotationFlags] = Field(default_factory=list)
+ annotation: list[AnnotationFlags] = Field(default_factory=list, exclude=False)
metadata: MessageTimeMetadata
+ model_config = ConfigDict(extra="ignore")
+
class SegmentMessage(BaseMessage):
"""Emitted when a segment is added to the session."""
diff --git a/sdk/voice/speechmatics/voice/_presets.py b/sdk/voice/speechmatics/voice/_presets.py
index 88703c9c..18b11830 100644
--- a/sdk/voice/speechmatics/voice/_presets.py
+++ b/sdk/voice/speechmatics/voice/_presets.py
@@ -6,9 +6,12 @@
from typing import Optional
+from ._models import EndOfTurnConfig
from ._models import EndOfUtteranceMode
from ._models import OperatingPoint
+from ._models import SmartTurnConfig
from ._models import SpeechSegmentConfig
+from ._models import VoiceActivityConfig
from ._models import VoiceAgentConfig
@@ -16,69 +19,102 @@ class VoiceAgentConfigPreset:
"""Set of preset configurations for the Voice Agent SDK."""
@staticmethod
- def LOW_LATENCY(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802
+ def FAST(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802
"""Best suited for low latency situations.
This configuration will emit the end of turn as soon as possible, with minimal
delay to finalizing the spoken sentences. It is not recommended for
conversation, as it will not account for pauses, slow speech or disfluencies.
+
+ Note that this uses our standard operating point so will have marginally lower
+ accuracy than the enhanced operating point.
"""
return VoiceAgentConfigPreset._merge_configs(
VoiceAgentConfig(
operating_point=OperatingPoint.STANDARD,
enable_diarization=True,
- max_delay=0.7,
+ max_delay=2.0,
+ end_of_utterance_silence_trigger=0.25,
+ end_of_utterance_mode=EndOfUtteranceMode.FIXED,
+ speech_segment_config=SpeechSegmentConfig(emit_sentences=False),
+ ),
+ overlay,
+ )
+
+ @staticmethod
+ def FIXED(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802
+ """Best suited for general conversational use cases with fixed end-of-utterance timing.
+
+ For conversation, there is a balance between accuracy, speed and the rate at
+ which the end of turn is emitted. This configuration uses fixed timing for
+ end-of-utterance detection.
+ """
+ return VoiceAgentConfigPreset._merge_configs(
+ VoiceAgentConfig(
+ operating_point=OperatingPoint.ENHANCED,
+ enable_diarization=True,
+ max_delay=2.0,
end_of_utterance_silence_trigger=0.5,
end_of_utterance_mode=EndOfUtteranceMode.FIXED,
- speech_segment_config=SpeechSegmentConfig(emit_sentences=True),
+ speech_segment_config=SpeechSegmentConfig(emit_sentences=False),
),
overlay,
)
@staticmethod
- def CONVERSATION_ADAPTIVE(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802
+ def ADAPTIVE(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802
"""Best suited for general conversational use cases.
For conversation, there is a balance between accuracy, speed and the rate at
- which the end of turn is emitted. Tne use of ADAPTIVE means that the delay to
+ which the end of turn is emitted. The use of ADAPTIVE means that the delay to
finalizing the spoken sentences will be adjusted based on the words and whether
there are any pauses, slow speech or disfluencies.
+
+ Use of this will require `pip install speechmatics-voice[smart]` and may not
+ be suited to low-power devices.
"""
return VoiceAgentConfigPreset._merge_configs(
VoiceAgentConfig(
operating_point=OperatingPoint.ENHANCED,
enable_diarization=True,
- max_delay=0.7,
- end_of_utterance_silence_trigger=1.0,
+ max_delay=2.0,
+ end_of_utterance_silence_trigger=0.7,
end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
speech_segment_config=SpeechSegmentConfig(emit_sentences=False),
+ vad_config=VoiceActivityConfig(enabled=True),
+ end_of_turn_config=EndOfTurnConfig(use_forced_eou=True),
),
overlay,
)
@staticmethod
- def CONVERSATION_SMART_TURN(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802
+ def SMART_TURN(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802
"""Best suited for complex conversational use cases.
For conversation, there is a balance between accuracy, speed and the rate at
- which the end of turn is emitted. Tne use of SMART_TURN means that the delay to
+ which the end of turn is emitted. The use of SMART_TURN means that the delay to
finalizing the spoken sentences will be adjusted based on the words and whether
there are any pauses, slow speech or disfluencies.
This preset will use a model to detect for acoustic indicators from the
speaker to determine when a turn has ended.
- Use of this will requite `pip install speechmatics-voice[smart]` and may not
+ Use of this will require `pip install speechmatics-voice[smart]` and may not
be suited to low-power devices.
"""
return VoiceAgentConfigPreset._merge_configs(
VoiceAgentConfig(
operating_point=OperatingPoint.ENHANCED,
enable_diarization=True,
- max_delay=0.7,
- end_of_utterance_silence_trigger=1.0,
- end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN,
+ max_delay=2.0,
+ end_of_utterance_silence_trigger=0.8,
+ end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
speech_segment_config=SpeechSegmentConfig(emit_sentences=False),
+ smart_turn_config=SmartTurnConfig(
+ enabled=True,
+ ),
+ vad_config=VoiceActivityConfig(enabled=True),
+ end_of_turn_config=EndOfTurnConfig(use_forced_eou=True),
),
overlay,
)
@@ -89,33 +125,40 @@ def SCRIBE(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # n
This mode will emit partial and final segments as they become available. The end of
utterance is set to fixed. End of turn is not required for note-taking.
+
+ Use of this will require `pip install speechmatics-voice[smart]` and may not
+ be suited to low-power devices.
"""
return VoiceAgentConfigPreset._merge_configs(
VoiceAgentConfig(
operating_point=OperatingPoint.ENHANCED,
enable_diarization=True,
- max_delay=1.0,
- end_of_utterance_silence_trigger=1.2,
- end_of_utterance_mode=EndOfUtteranceMode.FIXED,
+ max_delay=2.0,
+ end_of_utterance_silence_trigger=1.0,
+ end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
speech_segment_config=SpeechSegmentConfig(emit_sentences=True),
+ smart_turn_config=SmartTurnConfig(
+ enabled=True,
+ ),
+ vad_config=VoiceActivityConfig(enabled=True, silence_duration=0.2),
+ end_of_turn_config=EndOfTurnConfig(use_forced_eou=True),
),
overlay,
)
@staticmethod
def CAPTIONS(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802
- """Best suited for captions.
+ """Best suited for captioning.
- This mode will emit partial and final segments as they become available. The end of
- utterance is set to fixed. End of turn is not required for captions. The segments
- will only include finalized words.
+ This mode will emit final segments as they become available. The end of
+ utterance is set to fixed. End of turn is not required for captioning.
"""
return VoiceAgentConfigPreset._merge_configs(
VoiceAgentConfig(
operating_point=OperatingPoint.ENHANCED,
enable_diarization=True,
- max_delay=0.9,
- end_of_utterance_silence_trigger=1.2,
+ max_delay=0.7,
+ end_of_utterance_silence_trigger=0.5,
end_of_utterance_mode=EndOfUtteranceMode.FIXED,
speech_segment_config=SpeechSegmentConfig(emit_sentences=True),
include_partials=False,
@@ -134,10 +177,10 @@ def EXTERNAL(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: #
VoiceAgentConfig(
operating_point=OperatingPoint.ENHANCED,
enable_diarization=True,
- max_delay=1.0,
- end_of_utterance_silence_trigger=1.2,
+ max_delay=2.0,
end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL,
- speech_segment_config=SpeechSegmentConfig(emit_sentences=True),
+ speech_segment_config=SpeechSegmentConfig(emit_sentences=False),
+ end_of_turn_config=EndOfTurnConfig(use_forced_eou=True),
),
overlay,
)
@@ -161,7 +204,7 @@ def load(preset: str, overlay_json: Optional[str] = None) -> VoiceAgentConfig:
try:
config: VoiceAgentConfig = getattr(VoiceAgentConfigPreset, preset.upper())()
if overlay_json is not None:
- overlay = VoiceAgentConfig.model_validate_json(overlay_json)
+ overlay = VoiceAgentConfig.from_json(overlay_json)
config = VoiceAgentConfigPreset._merge_configs(config, overlay)
return config
except ValueError:
@@ -189,9 +232,9 @@ def _merge_configs(base: VoiceAgentConfig, overlay: Optional[VoiceAgentConfig])
if overlay is None:
return base
- # Merge overlay into base - use model_validate to properly reconstruct nested models
+ # Merge overlay into base
merged_dict = {
**base.model_dump(exclude_unset=True, exclude_none=True),
**overlay.model_dump(exclude_unset=True, exclude_none=True),
}
- return VoiceAgentConfig.model_validate(merged_dict) # type: ignore[no-any-return]
+ return VoiceAgentConfig.from_dict(merged_dict)
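For reference, the renamed presets are exercised the same way as the old ones; a short usage sketch mirroring what `test_14_presets.py` does further down in this change:

```python
from speechmatics.voice import VoiceAgentConfig
from speechmatics.voice._presets import VoiceAgentConfigPreset

# Overlay a couple of fields on top of the FAST preset
config = VoiceAgentConfigPreset.FAST(VoiceAgentConfig(max_delay=1.2, enable_diarization=False))
assert config.max_delay == 1.2
assert config.enable_diarization is False

# Presets can also be loaded by name, with an optional JSON overlay
config = VoiceAgentConfigPreset.load("fast", '{"operating_point": "enhanced"}')
```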
diff --git a/sdk/voice/speechmatics/voice/_smart_turn.py b/sdk/voice/speechmatics/voice/_smart_turn.py
index 011318d8..9ce44a03 100644
--- a/sdk/voice/speechmatics/voice/_smart_turn.py
+++ b/sdk/voice/speechmatics/voice/_smart_turn.py
@@ -14,7 +14,8 @@
from urllib.parse import urlparse
import numpy as np
-from pydantic import BaseModel
+
+from speechmatics.voice._models import BaseModel
ort: Any
WhisperFeatureExtractor: Any
@@ -44,9 +45,9 @@ def _create_ssl_context(*args: Any, **kwargs: Any) -> ssl.SSLContext:
# Base model from HuggingFace
SMART_TURN_MODEL_URL = os.getenv(
- "SMART_TURN_HF_URL", "https://huggingface.co/pipecat-ai/smart-turn-v3/resolve/main/smart-turn-v3.0.onnx"
+ "SMART_TURN_HF_URL", "https://huggingface.co/pipecat-ai/smart-turn-v3/resolve/main/smart-turn-v3.1-cpu.onnx"
)
-SMART_TURN_MODEL_LOCAL_PATH = os.getenv("SMART_TURN_MODEL_PATH", ".models/smart-turn-v3.0.onnx")
+SMART_TURN_MODEL_LOCAL_PATH = os.getenv("SMART_TURN_MODEL_PATH", ".models/smart-turn-v3.1-cpu.onnx")
# Hint for when dependencies are not available
SMART_TURN_INSTALL_HINT = "SMART_TURN mode unavailable. Install `speechmatics-voice[smart]` to enable SMART_TURN mode."
@@ -187,13 +188,21 @@ async def predict(
dtype = np.int16 if sample_width == 2 else np.int8
int16_array: np.ndarray = np.frombuffer(audio_array, dtype=dtype).astype(np.int16)
+ # Truncate to last 8 seconds if needed (keep the tail/end of audio)
+ max_samples = 8 * sample_rate
+ if len(int16_array) > max_samples:
+ int16_array = int16_array[-max_samples:]
+
+ # Convert int16 to float32 in range [-1, 1] (same as reference implementation)
+ float32_array: np.ndarray = int16_array.astype(np.float32) / 32768.0
+
# Process audio using Whisper's feature extractor
inputs = self.feature_extractor(
- int16_array,
+ float32_array,
sampling_rate=sample_rate,
return_tensors="np",
padding="max_length",
- max_length=8 * sample_rate,
+ max_length=max_samples,
truncation=True,
do_normalize=True,
)
@@ -217,8 +226,8 @@ async def predict(
# Return the result
return SmartTurnPredictionResult(
prediction=prediction,
- probability=probability,
- processing_time=float((end_time - start_time).total_seconds()),
+ probability=round(probability, 3),
+ processing_time=round(float((end_time - start_time).total_seconds()), 3),
)
@staticmethod
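The preprocessing change above keeps only the most recent 8 seconds of audio and hands the feature extractor float32 samples in [-1, 1] rather than raw int16. A standalone numpy sketch of that conversion (values are illustrative):

```python
import numpy as np

sample_rate = 16000
audio_bytes = b"\x00\x01" * (sample_rate * 10)  # ~10 s of fake int16 PCM

int16_array = np.frombuffer(audio_bytes, dtype=np.int16)

# Keep only the tail: the most recent 8 seconds of samples
max_samples = 8 * sample_rate
if len(int16_array) > max_samples:
    int16_array = int16_array[-max_samples:]

# Scale int16 into float32 in [-1, 1], as the Whisper feature extractor expects
float32_array = int16_array.astype(np.float32) / 32768.0
assert float32_array.shape == (max_samples,)
```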
diff --git a/sdk/voice/speechmatics/voice/_utils.py b/sdk/voice/speechmatics/voice/_utils.py
index 31e7e244..ecb01d0c 100644
--- a/sdk/voice/speechmatics/voice/_utils.py
+++ b/sdk/voice/speechmatics/voice/_utils.py
@@ -70,6 +70,7 @@ def format_segment_text(
**{
"speaker_id": segment.speaker_id,
"text": content,
+ "content": content,
"ts": segment.timestamp,
"lang": segment.language,
"start_time": fragments[0].start_time if fragments else 0,
@@ -294,7 +295,7 @@ def _annotate_segment(segment: SpeakerSegment) -> AnnotationResult:
# Categorize the speaker
if wpm < 80:
result.add(AnnotationFlags.VERY_SLOW_SPEAKER)
- elif wpm < 120:
+ elif wpm < 110:
result.add(AnnotationFlags.SLOW_SPEAKER)
elif wpm > 250:
result.add(AnnotationFlags.FAST_SPEAKER)
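The adjusted words-per-minute thresholds amount to the following banding (a simplified sketch; the real `_annotate_segment` works on `SpeakerSegment` objects and adds `AnnotationFlags`, reduced here to plain strings for illustration):

```python
def categorise_speaker_rate(wpm: float) -> str:
    # Mirrors the updated thresholds: <80 very slow, <110 slow, >250 fast
    if wpm < 80:
        return "VERY_SLOW_SPEAKER"
    if wpm < 110:
        return "SLOW_SPEAKER"
    if wpm > 250:
        return "FAST_SPEAKER"
    return "NORMAL"


assert categorise_speaker_rate(105) == "SLOW_SPEAKER"
```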
diff --git a/sdk/voice/speechmatics/voice/_vad.py b/sdk/voice/speechmatics/voice/_vad.py
new file mode 100644
index 00000000..e5a7b1e8
--- /dev/null
+++ b/sdk/voice/speechmatics/voice/_vad.py
@@ -0,0 +1,354 @@
+#
+# Copyright (c) 2025, Speechmatics / Cantab Research Ltd
+#
+
+from __future__ import annotations
+
+import logging
+import os
+import time
+import urllib.request
+from collections import deque
+from typing import Any
+from typing import Callable
+from typing import Optional
+from urllib.parse import urlparse
+
+import numpy as np
+
+from speechmatics.voice._models import BaseModel
+
+ort: Any
+logger = logging.getLogger(__name__)
+
+try:
+ import onnxruntime as _ort
+
+ ort = _ort
+except ModuleNotFoundError:
+ ort = None
+
+
+# Silero VAD model
+SILERO_MODEL_URL = os.getenv(
+ "SILERO_MODEL_URL", "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx"
+)
+SILERO_MODEL_PATH = os.getenv("SILERO_MODEL_PATH", ".models/silero_vad.onnx")
+
+# Hint for when dependencies are not available
+SILERO_INSTALL_HINT = "Silero VAD unavailable. Install `speechmatics-voice[smart]` to enable VAD."
+
+# Silero VAD constants
+SILERO_SAMPLE_RATE = 16000
+SILERO_CHUNK_SIZE = 512 # Silero expects 512 samples at 16kHz (32ms chunks)
+SILERO_CONTEXT_SIZE = 64 # Silero uses 64-sample context
+MODEL_RESET_STATES_TIME = 5.0 # Reset state every 5 seconds
+SILERO_CHUNK_DURATION_MS = (SILERO_CHUNK_SIZE / SILERO_SAMPLE_RATE) * 1000 # 32ms per chunk
+
+
+class SileroVADResult(BaseModel):
+ """VAD result from Silero.
+
+ Attributes:
+ is_speech: True if speech detected, False if silence
+ probability: Probability of speech (0.0-1.0)
+ transition_duration_ms: Duration of consecutive silence in milliseconds (used for transition threshold)
+ speech_ended: True if silence duration exceeded the threshold
+ metadata: Additional metadata about the VAD result
+ error: Error message if an error occurred
+ """
+
+ is_speech: bool = False
+ probability: float = 0.0
+ transition_duration_ms: float = 0.0
+ speech_ended: bool = False
+ metadata: Optional[dict] = None
+ error: Optional[str] = None
+
+
+class SileroVAD:
+ """Silero Voice Activity Detector.
+
+ Uses Silero's open-source VAD model for detecting speech vs silence.
+ Processes audio in 512-sample chunks at 16kHz.
+
+ Further information at https://github.com/snakers4/silero-vad
+ """
+
+ def __init__(
+ self,
+ auto_init: bool = True,
+ threshold: float = 0.5,
+ silence_duration: float = 0.1,
+ on_state_change: Optional[Callable[[SileroVADResult], None]] = None,
+ ):
+ """Create the new SileroVAD.
+
+ Args:
+ auto_init: Whether to automatically initialise the detector.
+ threshold: Probability threshold for speech detection (0.0-1.0).
+ silence_duration: Duration of consecutive silence (in seconds) before considering speech ended.
+ on_state_change: Optional callback invoked when VAD state changes (speech <-> silence).
+ """
+
+ self._is_initialized: bool = False
+ self._threshold: float = threshold
+ self._on_state_change: Optional[Callable[[SileroVADResult], None]] = on_state_change
+
+ # ONNX session state
+ self._state: Optional[np.ndarray] = None
+ self._context: Optional[np.ndarray] = None
+ self._last_reset_time: float = 0.0
+
+ # Audio buffering
+ self._audio_buffer: bytes = b""
+
+ # Rolling window for predictions (100ms window = ~3-4 chunks at 32ms each)
+ window_chunks = int((silence_duration * 1000) / SILERO_CHUNK_DURATION_MS) + 1
+ self._prediction_window: deque[float] = deque(maxlen=window_chunks)
+
+ # State tracking
+ self._last_is_speech: bool = False # Track previous state for change detection (default: not speaking)
+
+ if auto_init:
+ self.setup()
+
+ @staticmethod
+ def dependencies_available() -> bool:
+ """Return whether optional Silero dependencies are installed."""
+ return ort is not None
+
+ def setup(self) -> None:
+ """Setup the detector.
+
+ Initialises the ONNX model and internal states.
+ """
+
+ # Show warning if dependencies are not available
+ if not self.dependencies_available():
+ logger.warning(SILERO_INSTALL_HINT)
+ return
+
+ try:
+ # Check / download the model
+ self.download_model()
+
+ # Check the model downloaded
+ if not self.model_exists():
+ logger.warning("Silero VAD model not found. Please download the model first.")
+ return
+
+ # Build the session
+ self.session = self.build_session(SILERO_MODEL_PATH)
+
+ # Initialize states
+ self._init_states()
+
+ # Set initialized
+ self._is_initialized = True
+
+ except Exception as e:
+ logger.error(f"Failed to setup SileroVAD: {e}")
+
+ def build_session(self, onnx_path: str) -> ort.InferenceSession:
+ """Build the ONNX session and load resources.
+
+ Args:
+ onnx_path: Path to the ONNX model.
+
+ Returns:
+ ONNX inference session.
+ """
+
+ # Show warning if dependencies are not available
+ if ort is None:
+ raise RuntimeError("onnxruntime is not available")
+
+ # Build the session
+ so = ort.SessionOptions()
+ so.inter_op_num_threads = 1
+ so.intra_op_num_threads = 1
+
+ # Return the new session
+ return ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"], sess_options=so)
+
+ def _init_states(self) -> None:
+ """Initialize or reset internal VAD states."""
+ self._state = np.zeros((2, 1, 128), dtype=np.float32)
+ self._context = np.zeros((1, SILERO_CONTEXT_SIZE), dtype=np.float32)
+ self._last_reset_time = time.time()
+
+ def _maybe_reset_states(self) -> None:
+ """Reset ONNX model states periodically to prevent drift.
+
+ Note: Does NOT reset prediction window or speech state tracking.
+ """
+ if (time.time() - self._last_reset_time) >= MODEL_RESET_STATES_TIME:
+ self._state = np.zeros((2, 1, 128), dtype=np.float32)
+ self._context = np.zeros((1, SILERO_CONTEXT_SIZE), dtype=np.float32)
+ self._last_reset_time = time.time()
+
+ def process_chunk(self, chunk_f32: np.ndarray) -> float:
+ """Process a single 512-sample chunk and return speech probability.
+
+ Args:
+ chunk_f32: Float32 numpy array of exactly 512 samples.
+
+ Returns:
+ Speech probability (0.0-1.0).
+
+ Raises:
+ ValueError: If chunk is not exactly 512 samples.
+ """
+ # Ensure shape (1, 512)
+ x = np.reshape(chunk_f32, (1, -1))
+ if x.shape[1] != SILERO_CHUNK_SIZE:
+ raise ValueError(f"Expected {SILERO_CHUNK_SIZE} samples, got {x.shape[1]}")
+
+ # Concatenate with context (previous 64 samples)
+ if self._context is not None:
+ x = np.concatenate((self._context, x), axis=1)
+
+ # Run ONNX inference
+ ort_inputs = {
+ "input": x.astype(np.float32),
+ "state": self._state,
+ "sr": np.array(SILERO_SAMPLE_RATE, dtype=np.int64),
+ }
+ out, self._state = self.session.run(None, ort_inputs)
+
+ # Update context (keep last 64 samples)
+ self._context = x[:, -SILERO_CONTEXT_SIZE:]
+
+ # Maybe reset states periodically
+ self._maybe_reset_states()
+
+ # Return probability (out shape is (1, 1))
+ return float(out[0][0])
+
+ async def process_audio(self, audio_bytes: bytes, sample_rate: int = 16000, sample_width: int = 2) -> None:
+ """Process incoming audio bytes and invoke callback on state changes.
+
+ This method buffers incomplete chunks and processes all complete 512-sample chunks.
+ The callback is invoked only once at the end if the VAD state changed during processing.
+
+ Args:
+ audio_bytes: Raw audio bytes (int16 PCM).
+ sample_rate: Sample rate of the audio (must be 16000).
+ sample_width: Sample width in bytes (2 for int16).
+ """
+
+ if not self._is_initialized:
+ logger.error("SileroVAD is not initialized")
+ return
+
+ if sample_rate != SILERO_SAMPLE_RATE:
+ logger.error(f"Sample rate must be {SILERO_SAMPLE_RATE}Hz, got {sample_rate}Hz")
+ return
+
+ # Add new bytes to buffer
+ self._audio_buffer += audio_bytes
+
+ # Calculate bytes per chunk (512 samples * 2 bytes for int16)
+ bytes_per_chunk = SILERO_CHUNK_SIZE * sample_width
+
+ # Process all complete chunks in buffer
+ while len(self._audio_buffer) >= bytes_per_chunk:
+ # Extract one chunk
+ chunk_bytes = self._audio_buffer[:bytes_per_chunk]
+ self._audio_buffer = self._audio_buffer[bytes_per_chunk:]
+
+ # Convert bytes to int16 array
+ dtype = np.int16 if sample_width == 2 else np.int8
+ int16_array: np.ndarray = np.frombuffer(chunk_bytes, dtype=dtype).astype(np.int16)
+
+ # Convert int16 to float32 in range [-1, 1]
+ float32_array: np.ndarray = int16_array.astype(np.float32) / 32768.0
+
+ try:
+ # Process the chunk and add probability to rolling window
+ probability = self.process_chunk(float32_array)
+ self._prediction_window.append(probability)
+
+ except Exception as e:
+ logger.error(f"Error processing VAD chunk: {e}")
+
+ # After processing all chunks, calculate weighted average from window
+ if len(self._prediction_window) > 0:
+ # Calculate weighted average (most recent predictions have higher weight)
+ weights = np.arange(1, len(self._prediction_window) + 1, dtype=np.float32)
+ weighted_avg = np.average(list(self._prediction_window), weights=weights)
+
+ # Determine speech state from weighted average
+ is_speech = bool(weighted_avg >= self._threshold)
+
+ # Check if state changed
+ state_changed = self._last_is_speech != is_speech
+
+ # Emit callback if state changed
+ if state_changed and self._on_state_change:
+ # Calculate transition duration (window duration)
+ transition_duration = len(self._prediction_window) * SILERO_CHUNK_DURATION_MS
+
+ # Determine if speech ended
+ speech_ended = self._last_is_speech and not is_speech
+
+ # VAD result
+ result = SileroVADResult(
+ is_speech=is_speech,
+ probability=round(float(weighted_avg), 3),
+ transition_duration_ms=transition_duration,
+ speech_ended=speech_ended,
+ )
+
+ # Trigger callback
+ self._on_state_change(result)
+
+ # Update state after emitting
+ self._last_is_speech = is_speech
+
+ def reset(self) -> None:
+ """Reset the VAD state and clear audio buffer."""
+ if self._is_initialized:
+ self._init_states()
+ self._audio_buffer = b""
+ self._prediction_window.clear()
+ self._last_is_speech = False
+
+ @staticmethod
+ def download_model() -> None:
+ """Download the ONNX model.
+
+ This will check if the model has been downloaded and is available in the
+ location specified by the SILERO_MODEL_PATH environment variable.
+
+ If not, it will download the model from GitHub.
+ """
+
+ # Check if model file exists
+ if SileroVAD.model_exists():
+ return
+
+ # Check the URL for valid schemes
+ parsed_url = urlparse(SILERO_MODEL_URL)
+ if parsed_url.scheme not in ("http", "https"):
+ logger.error(f"Invalid URL scheme: {parsed_url.scheme}")
+ return
+
+ # Report to the user
+ logger.warning("Silero VAD model not found. Downloading from GitHub...")
+
+ # Create the directory
+ os.makedirs(os.path.dirname(SILERO_MODEL_PATH), exist_ok=True)
+
+ # Download
+ urllib.request.urlretrieve(SILERO_MODEL_URL, SILERO_MODEL_PATH) # nosec B310
+
+ @staticmethod
+ def model_exists() -> bool:
+ """Check the model has been downloaded.
+
+ Returns:
+ True if the model file exists, False otherwise.
+ """
+ return os.path.exists(SILERO_MODEL_PATH)
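A minimal sketch of driving the new detector end to end, assuming the `speechmatics-voice[smart]` extra is installed so onnxruntime is available (the model is downloaded on first use, and the callback only fires when the speech/silence state flips):

```python
import asyncio

from speechmatics.voice._vad import SileroVAD
from speechmatics.voice._vad import SileroVADResult


def on_change(result: SileroVADResult) -> None:
    # Invoked once per speech <-> silence transition
    print("speech" if result.is_speech else "silence", result.probability)


async def main() -> None:
    vad = SileroVAD(threshold=0.5, silence_duration=0.1, on_state_change=on_change)

    # Feed 16 kHz int16 PCM; incomplete 512-sample chunks are buffered internally.
    # Pure silence will not flip the state, so nothing prints for this input.
    silence = b"\x00\x00" * 512 * 4
    await vad.process_audio(silence, sample_rate=16000, sample_width=2)


asyncio.run(main())
```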
diff --git a/tests/voice/_utils.py b/tests/voice/_utils.py
index c663e995..8308e905 100644
--- a/tests/voice/_utils.py
+++ b/tests/voice/_utils.py
@@ -1,4 +1,5 @@
import asyncio
+import datetime
import json
import os
import time
@@ -85,35 +86,42 @@ async def send_audio_file(
# Delay is based off 16kHz int16 and chunk size
delay = chunk_size / sample_rate / sample_size
- # Load the file
- async with aiofiles.open(file, "rb") as wav_file:
- # Trim off the WAV file header
- await wav_file.seek(44)
+ # Catch errors - we can be lazy as this is only for testing
+ try:
- # Send audio data
- next_time = time.perf_counter() + delay
- while not terminate_event.is_set() if terminate_event else True:
- """Reads all chunks until the end of the file with precision delay."""
+ # Load the file
+ async with aiofiles.open(file, "rb") as wav_file:
+ # Trim off the WAV file header
+ await wav_file.seek(44)
- # Read chunk
- chunk = await wav_file.read(chunk_size)
+ # Send audio data
+ next_time = time.perf_counter() + delay
+ while not terminate_event.is_set() if terminate_event else True:
+ """Reads all chunks until the end of the file with precision delay."""
- # End of file
- if not chunk:
- break
+ # Read chunk
+ chunk = await wav_file.read(chunk_size)
- # Send audio to client
- await client.send_audio(chunk)
+ # End of file
+ if not chunk:
+ break
- # Do any callbacks
- if progress_callback:
- progress_callback(len(chunk))
+ # Send audio to client
+ await client.send_audio(chunk)
- # Precision delay
- sleep_time = next_time - time.perf_counter()
- if sleep_time > 0:
- await asyncio.sleep(sleep_time)
- next_time += delay
+ # Do any callbacks
+ if progress_callback:
+ progress_callback(len(chunk))
+
+ # Precision delay
+ sleep_time = next_time - time.perf_counter()
+ if sleep_time > 0:
+ await asyncio.sleep(sleep_time)
+ next_time += delay
+
+ # Catch errors
+ except Exception:
+ pass
async def load_audio_file(audio_file: str) -> bytes:
@@ -165,23 +173,50 @@ async def send_silence(
# Iterations required
iterations = int(duration / delay)
- # Keep sending
- while (not terminate_event.is_set() if terminate_event else True) and iterations > 0:
- # Send audio to client
- await client.send_audio(silence)
+ # Catch errors - we can be lazy as this is only for testing
+ try:
+
+ # Keep sending
+ while (not terminate_event.is_set() if terminate_event else True) and iterations > 0:
+ # Send audio to client
+ await client.send_audio(silence)
+
+ # Do any callbacks
+ if progress_callback:
+ progress_callback(len(silence))
+
+ # Precision delay
+ sleep_time = next_time - time.perf_counter()
+ if sleep_time > 0:
+ await asyncio.sleep(sleep_time)
+ next_time += delay
+
+ # Reduce iterations
+ iterations -= 1
+
+ # Catch errors - we can be lazy as this is only for testing
+ except Exception:
+ pass
+
+
+def log_client_messages(client: VoiceAgentClient, messages: Optional[list[AgentServerMessageType]] = None) -> None:
+ """Register and log client messages."""
+
+ # Start time
+ start_time = datetime.datetime.now()
- # Do any callbacks
- if progress_callback:
- progress_callback(len(silence))
+ # Callback for each message
+ def _log_message(message):
+ ts = (datetime.datetime.now() - start_time).total_seconds()
+ print(json.dumps({"ts": round(ts, 3), "payload": message}))
- # Precision delay
- sleep_time = next_time - time.perf_counter()
- if sleep_time > 0:
- await asyncio.sleep(sleep_time)
- next_time += delay
+ # Default to all agent messages, apart from AUDIO_ADDED
+ if messages is None:
+ messages = [message for message in AgentServerMessageType if message != AgentServerMessageType.AUDIO_ADDED]
- # Reduce iterations
- iterations -= 1
+ # Add listeners
+ for message_type in messages:
+ client.on(message_type, _log_message)
class ConversationLog:
diff --git a/tests/voice/assets/audio_03_16kHz.wav b/tests/voice/assets/audio_03_16kHz.wav
index 0c9b1793..2442d8e4 100644
--- a/tests/voice/assets/audio_03_16kHz.wav
+++ b/tests/voice/assets/audio_03_16kHz.wav
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:eadc1c0609e13027d0bb7bfe6bf7868c123cbd17421898e6023b1889f103cf17
-size 58460
+oid sha256:ef18686db712ccb8d7714e86358f64490da3eaa6ff7ed6e090070169d87b6ed2
+size 162670
diff --git a/tests/voice/test_03_conversation.py b/tests/voice/test_03_conversation.py
index aa2398bb..6adc6ad2 100644
--- a/tests/voice/test_03_conversation.py
+++ b/tests/voice/test_03_conversation.py
@@ -90,7 +90,7 @@ def log_message(message):
print()
print("---")
log_message({"message": "AudioFile", "path": audio_file})
- log_message({"message": "VoiceAgentConfig", **client._config.model_dump()})
+ log_message({"message": "VoiceAgentConfig", **client._config.to_dict()})
log_message({"message": "TranscriptionConfig", **client._transcription_config.to_dict()})
log_message({"message": "AudioFormat", **client._audio_format.to_dict()})
diff --git a/tests/voice/test_04_models.py b/tests/voice/test_04_models.py
index 6e3af3d0..04c698eb 100644
--- a/tests/voice/test_04_models.py
+++ b/tests/voice/test_04_models.py
@@ -4,9 +4,11 @@
from speechmatics.voice import VoiceAgentConfig
from speechmatics.voice._models import AdditionalVocabEntry
+from speechmatics.voice._models import AgentServerMessageType
from speechmatics.voice._models import AnnotationFlags
from speechmatics.voice._models import AnnotationResult
from speechmatics.voice._models import OperatingPoint
+from speechmatics.voice._models import SessionMetricsMessage
from speechmatics.voice._models import SpeakerFocusConfig
from speechmatics.voice._models import SpeakerFocusMode
from speechmatics.voice._models import SpeakerIdentifier
@@ -28,7 +30,7 @@ async def test_voice_agent_config():
)
# Test JSON serialisation
- config_dict = config.model_dump()
+ config_dict = config.to_dict()
assert config_dict["language"] == "en"
assert config_dict["max_delay"] == 1.5
assert config_dict["enable_diarization"] is True
@@ -121,19 +123,19 @@ async def test_additional_vocab_entry():
entry = AdditionalVocabEntry(content="hello", sounds_like=["helo", "hallo"])
# Test JSON serialisation
- json_data = entry.model_dump()
- assert json_data["content"] == "hello"
- assert json_data["sounds_like"] == ["helo", "hallo"]
+ json_dict = entry.to_dict()
+ assert json_dict["content"] == "hello"
+ assert json_dict["sounds_like"] == ["helo", "hallo"]
# Test JSON deserialisation
- entry_from_json = AdditionalVocabEntry.model_validate(json_data)
+ entry_from_json = AdditionalVocabEntry.from_dict(json_dict)
assert entry_from_json.content == entry.content
assert entry_from_json.sounds_like == entry.sounds_like
# Test with defaults
entry_minimal = AdditionalVocabEntry(content="test")
- json_minimal = entry_minimal.model_dump()
- assert json_minimal["sounds_like"] == []
+ json_minimal = entry_minimal.to_dict()
+ assert "sounds_like" not in json_minimal
@pytest.mark.asyncio
@@ -153,23 +155,21 @@ async def test_speaker_focus_config():
)
# Test JSON serialisation
- json_data = config.model_dump()
- assert json_data["focus_speakers"] == ["S1", "S2"]
- assert json_data["ignore_speakers"] == ["__ASSISTANT__", "__SYSTEM__"]
- assert json_data["focus_mode"] == SpeakerFocusMode.IGNORE
+ json_dict = config.to_dict()
+ assert json_dict["focus_speakers"] == ["S1", "S2"]
+ assert json_dict["ignore_speakers"] == ["__ASSISTANT__", "__SYSTEM__"]
+ assert json_dict["focus_mode"] == SpeakerFocusMode.IGNORE
# Test JSON deserialisation
- config_from_json = SpeakerFocusConfig.model_validate(json_data)
+ config_from_json = SpeakerFocusConfig.from_dict(json_dict)
assert config_from_json.focus_speakers == config.focus_speakers
assert config_from_json.ignore_speakers == config.ignore_speakers
assert config_from_json.focus_mode == config.focus_mode
# Test with defaults
config_default = SpeakerFocusConfig()
- json_default = config_default.model_dump()
- assert json_default["focus_speakers"] == []
- assert json_default["ignore_speakers"] == []
- assert json_default["focus_mode"] == SpeakerFocusMode.RETAIN
+ json_default = config_default.to_json(exclude_none=False)
+ assert json_default == '{"focus_speakers":[],"ignore_speakers":[],"focus_mode":"retain"}'
@pytest.mark.asyncio
@@ -198,7 +198,7 @@ async def test_speech_fragment():
)
# Test JSON serialisation
- json_data = fragment.model_dump()
+ json_data = fragment.to_dict()
assert json_data["idx"] == 1
assert json_data["start_time"] == 0.5
assert json_data["end_time"] == 1.2
@@ -237,7 +237,7 @@ async def test_speaker_segment():
)
# Test model_dump() default behavior (should exclude fragments by default)
- json_data = segment.model_dump()
+ json_data = segment.to_dict()
assert json_data["speaker_id"] == "S1"
assert json_data["is_active"] is True
assert json_data["timestamp"] == "2025-01-01T12:00:00.500"
@@ -247,9 +247,38 @@ async def test_speaker_segment():
assert isinstance(json_data["annotation"], list)
# Test model_dump with include_results=True
- dict_data_results = segment.model_dump(include_results=True)
+ dict_data_results = segment.to_dict(include_results=True)
assert dict_data_results["speaker_id"] == "S1"
assert dict_data_results["text"] == "Hello world"
assert "results" in dict_data_results
assert "fragments" not in dict_data_results
assert len(dict_data_results["results"]) == 2
+
+
+@pytest.mark.asyncio
+async def test_event_messages():
+ """Test event messages."""
+
+ # Create a new event message
+ event_message = SessionMetricsMessage(
+ total_time=1.0,
+ total_time_str="00:00:01",
+ total_bytes=1024,
+ processing_time=0.5,
+ )
+
+ # Test dict
+ dict_data = event_message.to_dict()
+ assert dict_data["message"] == AgentServerMessageType.SESSION_METRICS
+ assert dict_data["message"] == "SessionMetrics"
+ assert dict_data["total_time"] == 1.0
+ assert dict_data["total_time_str"] == "00:00:01"
+ assert dict_data["total_bytes"] == 1024
+ assert dict_data["processing_time"] == 0.5
+
+ # Test JSON
+ json_data = event_message.to_json()
+ assert (
+ json_data
+ == '{"message":"SessionMetrics","total_time":1.0,"total_time_str":"00:00:01","total_bytes":1024,"processing_time":0.5}'
+ )
diff --git a/tests/voice/test_05_utterance.py b/tests/voice/test_05_utterance.py
index cf30453d..9c3c6604 100644
--- a/tests/voice/test_05_utterance.py
+++ b/tests/voice/test_05_utterance.py
@@ -7,8 +7,10 @@
import pytest
from _utils import ConversationLog
from _utils import get_client
+from _utils import log_client_messages
from speechmatics.voice import AgentServerMessageType
+from speechmatics.voice import EndOfTurnConfig
from speechmatics.voice import EndOfUtteranceMode
from speechmatics.voice import SpeechSegmentConfig
from speechmatics.voice import VoiceAgentConfig
@@ -35,12 +37,19 @@ async def test_speech_fragments():
start_time = datetime.datetime.now()
# Create a client
- client = await get_client(api_key="NONE", connect=False)
+ client = await get_client(
+ api_key="NONE",
+ connect=False,
+ )
assert client is not None
# Start the queue
client._start_stt_queue()
+ # Log messages
+ if SHOW_LOG:
+ log_client_messages(client)
+
# Event to wait
event_rx: asyncio.Event = asyncio.Event()
last_message: Optional[dict[str, Any]] = None
@@ -76,7 +85,7 @@ async def send_message(idx: int, count: int = 1, use_ttl: bool = True):
# Add listener for first interim segment
message_reset()
- client.once(AgentServerMessageType.ADD_PARTIAL_SEGMENT, message_rx)
+ client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, message_rx)
# Inject first partial
await send_message(0, count=6, use_ttl=False)
@@ -99,22 +108,14 @@ async def send_message(idx: int, count: int = 1, use_ttl: bool = True):
assert seg0["text"] == "Welcome"
assert f"{seg0['speaker_id']}: {seg0['text']}" == "S1: Welcome"
- # Add listener for final segment
- message_reset()
- client.once(AgentServerMessageType.ADD_SEGMENT, message_rx)
-
# Send a more partials and finals
await send_message(5, count=8, use_ttl=False)
- # Wait for final segment
- try:
- await asyncio.wait_for(event_rx.wait(), timeout=5.0)
- assert last_message is not None
- except asyncio.TimeoutError:
- pytest.fail("ADD_SEGMENT event was not received within 5 seconds")
+ # Yield a short while
+ await asyncio.sleep(0.5)
# Check the right message was received
- assert last_message.get("message") == AgentServerMessageType.ADD_SEGMENT
+ assert last_message.get("message") == AgentServerMessageType.ADD_PARTIAL_SEGMENT
# Check the segment
segments = last_message.get("segments", [])
@@ -153,14 +154,9 @@ async def test_end_of_utterance_fixed():
)
assert client is not None
- # Debug
+ # Log messages
if SHOW_LOG:
- client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, lambda message: print(message))
- client.on(AgentServerMessageType.ADD_SEGMENT, lambda message: print(message))
- client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, lambda message: print(message))
- client.on(AgentServerMessageType.START_OF_TURN, lambda message: print(message))
- client.on(AgentServerMessageType.END_OF_TURN, lambda message: print(message))
- client.on(AgentServerMessageType.END_OF_UTTERANCE, lambda message: print(message))
+ log_client_messages(client)
# Start the queue
client._start_stt_queue()
@@ -234,7 +230,9 @@ async def test_external_vad():
api_key="NONE",
connect=False,
config=VoiceAgentConfig(
- end_of_utterance_silence_trigger=adaptive_timeout, end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL
+ end_of_utterance_silence_trigger=adaptive_timeout,
+ end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL,
+ end_of_turn_config=EndOfTurnConfig(use_forced_eou=False),
),
)
assert client is not None
@@ -269,6 +267,10 @@ async def send_message(idx: int, count: int = 1, use_ttl: bool = True):
# Emit the message
client.emit(message["payload"]["message"], message["payload"])
+ # Log messages
+ if SHOW_LOG:
+ log_client_messages(client)
+
# Inject conversation
await send_message(0, count=12, use_ttl=False)
@@ -333,10 +335,15 @@ async def test_end_of_utterance_adaptive_vad():
end_of_utterance_silence_trigger=adaptive_timeout,
end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
speech_segment_config=SpeechSegmentConfig(emit_sentences=False),
+ end_of_turn_config=EndOfTurnConfig(use_forced_eou=False),
),
)
assert client is not None
+ # Log messages
+ if SHOW_LOG:
+ log_client_messages(client)
+
# Start the queue
client._start_stt_queue()
@@ -385,14 +392,6 @@ async def send_message(idx: int, count: int = 1, use_ttl: bool = True):
# Add listener for end of turn
client.once(AgentServerMessageType.END_OF_TURN, eot_rx)
- # Debug
- if SHOW_LOG:
- client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, lambda message: print(message))
- client.on(AgentServerMessageType.ADD_SEGMENT, lambda message: print(message))
- client.on(AgentServerMessageType.START_OF_TURN, lambda message: print(message))
- client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, lambda message: print(message))
- client.on(AgentServerMessageType.END_OF_TURN, lambda message: print(message))
-
# Inject conversation up to the penultimate final from the STT
await send_message(0, count=12, use_ttl=True)
diff --git a/tests/voice/test_07_languages.py b/tests/voice/test_07_languages.py
index 759e67b7..c83428d5 100644
--- a/tests/voice/test_07_languages.py
+++ b/tests/voice/test_07_languages.py
@@ -1,4 +1,3 @@
-import asyncio
import datetime
import json
import os
@@ -9,11 +8,15 @@
import pytest
from _utils import get_client
+from _utils import log_client_messages
from _utils import send_audio_file
+from _utils import send_silence
from speechmatics.voice import AdditionalVocabEntry
from speechmatics.voice import AgentServerMessageType
+from speechmatics.voice import EndOfTurnConfig
from speechmatics.voice import EndOfUtteranceMode
+from speechmatics.voice import SpeechSegmentConfig
from speechmatics.voice import VoiceAgentConfig
from speechmatics.voice._utils import TextUtils
@@ -23,6 +26,7 @@
# Constants
API_KEY = os.getenv("SPEECHMATICS_API_KEY")
URL = "wss://eu2.rt.speechmatics.com/v2"
+SHOW_LOG = os.getenv("SPEECHMATICS_SHOW_LOG", "0").lower() in ["1", "true"]
@dataclass
@@ -116,9 +120,14 @@ async def test_transcribe_languages(sample: AudioSample):
connect=False,
config=VoiceAgentConfig(
max_delay=1.2,
- end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL,
+ end_of_utterance_mode=EndOfUtteranceMode.FIXED,
+ end_of_utterance_silence_trigger=1.2,
language=sample.language,
additional_vocab=[AdditionalVocabEntry(content=vocab) for vocab in sample.vocab],
+ end_of_turn_config=EndOfTurnConfig(use_forced_eou=False),
+ speech_segment_config=SpeechSegmentConfig(
+ emit_sentences=False,
+ ),
),
)
assert client is not None
@@ -134,6 +143,10 @@ async def test_transcribe_languages(sample: AudioSample):
# Start time
start_time = datetime.datetime.now()
+ # Log messages
+ if SHOW_LOG:
+ log_client_messages(client)
+
# Bytes logger
def log_bytes_sent(bytes):
nonlocal bytes_sent
@@ -169,10 +182,8 @@ def log_segment(message):
# Individual payloads
await send_audio_file(client, audio_file, progress_callback=log_bytes_sent)
- # Send finalize
- await asyncio.sleep(1.5)
- client.finalize()
- await asyncio.sleep(1.5)
+ # Send some audio silence
+ await send_silence(client, 4.0)
# Extract the last message
assert last_message.get("message") == AgentServerMessageType.ADD_SEGMENT
diff --git a/tests/voice/test_08_multiple_speakers.py b/tests/voice/test_08_multiple_speakers.py
index adbebd52..fa662aa5 100644
--- a/tests/voice/test_08_multiple_speakers.py
+++ b/tests/voice/test_08_multiple_speakers.py
@@ -8,7 +8,6 @@
import pytest
from _utils import get_client
from _utils import send_audio_file
-from pydantic import BaseModel
from speechmatics.voice import AdditionalVocabEntry
from speechmatics.voice import AgentServerMessageType
@@ -17,6 +16,7 @@
from speechmatics.voice import SpeakerFocusMode
from speechmatics.voice import SpeechSegmentConfig
from speechmatics.voice import VoiceAgentConfig
+from speechmatics.voice._models import BaseModel
from speechmatics.voice._models import SpeakerSegment
# Skip for CI testing
@@ -167,8 +167,8 @@ def log_final_segment(message):
print()
print()
print("---")
- log_message({"message": "Sample", **sample.model_dump()})
- log_message({"message": "VoiceAgentConfig", **client._config.model_dump()})
+ log_message({"message": "Sample", **sample.to_dict()})
+ log_message({"message": "VoiceAgentConfig", **client._config.to_dict()})
log_message({"message": "TranscriptionConfig", **client._transcription_config.to_dict()})
log_message({"message": "AudioFormat", **client._audio_format.to_dict()})
diff --git a/tests/voice/test_09_speaker_id.py b/tests/voice/test_09_speaker_id.py
index 95929848..6e8dc0bc 100644
--- a/tests/voice/test_09_speaker_id.py
+++ b/tests/voice/test_09_speaker_id.py
@@ -11,6 +11,7 @@
from speechmatics.rt import ClientMessageType
from speechmatics.voice import AdditionalVocabEntry
from speechmatics.voice import AgentServerMessageType
+from speechmatics.voice import EndOfTurnConfig
from speechmatics.voice import EndOfUtteranceMode
from speechmatics.voice import SpeakerIdentifier
from speechmatics.voice import SpeechSegmentConfig
@@ -58,6 +59,7 @@ async def test_extract_speaker_ids():
additional_vocab=[
AdditionalVocabEntry(content="GeoRouter"),
],
+ end_of_turn_config=EndOfTurnConfig(use_forced_eou=False),
),
)
@@ -115,7 +117,7 @@ def save_speakers_result(message):
print()
print()
print("---")
- log_message({"message": "VoiceAgentConfig", **client._config.model_dump()})
+ log_message({"message": "VoiceAgentConfig", **client._config.to_dict()})
log_message({"message": "TranscriptionConfig", **client._transcription_config.to_dict()})
log_message({"message": "AudioFormat", **client._audio_format.to_dict()})
@@ -190,6 +192,7 @@ async def test_known_speakers():
additional_vocab=[
AdditionalVocabEntry(content="GeoRouter"),
],
+ end_of_turn_config=EndOfTurnConfig(use_forced_eou=False),
),
)
@@ -267,6 +270,7 @@ async def test_ignoring_assistant():
additional_vocab=[
AdditionalVocabEntry(content="GeoRouter"),
],
+ end_of_turn_config=EndOfTurnConfig(use_forced_eou=False),
),
)
diff --git a/tests/voice/test_10_finalize.py b/tests/voice/test_10_finalize.py
index 247d46f4..0abaed26 100644
--- a/tests/voice/test_10_finalize.py
+++ b/tests/voice/test_10_finalize.py
@@ -43,7 +43,6 @@ async def test_finalize():
end_of_utterance_silence_trigger=0.7,
max_delay=1.2,
end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL,
- use_forced_eou_message=True,
),
)
@@ -93,7 +92,7 @@ def eot_received_callback(message):
print()
print()
print("---")
- log_message({"message": "VoiceAgentConfig", **client._config.model_dump()})
+ log_message({"message": "VoiceAgentConfig", **client._config.to_dict()})
log_message({"message": "TranscriptionConfig", **client._transcription_config.to_dict()})
log_message({"message": "AudioFormat", **client._audio_format.to_dict()})
diff --git a/tests/voice/test_11_audio_buffer.py b/tests/voice/test_11_audio_buffer.py
index 63f2fdc4..a10834e9 100644
--- a/tests/voice/test_11_audio_buffer.py
+++ b/tests/voice/test_11_audio_buffer.py
@@ -14,6 +14,7 @@
from speechmatics.voice import AdditionalVocabEntry
from speechmatics.voice import AgentServerMessageType
+from speechmatics.voice import EndOfTurnConfig
from speechmatics.voice import EndOfUtteranceMode
from speechmatics.voice import SmartTurnConfig
from speechmatics.voice import VoiceAgentConfig
@@ -54,9 +55,15 @@ async def test_buffer():
assert buffer.total_time == 0.0
assert buffer.size == 0
+ # Create 20 seconds worth of random bytes
+ random_data = bytes(random.getrandbits(8) for _ in range(int(20.0 * sample_rate * sample_width)))
+ assert len(random_data) == int(20.0 * sample_rate * sample_width)
+
# Add in 20 seconds of data
- for _ in range(int(20.0 * sample_rate / frame_size)):
- await buffer.put_frame(b"\x00" * frame_bytes)
+ for i in range(int(20.0 * sample_rate / frame_size)):
+ start_idx = (i * frame_bytes) % len(random_data)
+ frame_data = random_data[start_idx : start_idx + frame_bytes]
+ await buffer.put_frame(frame_data)
# Check values
assert buffer.total_frames == int(20.0 * sample_rate / frame_size)
@@ -64,7 +71,9 @@ async def test_buffer():
assert buffer.size == int(10.0 * sample_rate / frame_size)
# Check frame >< time conversion
- assert buffer._get_frame_from_time(buffer._get_time_from_frame(1234)) == 1234
+ tff = buffer._get_time_from_frame(1234)
+ tft = buffer._get_frame_from_time(tff)
+ assert tft == 1234
# Get data from more than 10 seconds ago
data = await buffer.get_frames(2.5, 7.5)
@@ -74,6 +83,11 @@ async def test_buffer():
data = await buffer.get_frames(12.5, 17.5)
assert len(data) == int(5.0 * sample_rate / frame_size) * frame_bytes
+ # Check the contents of the buffer
+ data = await buffer.get_frames(15.0, 20.0)
+ random_data_last_5_seconds = random_data[-int(5.0 * sample_rate * sample_width) :]
+ assert data == random_data_last_5_seconds
+
@pytest.mark.asyncio
async def test_buffer_bytes():
@@ -126,8 +140,8 @@ async def test_buffer_bytes():
# Extract data
data = await buffer.get_frames(start_time, end_time)
- # Test
- assert len(data) == int((end_time - start_time) * sample_rate / frame_size) * frame_bytes
+ # Test (two frames)
+ assert len(data) == int((end_time - start_time) * sample_rate / frame_size) * frame_bytes * 2
@pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping in CI")
@@ -248,7 +262,8 @@ async def save_slice(
additional_vocab=[
AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"]),
],
- smart_turn_config=SmartTurnConfig(audio_buffer_length=20.0),
+ smart_turn_config=SmartTurnConfig(enabled=True),
+ end_of_turn_config=EndOfTurnConfig(use_forced_eou=False),
),
)
@@ -353,7 +368,8 @@ async def save_slice(
additional_vocab=[
AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"]),
],
- smart_turn_config=SmartTurnConfig(audio_buffer_length=20.0),
+ smart_turn_config=SmartTurnConfig(enabled=True),
+ end_of_turn_config=EndOfTurnConfig(use_forced_eou=False),
),
)
diff --git a/tests/voice/test_12_smart_turn_with_files.py b/tests/voice/test_12_smart_turn_with_files.py
index 553b6ab2..7b90ec49 100644
--- a/tests/voice/test_12_smart_turn_with_files.py
+++ b/tests/voice/test_12_smart_turn_with_files.py
@@ -28,7 +28,6 @@ class PredictionTest(BaseModel):
language="en",
expected=SmartTurnPredictionResult(
prediction=False,
- probability=0.095,
),
),
PredictionTest(
@@ -37,7 +36,6 @@ class PredictionTest(BaseModel):
language="en",
expected=SmartTurnPredictionResult(
prediction=False,
- probability=0.011,
),
),
PredictionTest(
@@ -46,7 +44,6 @@ class PredictionTest(BaseModel):
language="en",
expected=SmartTurnPredictionResult(
prediction=True,
- probability=0.892,
),
),
]
@@ -74,14 +71,8 @@ async def test_prediction(sample: PredictionTest):
# Run an inference
result = await detector.predict(bytes_array, language=sample.language, sample_rate=16000, sample_width=2)
- # Processing time < 100ms
- assert result.processing_time < 0.1
+ # Processing time < 200ms
+ assert result.processing_time < 0.2
# Check result
assert result.prediction == sample.expected.prediction
-
- # Prediction within 5% of expected
- assert (
- result.probability >= sample.expected.probability - 0.05
- and result.probability <= sample.expected.probability + 0.05
- )
diff --git a/tests/voice/test_13_smart_turn_transcribe.py b/tests/voice/test_13_smart_turn_transcribe.py
index 9283c33e..d7ec6b65 100644
--- a/tests/voice/test_13_smart_turn_transcribe.py
+++ b/tests/voice/test_13_smart_turn_transcribe.py
@@ -12,6 +12,7 @@
from speechmatics.voice import AdditionalVocabEntry
from speechmatics.voice import AgentServerMessageType
from speechmatics.voice import EndOfUtteranceMode
+from speechmatics.voice import SmartTurnConfig
from speechmatics.voice import SpeechSegmentConfig
from speechmatics.voice import VoiceAgentConfig
from speechmatics.voice._smart_turn import SmartTurnDetector
@@ -94,13 +95,13 @@ async def test_prediction(sample: TranscriptionTest):
connect=False,
config=VoiceAgentConfig(
max_delay=0.7,
- end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN,
+ end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
end_of_utterance_silence_trigger=0.5,
enable_diarization=True,
sample_rate=sample.sample_rate,
additional_vocab=sample.additional_vocab,
- use_forced_eou_message=True,
speech_segment_config=SpeechSegmentConfig(emit_sentences=False),
+ smart_turn_config=SmartTurnConfig(enabled=True),
),
)
diff --git a/tests/voice/test_14_presets.py b/tests/voice/test_14_presets.py
index 16693925..a5cc898f 100644
--- a/tests/voice/test_14_presets.py
+++ b/tests/voice/test_14_presets.py
@@ -11,20 +11,18 @@ async def test_presets():
"""Test VoiceAgentConfigPreset presets."""
# Create a preset
- preset: VoiceAgentConfig = VoiceAgentConfigPreset.LOW_LATENCY()
+ preset: VoiceAgentConfig = VoiceAgentConfigPreset.FAST()
assert preset is not None
- assert preset.speech_segment_config.emit_sentences is True
+ assert preset.speech_segment_config.emit_sentences is False
# Overlay #1
- preset: VoiceAgentConfig = VoiceAgentConfigPreset.LOW_LATENCY(
- VoiceAgentConfig(max_delay=12.34, enable_diarization=False)
- )
+ preset: VoiceAgentConfig = VoiceAgentConfigPreset.FAST(VoiceAgentConfig(max_delay=12.34, enable_diarization=False))
assert preset is not None
assert preset.max_delay == 12.34
assert preset.enable_diarization is False
# Overlay #2
- preset: VoiceAgentConfig = VoiceAgentConfigPreset.LOW_LATENCY(
+ preset: VoiceAgentConfig = VoiceAgentConfigPreset.FAST(
VoiceAgentConfig(speech_segment_config=SpeechSegmentConfig(emit_sentences=False))
)
assert preset is not None
@@ -33,10 +31,10 @@ async def test_presets():
# Preset names
presets = VoiceAgentConfigPreset.list_presets()
- assert "low_latency" in presets
+ assert "fast" in presets
# Get a preset by a name
- preset: VoiceAgentConfig = VoiceAgentConfigPreset.load("low_latency")
+ preset: VoiceAgentConfig = VoiceAgentConfigPreset.load("fast")
assert preset is not None
@@ -45,7 +43,7 @@ async def test_json_presets():
"""Test VoiceAgentConfigPreset JSON presets."""
# With a JSON string overlay
- preset: VoiceAgentConfig = VoiceAgentConfigPreset.load("low_latency", '{"operating_point": "enhanced"}')
+ preset: VoiceAgentConfig = VoiceAgentConfigPreset.load("fast", '{"operating_point": "enhanced"}')
assert preset is not None
assert preset.operating_point == OperatingPoint.ENHANCED
@@ -55,4 +53,4 @@ async def test_json_presets():
# Check with invalid overlay
with pytest.raises(ValueError):
- VoiceAgentConfigPreset.load("low_latency", '{"invalid": "value"}')
+ VoiceAgentConfigPreset.load("fast", '{"invalid": "value"}')