livekit · chenghao-mou · Apr 10, 2026 · Apr 10, 2026
diff --git a/livekit-agents/livekit/agents/inference/_ws.py b/livekit-agents/livekit/agents/inference/_ws.py
@@ -0,0 +1,209 @@
+from __future__ import annotations
+
+import asyncio
+from collections.abc import AsyncIterator
+from types import TracebackType
+from typing import overload
+
+import aiohttp
+
+from .._exceptions import (
+    APIConnectionError,
+    APIStatusError,
+    APITimeoutError,
+    create_api_error_from_http,
+)
+from ..log import logger
+from ._utils import create_access_token, get_inference_headers
+
+# Message types that indicate the WebSocket is closed or in the process of closing.
+_WS_CLOSE_TYPES = frozenset(
+    {
+        aiohttp.WSMsgType.CLOSED,
+        aiohttp.WSMsgType.CLOSE,
+        aiohttp.WSMsgType.CLOSING,
+    }
+)
+
+# Frame type that carries each payload type accepted by ``InferenceWebSocket.recv``.
+_PAYLOAD_TO_WS_TYPE: dict[type[str] | type[bytes], aiohttp.WSMsgType] = {
+    str: aiohttp.WSMsgType.TEXT,
+    bytes: aiohttp.WSMsgType.BINARY,
+}
+
+
+class InferenceWebSocket:
+    """Context manager that connects to a LiveKit inference WebSocket endpoint.
+
+    Handles URL scheme conversion (http->ws), authentication, connection timeout,
+    error wrapping, and provides recv iterators with automatic close detection.
+
+    Usage::
+
+        async with InferenceWebSocket(
+            session=http_session,
+            base_url="https://agent-gateway.livekit.cloud/v1",
+            path="/stt?model=deepgram/nova-3",
+            api_key=api_key,
+            api_secret=api_secret,
+            timeout=10.0,
+        ) as iws:
+            await iws.send(session_create_json)
+            ...
+    """
+
+    def __init__(
+        self,
+        *,
+        session: aiohttp.ClientSession,
+        base_url: str,
+        path: str,
+        api_key: str,
+        api_secret: str,
+        timeout: float,
+    ) -> None:
+        """Store the connection parameters; the socket is opened in ``__aenter__``.
+
+        Args:
+            session: HTTP session used to open the WebSocket.
+            base_url: Endpoint base URL; a leading ``http`` scheme is rewritten to ``ws``.
+            path: Path (and optional query string) appended verbatim to ``base_url``.
+            api_key: Key passed to ``create_access_token`` for the bearer token.
+            api_secret: Secret passed to ``create_access_token`` for the bearer token.
+            timeout: Maximum time, in seconds, allowed for the connection attempt.
+        """
+        self._session = session
+        self._base_url = base_url
+        self._path = path
+        self._api_key = api_key
+        self._api_secret = api_secret
+        self._timeout = timeout
+        self._ws: aiohttp.ClientWebSocketResponse | None = None
+        self._closing = False
+
+    async def __aenter__(self) -> InferenceWebSocket:
+        """Open the WebSocket and return ``self``.
+
+        Raises:
+            APIStatusError: The handshake was rejected with HTTP 429 (non-retryable).
+            APITimeoutError: The connection was not established within ``timeout``.
+            APIConnectionError: The connection failed at the transport level.
+
+        Any other HTTP handshake failure is converted by ``create_api_error_from_http``.
+        """
+        # Start every connection with a clean flag: a value left over from a previous
+        # use of this instance would make ``recv`` swallow an unexpected server close.
+        self._closing = False
+
+        base_url = self._base_url
+        if base_url.startswith(("http://", "https://")):
+            # Only the first occurrence is replaced: http -> ws, https -> wss.
+            base_url = base_url.replace("http", "ws", 1)
+
+        headers = {
+            **get_inference_headers(),
+            "Authorization": f"Bearer {create_access_token(self._api_key, self._api_secret)}",
+        }
+
+        try:
+            self._ws = await asyncio.wait_for(
+                self._session.ws_connect(f"{base_url}{self._path}", headers=headers),
+                self._timeout,
+            )
+        except aiohttp.ClientResponseError as e:
+            if e.status == 429:
+                raise APIStatusError(
+                    f"inference quota exceeded: {e.message}",
+                    status_code=e.status,
+                    retryable=False,
+                ) from e
+            raise create_api_error_from_http(e.message, status=e.status) from e
+        except asyncio.TimeoutError as e:
+            raise APITimeoutError("inference websocket connection timed out") from e
+        except aiohttp.ClientConnectionError as e:
+            # ClientConnectionError is a base class of ClientConnectorError, so this also
+            # wraps server disconnects and OS-level socket errors hit during the handshake.
+            raise APIConnectionError(
+                f"failed to connect to inference websocket at {self._path}"
+            ) from e
+
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        """Close the WebSocket if it is still open and drop the reference to it."""
+        if self._ws is not None and not self._ws.closed:
+            await self._ws.close()
+        self._ws = None
+
+    @property
+    def ws(self) -> aiohttp.ClientWebSocketResponse:
+        """The underlying aiohttp WebSocket; asserts that a connection is open."""
+        assert self._ws is not None, "InferenceWebSocket not connected"
+        return self._ws
+
+    @property
+    def closed(self) -> bool:
+        """``True`` when there is no connection or the connection has been closed."""
+        return self._ws is None or self._ws.closed
+
+    def mark_closing(self) -> None:
+        """Signal that a graceful close has been initiated by the caller.
+
+        After calling this, the recv iterators will return cleanly when the
+        server closes the WebSocket instead of raising ``APIStatusError``.
+        """
+        self._closing = True
+
+    @overload
+    async def send(self, data: str) -> None: ...
+
+    @overload
+    async def send(self, data: bytes) -> None: ...
+
+    async def send(self, data: str | bytes) -> None:
+        """Send ``data`` as a text frame when it is a ``str``, otherwise as a binary frame."""
+        if isinstance(data, str):
+            await self.ws.send_str(data)
+        else:
+            await self.ws.send_bytes(data)
+
+    @overload
+    def recv(self, payload_type: type[str] = ...) -> AsyncIterator[str]: ...
+
+    @overload
+    def recv(self, payload_type: type[bytes]) -> AsyncIterator[bytes]: ...
+
+    async def recv(self, payload_type: type[str] | type[bytes] = str) -> AsyncIterator[str | bytes]:
+        """Yield payloads from the WebSocket.
+
+        Args:
+            payload_type: ``str`` for text frames, ``bytes`` for binary frames.
+
+        Handles CLOSED/CLOSE/CLOSING detection: returns cleanly if
+        ``mark_closing()`` was called or the session is closed,
+        otherwise raises ``APIStatusError``.
+        """
+        expected_ws_type = _PAYLOAD_TO_WS_TYPE[payload_type]
+        ws = self.ws
+        while True:
+            msg = await ws.receive()
+            if msg.type in _WS_CLOSE_TYPES:
+                if self._closing or self._session.closed:
+                    return
+                raise APIStatusError(
+                    message="inference websocket connection closed unexpectedly",
+                    status_code=ws.close_code or -1,
+                    body=f"{msg.data=} {msg.extra=}",
+                )
+
+            if msg.type != expected_ws_type:
+                # Frames of any other type are logged and skipped, not treated as errors.
+                logger.warning("unexpected inference websocket message type %s", msg.type)
+                continue
+
+            yield msg.data
diff --git a/livekit-agents/livekit/agents/inference/interruption.py b/livekit-agents/livekit/agents/inference/interruption.py
@@ -49,8 +49,8 @@
     STAGING_INFERENCE_URL,
     create_access_token,
     get_default_inference_url,
-    get_inference_headers,
 )
+from ._ws import InferenceWebSocket
 
 SAMPLE_RATE = 16000
 THRESHOLD = 0.5
@@ -929,13 +929,35 @@ def update_options(
             self._opts.min_frames = math.ceil(min_interruption_duration * _FRAMES_PER_SECOND)
         self._reconnect_event.set()
 
+    def _build_session_create_message(self) -> str:  # session.create message as JSON text
+        settings = InterruptionWSSessionCreateSettings(
+            sample_rate=self._opts.sample_rate,
+            num_channels=1,  # always a single channel
+            threshold=self._opts.threshold,
+            min_frames=self._model._opts.min_frames,  # NOTE(review): not self._opts — confirm
+            encoding="s16le",  # presumably signed 16-bit little-endian PCM — TODO confirm
+        )
+        msg = InterruptionWSSessionCreateMessage(
+            type=InterruptionWSMessageType.SESSION_CREATE,
+            settings=settings,
+        )
+        return msg.model_dump_json()  # serialized here; the caller only has to send it
+
     async def _run(self) -> None:
-        closing_ws = False
+        async def wait_worker_tasks(tasks: list[asyncio.Task[None]]) -> None:
+            done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION)
+
+            for task in done:
+                task.result()
+
+            if pending:
+                done, _ = await asyncio.wait(pending)
+                for task in done:
+                    task.result()
 
         async def send_task(
-            ws: aiohttp.ClientWebSocketResponse, input_ch: aio.Chan[npt.NDArray[np.int16]]
+            iws: InferenceWebSocket, input_ch: aio.Chan[npt.NDArray[np.int16]]
         ) -> None:
-            nonlocal closing_ws
             timeout_ns = int(self._opts.inference_timeout * 1e9)
 
             async for audio_data in input_ch:
@@ -955,43 +977,21 @@ async def send_task(
                 await self._num_requests.increment()
                 created_at = perf_counter_ns()
                 header = struct.pack("<Q", created_at)  # 8 bytes
-                await ws.send_bytes(header + audio_data.tobytes())
+                await iws.send(header + audio_data.tobytes())
                 self._cache[created_at] = InterruptionCacheEntry(
                     created_at=created_at,
                     speech_input=audio_data,
                 )
 
-            closing_ws = True
-            msg = InterruptionWSSessionCloseMessage(
+            iws.mark_closing()
+            close_msg = InterruptionWSSessionCloseMessage(
                 type=InterruptionWSMessageType.SESSION_CLOSE,
             )
-            await ws.send_str(msg.model_dump_json())
-
-        async def recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
-            nonlocal closing_ws
-
-            while True:
-                ws_msg = await ws.receive()
-                if ws_msg.type in (
-                    aiohttp.WSMsgType.CLOSED,
-                    aiohttp.WSMsgType.CLOSE,
-                    aiohttp.WSMsgType.CLOSING,
-                ):
-                    if closing_ws or self._session.closed:
-                        return
-                    raise APIStatusError(
-                        message=f"LiveKit Adaptive Interruption connection closed unexpectedly: {ws_msg.data}",
-                        status_code=ws.close_code or -1,
-                        body=f"{ws_msg.data=} {ws_msg.extra=}",
-                    )
-
-                if ws_msg.type != aiohttp.WSMsgType.TEXT:
-                    logger.warning(
-                        "unexpected LiveKit Adaptive Interruption message type %s", ws_msg.type
-                    )
-                    continue
+            await iws.send(close_msg.model_dump_json())
 
-                data = json.loads(ws_msg.data)
+        async def recv_task(iws: InferenceWebSocket) -> None:
+            async for raw_data in iws.recv(str):
+                data = json.loads(raw_data)
                 msg: AnyInterruptionWSMessage = InterruptionWSMessage.validate_python(data)
 
                 match msg:
@@ -1068,24 +1068,34 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
                             data,
                         )
 
-        ws: aiohttp.ClientWebSocketResponse | None = None
-
         while True:
             data_ch = aio.Chan[npt.NDArray[np.int16]]()
-            try:
-                closing_ws = False
-                ws = await self._connect_ws()
+            async with InferenceWebSocket(
+                session=self._session,
+                base_url=self._opts.base_url,
+                path="/bargein",
+                api_key=self._opts.api_key,
+                api_secret=self._opts.api_secret,
+                timeout=self._conn_options.timeout,
+            ) as iws:
+                try:
+                    await iws.send(self._build_session_create_message())
+                except Exception as e:
+                    raise APIConnectionError(
+                        "failed to send session.create to adaptive interruption"
+                    ) from e
+
                 tasks = [
                     asyncio.create_task(self._forward_data(data_ch)),
-                    asyncio.create_task(send_task(ws, data_ch)),
-                    asyncio.create_task(recv_task(ws)),
+                    asyncio.create_task(send_task(iws, data_ch)),
+                    asyncio.create_task(recv_task(iws)),
                 ]
-                tasks_group = asyncio.gather(*tasks)
+                worker_tasks_done = asyncio.create_task(wait_worker_tasks(tasks))
                 wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait())
 
                 try:
                     done, _ = await asyncio.wait(
-                        (tasks_group, wait_reconnect_task),
+                        (worker_tasks_done, wait_reconnect_task),
                         return_when=asyncio.FIRST_COMPLETED,
                     )
 
@@ -1098,74 +1108,8 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
 
                     self._reconnect_event.clear()
                 finally:
-                    closing_ws = True
-                    if ws is not None and not ws.closed:
-                        await ws.close()
-                        ws = None
-                    await aio.gracefully_cancel(*tasks, wait_reconnect_task)
-                    tasks_group.cancel()
-                    try:
-                        tasks_group.exception()
-                    except asyncio.CancelledError:
-                        pass
-            finally:
-                closing_ws = True
-                if ws is not None and not ws.closed:
-                    await ws.close()
-
-    async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
-        """Connect to the LiveKit Adaptive Interruption WebSocket."""
-        settings = InterruptionWSSessionCreateSettings(
-            sample_rate=self._opts.sample_rate,
-            num_channels=1,
-            threshold=self._opts.threshold,
-            min_frames=self._model._opts.min_frames,
-            encoding="s16le",
-        )
-
-        base_url = self._opts.base_url
-        if base_url.startswith(("http://", "https://")):
-            base_url = base_url.replace("http", "ws", 1)
-        headers = {
-            **get_inference_headers(),
-            "Authorization": f"Bearer {create_access_token(self._opts.api_key, self._opts.api_secret)}",
-        }
-        try:
-            ws = await asyncio.wait_for(
-                self._session.ws_connect(f"{base_url}/bargein", headers=headers),
-                self._conn_options.timeout,
-            )
-        except (
-            aiohttp.ClientConnectorError,
-            asyncio.TimeoutError,
-            aiohttp.ClientResponseError,
-        ) as e:
-            if isinstance(e, aiohttp.ClientResponseError) and e.status == 429:
-                raise APIStatusError(
-                    "LiveKit Adaptive Interruption quota exceeded",
-                    status_code=e.status,
-                    retryable=False,
-                ) from e
-            elif isinstance(e, asyncio.TimeoutError):
-                raise APIConnectionError(
-                    "failed to connect to LiveKit Adaptive Interruption: timeout",
-                    retryable=False,
-                ) from e
-            raise APIConnectionError("failed to connect to LiveKit Adaptive Interruption") from e
-
-        try:
-            msg = InterruptionWSSessionCreateMessage(
-                type=InterruptionWSMessageType.SESSION_CREATE,
-                settings=settings,
-            )
-            await ws.send_str(msg.model_dump_json())
-        except Exception as e:
-            await ws.close()
-            raise APIConnectionError(
-                "failed to send session.create message to LiveKit Adaptive Interruption"
-            ) from e
-
-        return ws
+                    iws.mark_closing()
+                    await aio.gracefully_cancel(worker_tasks_done, *tasks, wait_reconnect_task)
 
 
 # endregion