From d132592ddb57a1f7c3d9ee35b94bf88679408048 Mon Sep 17 00:00:00 2001 From: Pranav Joshi Date: Fri, 12 Dec 2025 13:26:30 +0000 Subject: [PATCH 1/4] initial no-op s2s core implementation --- packages/api/src/hooks/useSpeechToSpeech.ts | 3 + packages/api/src/hooks/useVoiceActivities.ts | 11 + .../SpeechToSpeech/SpeechToSpeechComposer.tsx | 134 +++++++++ .../SpeechToSpeech/private/Context.ts | 14 + .../private/useAudioPlayer.spec.tsx | 279 ++++++++++++++++++ .../SpeechToSpeech/private/useAudioPlayer.ts | 69 +++++ .../SpeechToSpeech/private/useContext.ts | 15 + .../private/useRecorder.spec.tsx | 160 ++++++++++ .../SpeechToSpeech/private/useRecorder.ts | 128 ++++++++ .../SpeechToSpeech/types/SpeechState.ts | 1 + .../SpeechToSpeech/useSpeechToSpeech.ts | 6 + packages/core/src/index.ts | 2 + .../voiceActivity/isVoiceActivity.spec.ts | 88 ++++++ .../utils/voiceActivity/isVoiceActivity.ts | 14 + 14 files changed, 924 insertions(+) create mode 100644 packages/api/src/hooks/useSpeechToSpeech.ts create mode 100644 packages/api/src/hooks/useVoiceActivities.ts create mode 100644 packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx create mode 100644 packages/api/src/providers/SpeechToSpeech/private/Context.ts create mode 100644 packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx create mode 100644 packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts create mode 100644 packages/api/src/providers/SpeechToSpeech/private/useContext.ts create mode 100644 packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx create mode 100644 packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts create mode 100644 packages/api/src/providers/SpeechToSpeech/types/SpeechState.ts create mode 100644 packages/api/src/providers/SpeechToSpeech/useSpeechToSpeech.ts create mode 100644 packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts create mode 100644 packages/core/src/utils/voiceActivity/isVoiceActivity.ts diff --git a/packages/api/src/hooks/useSpeechToSpeech.ts b/packages/api/src/hooks/useSpeechToSpeech.ts new file mode 100644 index 0000000000..4f529a2c08 --- /dev/null +++ b/packages/api/src/hooks/useSpeechToSpeech.ts @@ -0,0 +1,3 @@ +import useSpeechToSpeech from '../providers/SpeechToSpeech/useSpeechToSpeech'; + +export default useSpeechToSpeech; diff --git a/packages/api/src/hooks/useVoiceActivities.ts b/packages/api/src/hooks/useVoiceActivities.ts new file mode 100644 index 0000000000..d65e142b17 --- /dev/null +++ b/packages/api/src/hooks/useVoiceActivities.ts @@ -0,0 +1,11 @@ +import { isVoiceActivity, type WebChatActivity } from 'botframework-webchat-core'; +import { useSelector } from './internal/WebChatReduxContext'; + +const activitiesSelector = (state: { activities: WebChatActivity[] }) => state.activities; + +const of = (predicate: (activity: WebChatActivity) => boolean) => (state: { activities: WebChatActivity[] }) => + activitiesSelector(state).filter(predicate); + +export default function useVoiceActivities(): [WebChatActivity[]] { + return [useSelector(of(activity => isVoiceActivity(activity)))]; +} diff --git a/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx new file mode 100644 index 0000000000..0ccf1a6f32 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx @@ -0,0 +1,134 @@ +import React, { useCallback, useEffect, useMemo, useRef, useState, type ReactNode } from 'react'; +import { 
isVoiceActivity, WebChatActivity } from 'botframework-webchat-core';
+import { useAudioPlayer } from './private/useAudioPlayer';
+import { useRecorder } from './private/useRecorder';
+import { useDebouncedNotifications, usePostActivity, useVoiceActivities } from '../../hooks';
+import SpeechToSpeechContext from './private/Context';
+import { SpeechState } from './types/SpeechState';
+
+export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }> = ({ children }) => {
+  const [voiceActivities] = useVoiceActivities();
+  const postActivity = usePostActivity();
+  const [{ connectivitystatus }] = useDebouncedNotifications();
+  const { playAudio, stopAudio, isPlaying } = useAudioPlayer();
+
+  const lastProcessedIndexRef = useRef(0);
+
+  // Remove when the activity protocol changes land; we would then get this as part of a signal activity.
+  const [speechState, setSpeechState] = useState<SpeechState>('idle');
+
+  const isConnected = useMemo(() => connectivitystatus?.message === 'connected', [connectivitystatus]);
+
+  const sendAudioChunk = useCallback(
+    (base64: string) => {
+      postActivity({
+        type: 'event',
+        name: 'stream.chunk',
+        value: { voiceLiveEvent: { type: 'input_audio_buffer.append', audio: base64 } }
+      } as any);
+    },
+    [postActivity]
+  );
+
+  const { recording, setRecording: baseSetRecording } = useRecorder(sendAudioChunk);
+
+  const cancelActiveResponse = useCallback(() => {
+    if (isPlaying) {
+      postActivity({
+        type: 'event',
+        value: { voiceLiveEvent: { type: 'response.cancel' } }
+      } as any);
+    }
+  }, [isPlaying, postActivity]);
+
+  const handleVoiceActivity = useCallback(
+    (activity: WebChatActivity) => {
+      if (!isVoiceActivity(activity)) {
+        return;
+      }
+
+      const { voiceLiveEvent } = activity.value;
+
+      switch (voiceLiveEvent.type) {
+        case 'input_audio_buffer.speech_started':
+          stopAudio();
+          setSpeechState('listening');
+          break;
+        case 'input_audio_buffer.speech_stopped':
+          setSpeechState('processing');
+          break;
+        case 'response.audio.delta':
+          if (voiceLiveEvent.delta && recording) {
+            playAudio(voiceLiveEvent.delta);
+          }
+          break;
+        case 'response.done':
+          if (!isPlaying) {
+            setSpeechState('listening');
+          }
+          break;
+        default:
+          break;
+      }
+    },
+    [isPlaying, playAudio, recording, stopAudio]
+  );
+
+  useEffect(() => {
+    const startIndex = lastProcessedIndexRef.current;
+
+    if (!voiceActivities.length || startIndex >= voiceActivities.length) {
+      return;
+    }
+
+    // If not recording, skip processing voice activities but update ref
+    // so next time we start recording, we only process new activities.
+    if (!recording) {
+      lastProcessedIndexRef.current = voiceActivities.length;
+      return;
+    }
+
+    for (let i = startIndex; i < voiceActivities.length; i++) {
+      // eslint-disable-next-line security/detect-object-injection
+      handleVoiceActivity(voiceActivities[i]);
+    }
+
+    if (isPlaying && speechState !== 'bot_speaking') {
+      setSpeechState('bot_speaking');
+    } else if (!isPlaying && speechState === 'bot_speaking') {
+      setSpeechState('listening');
+    }
+
+    lastProcessedIndexRef.current = voiceActivities.length;
+  }, [voiceActivities, recording, postActivity, isPlaying, playAudio, speechState, stopAudio, handleVoiceActivity]);
+
+  const setRecording = useCallback(
+    (shouldRecord: boolean) => {
+      if (!isConnected) {
+        return;
+      }
+
+      if (!recording) {
+        setSpeechState('listening');
+      } else {
+        stopAudio();
+        cancelActiveResponse();
+        setSpeechState('idle');
+      }
+
+      baseSetRecording(shouldRecord);
+    },
+    [isConnected, recording, baseSetRecording, stopAudio, cancelActiveResponse]
+  );
+
+  const contextValue = useMemo(
+    () => ({
+      recording,
+      setRecording,
+      speechState
+    }),
+    [recording, setRecording, speechState]
+  );
+
+  return <SpeechToSpeechContext.Provider value={contextValue}>{children}</SpeechToSpeechContext.Provider>;
+};
diff --git a/packages/api/src/providers/SpeechToSpeech/private/Context.ts b/packages/api/src/providers/SpeechToSpeech/private/Context.ts
new file mode 100644
index 0000000000..ce85310246
--- /dev/null
+++ b/packages/api/src/providers/SpeechToSpeech/private/Context.ts
@@ -0,0 +1,14 @@
+import { createContext } from 'react';
+import { SpeechState } from '../types/SpeechState';
+
+type SpeechToSpeechContextType = {
+  recording: boolean;
+  setRecording: (recording: boolean) => void;
+  speechState: SpeechState;
+};
+
+const SpeechToSpeechContext = createContext<SpeechToSpeechContextType>(undefined!);
+
+export default SpeechToSpeechContext;
+
+export type { SpeechToSpeechContextType };
diff --git a/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx
new file mode 100644
index 0000000000..8c1d42cb08
--- /dev/null
+++ b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx
@@ -0,0 +1,279 @@
+/** @jest-environment @happy-dom/jest-environment */
+///
+///
+
+import { render, type RenderResult } from '@testing-library/react';
+import React, { type ComponentType } from 'react';
+import { useAudioPlayer } from './useAudioPlayer';
+
+// Mock AudioContext and related APIs
+const mockAudioContext = {
+  sampleRate: 24000,
+  currentTime: 0,
+  destination: {},
+  state: 'running',
+  resume: jest.fn().mockResolvedValue(undefined),
+  close: jest.fn().mockResolvedValue(undefined),
+  createBuffer: jest.fn(),
+  createBufferSource: jest.fn()
+};
+
+const mockAudioBuffer = {
+  duration: 0.1, // 100ms
+  getChannelData: jest.fn().mockReturnValue(new Float32Array(2400))
+};
+
+const mockBufferSource = {
+  buffer: null,
+  connect: jest.fn(),
+  start: jest.fn(),
+  stop: jest.fn(),
+  disconnect: jest.fn(),
+  onended: null
+};
+
+// Mock global AudioContext
+global.AudioContext = jest.fn(() => mockAudioContext) as any;
+global.atob = jest.fn(str => str); // Simple mock for base64 decode
+
+type UseAudioPlayerReturn = ReturnType<typeof useAudioPlayer>;
+
+describe('setup', () => {
+  let HookApp: ComponentType;
+  let hookData: UseAudioPlayerReturn | undefined;
+  let renderResult: RenderResult;
+  const originalAudioContext = global.AudioContext;
+
+  beforeEach(() => {
+    jest.clearAllMocks();
+    mockAudioContext.currentTime = 0;
+    mockAudioContext.createBuffer.mockReturnValue(mockAudioBuffer);
+
mockAudioContext.createBufferSource.mockReturnValue(mockBufferSource); + mockBufferSource.buffer = null; + mockBufferSource.onended = null; + + HookApp = () => { + hookData = useAudioPlayer(); + return null; + }; + }); + + afterEach(() => { + global.AudioContext = originalAudioContext; + }); + + describe('Initialization', () => { + test('should initialize with correct default values', () => { + render(); + + expect(hookData?.isPlaying).toBe(false); + expect(typeof hookData?.playAudio).toBe('function'); + expect(typeof hookData?.stopAudio).toBe('function'); + }); + + test('should create AudioContext on first playAudio call', () => { + render(); + + hookData?.playAudio('dGVzdA=='); // base64 for 'test' + + expect(AudioContext).toHaveBeenCalledWith({ sampleRate: 24000 }); + }); + + test('should reuse existing AudioContext on subsequent calls', () => { + render(); + + hookData?.playAudio('dGVzdA=='); + hookData?.playAudio('dGVzdDI='); + + expect(AudioContext).toHaveBeenCalledTimes(1); + }); + }); + + describe('Audio playback', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should process base64 audio data correctly', () => { + hookData?.playAudio('dGVzdA=='); + + expect(global.atob).toHaveBeenCalledWith('dGVzdA=='); + expect(mockAudioContext.createBuffer).toHaveBeenCalledWith(1, expect.any(Number), 24000); + expect(mockAudioContext.createBufferSource).toHaveBeenCalled(); + }); + + test('should set up audio buffer source correctly', () => { + hookData?.playAudio('dGVzdA=='); + + expect(mockBufferSource.connect).toHaveBeenCalledWith(mockAudioContext.destination); + expect(mockBufferSource.start).toHaveBeenCalled(); + expect(mockBufferSource.buffer).toBe(mockAudioBuffer); + }); + + test('should resume AudioContext if needed', () => { + hookData?.playAudio('dGVzdA=='); + + expect(mockAudioContext.resume).toHaveBeenCalled(); + }); + + test('should queue multiple audio chunks correctly', () => { + mockAudioBuffer.duration = 0.1; // 100ms + + hookData?.playAudio('dGVzdA=='); + hookData?.playAudio('dGVzdDI='); + + expect(mockBufferSource.start).toHaveBeenCalledTimes(2); + // First chunk starts at currentTime (0), second at 0.1 + expect(mockBufferSource.start).toHaveBeenNthCalledWith(1, 0); + expect(mockBufferSource.start).toHaveBeenNthCalledWith(2, 0.1); + }); + }); + + describe('isPlaying state', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should return true when audio is queued for playback', () => { + mockAudioContext.currentTime = 0; + mockAudioBuffer.duration = 0.1; + + hookData?.playAudio('dGVzdA=='); + renderResult.rerender(); + + expect(hookData?.isPlaying).toBe(true); + }); + + test('should return false when no audio is queued', () => { + expect(hookData?.isPlaying).toBe(false); + }); + + test('should handle multiple chunks and playing state', () => { + mockAudioContext.currentTime = 0.05; // In the middle of first chunk + mockAudioBuffer.duration = 0.1; + + hookData?.playAudio('dGVzdA=='); // 0 - 0.1 + hookData?.playAudio('dGVzdDI='); // 0.1 - 0.2 + renderResult.rerender(); + + expect(hookData?.isPlaying).toBe(true); + }); + }); + + describe('Audio cleanup', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should clean up buffer source on ended', () => { + hookData?.playAudio('dGVzdA=='); + + // Simulate audio ended + if (mockBufferSource.onended) { + mockBufferSource.onended(); + } + + expect(mockBufferSource.disconnect).toHaveBeenCalled(); + expect(mockBufferSource.buffer).toBeNull(); + }); + + test('should stop 
all audio and close context', () => { + hookData?.playAudio('dGVzdA=='); + + hookData?.stopAudio(); + renderResult.rerender(); + + expect(mockAudioContext.close).toHaveBeenCalled(); + expect(hookData?.isPlaying).toBe(false); + }); + }); + + describe('Error handling', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should handle invalid base64 data gracefully', () => { + expect(() => { + hookData?.playAudio('invalid-base64!@#'); + }).not.toThrow(); + }); + + test('should handle AudioContext creation failure', () => { + global.AudioContext = jest.fn(() => { + throw new Error('AudioContext not supported'); + }) as any; + + expect(() => { + hookData?.playAudio('dGVzdA=='); + }).toThrow('AudioContext not supported'); + }); + + test('should handle missing audio context in isPlaying', () => { + // Before any audio is played, audioCtxRef should be null + expect(hookData?.isPlaying).toBe(false); + }); + }); + + describe('Real-world scenarios', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should handle streaming audio chunks', () => { + mockAudioBuffer.duration = 0.05; // 50ms chunks + + // Simulate streaming 5 chunks + for (let i = 0; i < 5; i++) { + hookData?.playAudio(`chunk${i}`); + } + + expect(mockBufferSource.start).toHaveBeenCalledTimes(5); + renderResult.rerender(); + expect(hookData?.isPlaying).toBe(true); + }); + + test('should handle playback interruption', () => { + hookData?.playAudio('dGVzdA=='); + renderResult.rerender(); + expect(hookData?.isPlaying).toBe(true); + + hookData?.stopAudio(); + renderResult.rerender(); + expect(hookData?.isPlaying).toBe(false); + expect(mockAudioContext.close).toHaveBeenCalled(); + }); + + test('should handle resume after stop', () => { + // Play, stop, then play again + hookData?.playAudio('dGVzdA=='); + hookData?.stopAudio(); + hookData?.playAudio('dGVzdDI='); + + expect(AudioContext).toHaveBeenCalledTimes(2); // New context after stop + }); + }); + + describe('Performance considerations', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should handle large audio data', () => { + const largeBase64 = 'A'.repeat(10000); + + expect(() => { + hookData?.playAudio(largeBase64); + }).not.toThrow(); + }); + + test('should handle rapid successive calls', () => { + for (let i = 0; i < 100; i++) { + // Ensure the mock "base64" data has an even length as Int16Array (which represents 16-bit audio samples) requires the underlying data to be in multiples of 2 bytes + hookData?.playAudio(`chunk${i}`.padEnd(8, ' ')); + } + + expect(mockBufferSource.start).toHaveBeenCalledTimes(100); + }); + }); +}); diff --git a/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts new file mode 100644 index 0000000000..6216932a8c --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts @@ -0,0 +1,69 @@ +import { useRef, useCallback } from 'react'; + +const SAMPLE_RATE = 24000; +const INT16_SCALE = 32768; + +export function useAudioPlayer() { + const audioCtxRef = useRef(null); + const nextPlayTimeRef = useRef(0); + + const initAudio = useCallback(() => { + if (!audioCtxRef.current) { + audioCtxRef.current = new AudioContext({ sampleRate: SAMPLE_RATE }); + } + return audioCtxRef.current; + }, []); + + const playAudio = useCallback( + (base64: string) => { + const audioCtx = initAudio(); + audioCtx.resume?.(); + + try { + const bytes = Uint8Array.from(atob(base64), c => c.charCodeAt(0)); + const 
int16 = new Int16Array(bytes.buffer); + const float32 = new Float32Array(int16.length); + + for (let i = 0; i < int16.length; i++) { + // eslint-disable-next-line security/detect-object-injection + float32[i] = int16[i] / INT16_SCALE; + } + + const buffer = audioCtx.createBuffer(1, float32.length, SAMPLE_RATE); + buffer.getChannelData(0).set(float32); + + const src = audioCtx.createBufferSource(); + src.buffer = buffer; + src.connect(audioCtx.destination); + + // Clear buffer when finished + src.onended = () => { + src.disconnect(); + src.buffer = null; + }; + + nextPlayTimeRef.current = Math.max(nextPlayTimeRef.current, audioCtx.currentTime); + src.start(nextPlayTimeRef.current); + nextPlayTimeRef.current += buffer.duration; + } catch (error) { + console.warn('botframework-webchat: Error during audio playback in useAudioPlayer:', error); + } + }, + [initAudio] + ); + + const stopAudio = useCallback(() => { + nextPlayTimeRef.current = 0; + + if (audioCtxRef.current) { + audioCtxRef.current.close(); + audioCtxRef.current = null; + } + }, []); + + return { + playAudio, + stopAudio, + isPlaying: audioCtxRef.current ? audioCtxRef.current.currentTime < nextPlayTimeRef.current : false + }; +} diff --git a/packages/api/src/providers/SpeechToSpeech/private/useContext.ts b/packages/api/src/providers/SpeechToSpeech/private/useContext.ts new file mode 100644 index 0000000000..50926b0a12 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/useContext.ts @@ -0,0 +1,15 @@ +import { useContext } from 'react'; + +import SpeechToSpeechContext from './Context'; + +import type { SpeechToSpeechContextType } from './Context'; + +export default function useSpeechToSpeechContext(thrownOnUndefined = true): SpeechToSpeechContextType { + const contextValue = useContext(SpeechToSpeechContext); + + if (thrownOnUndefined && !contextValue) { + throw new Error('botframework-webchat internal: This hook can only be used under .'); + } + + return contextValue; +} diff --git a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx new file mode 100644 index 0000000000..01368ceda2 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx @@ -0,0 +1,160 @@ +/** @jest-environment @happy-dom/jest-environment */ +/// + +import { act, render, waitFor, type RenderResult } from '@testing-library/react'; +import React, { type ComponentType } from 'react'; +import { useRecorder } from './useRecorder'; + +// --- Mocks --- + +const mockTrack = { + stop: jest.fn() +}; + +const mockMediaStream = { + getTracks: jest.fn(() => [mockTrack]) +}; + +const mockMediaDevices = { + getUserMedia: jest.fn().mockResolvedValue(mockMediaStream) +}; + +const mockWorkletPort = { + postMessage: jest.fn(), + onmessage: null as ((event: { data: any }) => void) | null +}; + +const mockWorkletNode = { + connect: jest.fn(), + disconnect: jest.fn(), + port: mockWorkletPort +}; + +const mockAudioContext = { + state: 'running', + resume: jest.fn().mockResolvedValue(undefined), + createMediaStreamSource: jest.fn(() => ({ + connect: jest.fn() + })), + destination: {}, + audioWorklet: { + addModule: jest.fn().mockResolvedValue(undefined) + } +}; + +// --- Global Mocks Setup --- + +Object.defineProperty(global.navigator, 'mediaDevices', { + value: mockMediaDevices, + writable: true +}); + +global.AudioContext = jest.fn(() => mockAudioContext as any); +global.AudioWorkletNode = jest.fn(() => mockWorkletNode as any); +global.Blob = 
jest.fn(parts => ({ parts, type: parts[1]?.type })) as any; +global.URL.createObjectURL = jest.fn(() => 'blob:http://localhost/mock-url'); +global.URL.revokeObjectURL = jest.fn(); +global.btoa = jest.fn(str => `btoa(${str})`); + +// --- Tests --- + +describe('useRecorder', () => { + let onAudioChunk: jest.Mock; + let HookApp: ComponentType<{ onAudioChunk: (base64: string) => void }>; + let hookData: ReturnType | undefined; + // eslint-disable-next-line @typescript-eslint/no-unused-vars + let renderResult: RenderResult; + + beforeEach(() => { + // Clear all mocks before each test + jest.clearAllMocks(); + onAudioChunk = jest.fn(); + hookData = undefined; + mockWorkletPort.onmessage = null; + (mockAudioContext.state as any) = 'running'; + + HookApp = ({ onAudioChunk }) => { + hookData = useRecorder(onAudioChunk); + return null; + }; + }); + + test('should be initially not recording', () => { + render(); + expect(hookData?.recording).toBe(false); + }); + + test('should start recording when setRecording(true) is called', async () => { + renderResult = render(); + + act(() => { + hookData?.setRecording(true); + }); + + await waitFor(() => expect(hookData?.recording).toBe(true)); + + expect(navigator.mediaDevices.getUserMedia).toHaveBeenCalledTimes(1); + expect(global.AudioContext).toHaveBeenCalledTimes(1); + expect(mockAudioContext.audioWorklet.addModule).toHaveBeenCalledTimes(1); + expect(global.AudioWorkletNode).toHaveBeenCalledWith(expect.anything(), 'audio-recorder'); + expect(mockWorkletNode.connect).toHaveBeenCalledTimes(1); + expect(mockWorkletPort.postMessage).toHaveBeenCalledWith({ command: 'START' }); + }); + + test('should stop recording when setRecording(false) is called', async () => { + renderResult = render(); + + // Start recording + act(() => { + hookData?.setRecording(true); + }); + + await waitFor(() => expect(hookData?.recording).toBe(true)); + + // Stop recording + act(() => { + hookData?.setRecording(false); + }); + + await waitFor(() => expect(hookData?.recording).toBe(false)); + + expect(mockWorkletPort.postMessage).toHaveBeenCalledWith({ command: 'STOP' }); + expect(mockWorkletNode.disconnect).toHaveBeenCalledTimes(1); + expect(mockTrack.stop).toHaveBeenCalledTimes(1); + }); + + test('should process audio chunks sent from the worklet', async () => { + render(); + + act(() => { + hookData?.setRecording(true); + }); + + await waitFor(() => expect(mockWorkletPort.onmessage).not.toBeNull()); + + // Simulate a message from the audio worklet + const mockAudioData = new Float32Array([0.1, 0.2, -0.1]); + act(() => { + mockWorkletPort.onmessage!({ + data: { + eventType: 'audio', + audioData: mockAudioData + } + }); + }); + + await waitFor(() => expect(onAudioChunk).toHaveBeenCalledTimes(1)); + expect(global.btoa).toHaveBeenCalled(); + }); + + test('should handle suspended audio context by resuming it', async () => { + (mockAudioContext.state as any) = 'suspended'; + render(); + + act(() => { + hookData?.setRecording(true); + }); + + await waitFor(() => expect(mockAudioContext.resume).toHaveBeenCalledTimes(1)); + }); +}); diff --git a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts new file mode 100644 index 0000000000..b9930cada1 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts @@ -0,0 +1,128 @@ +import { useRef, useState, useCallback } from 'react'; + +const audioProcessorCode = ` +class AudioRecorderProcessor extends AudioWorkletProcessor { + constructor() { 
+ super() + this.recording = false + this.buffer = [] + this.port.onmessage = e => { + if (e.data.command === 'START') this.recording = true + else if (e.data.command === 'STOP') { + this.recording = false + if (this.buffer.length) this.sendBuffer() + } + } + } + sendBuffer() { + if (this.buffer.length) { + this.port.postMessage({ + eventType: 'audio', + audioData: new Float32Array(this.buffer) + }) + this.buffer = [] + } + } + process(inputs) { + if (inputs[0]?.length && this.recording) { + this.buffer.push(...inputs[0][0]) + if (this.buffer.length >= 2400) this.sendBuffer() + } + return true + } +} +registerProcessor('audio-recorder', AudioRecorderProcessor) +`; + +const INT16_MIN = -32768; +const INT16_MAX = 32767; +const INT16_SCALE = 32767; + +export function useRecorder(onAudioChunk: (base64: string) => void) { + const [recording, setRecordingInternal] = useState(false); + const audioCtxRef = useRef(null); + const workletRef = useRef(null); + const streamRef = useRef(null); + + const initAudio = useCallback(async () => { + if (audioCtxRef.current) { + return; + } + const audioCtx = new AudioContext({ sampleRate: 24000 }); + const blob = new Blob([audioProcessorCode], { + type: 'application/javascript' + }); + // eslint-disable-next-line no-restricted-properties + const url = URL.createObjectURL(blob); + await audioCtx.audioWorklet.addModule(url); + URL.revokeObjectURL(url); + // eslint-disable-next-line require-atomic-updates + audioCtxRef.current = audioCtx; + }, []); + + const startRecording = useCallback(async () => { + await initAudio(); + const audioCtx = audioCtxRef.current!; + if (audioCtx.state === 'suspended') { + await audioCtx.resume(); + } + const stream = await navigator.mediaDevices.getUserMedia({ + audio: { + channelCount: 1, + sampleRate: 24000, + echoCancellation: true + } + }); + streamRef.current = stream; + const source = audioCtx.createMediaStreamSource(stream); + const worklet = new AudioWorkletNode(audioCtx, 'audio-recorder'); + + worklet.port.onmessage = e => { + if (e.data.eventType === 'audio') { + const float32 = e.data.audioData; + const int16 = new Int16Array(float32.length); + for (let i = 0; i < float32.length; i++) { + // eslint-disable-next-line security/detect-object-injection + int16[i] = Math.max(INT16_MIN, Math.min(INT16_MAX, float32[i] * INT16_SCALE)); + } + const base64 = btoa(String.fromCharCode(...new Uint8Array(int16.buffer))); + onAudioChunk(base64); + } + }; + + source.connect(worklet); + worklet.connect(audioCtx.destination); + worklet.port.postMessage({ command: 'START' }); + workletRef.current = worklet; + setRecordingInternal(true); + }, [initAudio, onAudioChunk]); + + const stopRecording = useCallback(() => { + if (workletRef.current) { + workletRef.current.port.postMessage({ command: 'STOP' }); + workletRef.current.disconnect(); + workletRef.current = null; + } + if (streamRef.current) { + streamRef.current.getTracks().forEach(track => track.stop()); + streamRef.current = null; + } + setRecordingInternal(false); + }, []); + + const setRecording = useCallback( + async (shouldRecord: boolean) => { + if (!shouldRecord && recording) { + stopRecording(); + } else if (shouldRecord && !recording) { + await startRecording(); + } + }, + [recording, startRecording, stopRecording] + ); + + return { + recording, + setRecording + }; +} diff --git a/packages/api/src/providers/SpeechToSpeech/types/SpeechState.ts b/packages/api/src/providers/SpeechToSpeech/types/SpeechState.ts new file mode 100644 index 0000000000..62d5cc8c13 --- /dev/null +++ 
b/packages/api/src/providers/SpeechToSpeech/types/SpeechState.ts @@ -0,0 +1 @@ +export type SpeechState = 'idle' | 'listening' | 'processing' | 'bot_speaking'; diff --git a/packages/api/src/providers/SpeechToSpeech/useSpeechToSpeech.ts b/packages/api/src/providers/SpeechToSpeech/useSpeechToSpeech.ts new file mode 100644 index 0000000000..d7ac3fac44 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/useSpeechToSpeech.ts @@ -0,0 +1,6 @@ +import { SpeechToSpeechContextType } from './private/Context'; +import useSpeechToSpeechContext from './private/useContext'; + +export default function useSpeechToSpeech(): readonly [SpeechToSpeechContextType] { + return [useSpeechToSpeechContext()]; +} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index e635e6a060..c0580223d8 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -81,6 +81,7 @@ import type { DefinedTerm as OrgSchemaDefinedTerm } from './types/external/OrgSc import type { Project as OrgSchemaProject } from './types/external/OrgSchema/Project'; import type { Thing as OrgSchemaThing } from './types/external/OrgSchema/Thing'; import type { UserReview as OrgSchemaUserReview } from './types/external/OrgSchema/UserReview'; +import isVoiceActivity from './utils/voiceActivity/isVoiceActivity'; const Constants = { ActivityClientState, DictateState }; @@ -96,6 +97,7 @@ export { getActivityLivestreamingMetadata, getOrgSchemaMessage, isForbiddenPropertyName, + isVoiceActivity, markActivity, onErrorResumeNext, parseAction, diff --git a/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts b/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts new file mode 100644 index 0000000000..c8d744595e --- /dev/null +++ b/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts @@ -0,0 +1,88 @@ +import isVoiceActivity from './isVoiceActivity'; +import { WebChatActivity } from '../../types/WebChatActivity'; + +// Mock activity factory for testing +const createMockActivity = (type: string = 'event', value?: any): WebChatActivity => ({ + type: type as any, + id: 'test-activity-id', + from: { id: 'test-user' }, + channelData: { + 'webchat:sequence-id': 1 + }, + ...(value && { value }) +}); + +const createMockVoiceActivity = (voiceEventType: string, additionalProps?: any): WebChatActivity => + createMockActivity('event', { + voiceLiveEvent: { + type: voiceEventType, + ...additionalProps + } + }); + +describe('isVoiceActivity', () => { + describe('Valid voice activities', () => { + test('should return true for event activity with voiceLiveEvent', () => { + const activity = createMockVoiceActivity('response.audio.delta', { delta: 'audiodata' }); + + const result = isVoiceActivity(activity); + + expect(result).toBe(true); + }); + + test('should return true for voice activity with minimal voiceLiveEvent', () => { + const activity = createMockActivity('event', { voiceLiveEvent: {} }); + + const result = isVoiceActivity(activity); + + expect(result).toBe(true); + }); + }); + + describe('Invalid activities', () => { + const testCases = [ + // Invalid by activity type + { + name: 'message activity with voiceLiveEvent', + activity: () => createMockActivity('message', { voiceLiveEvent: { type: 'response.audio.delta' } }) + }, + { + name: 'typing activity', + activity: () => createMockActivity('typing') + }, + { + name: 'event activity with value', + activity: () => ({ ...createMockActivity('event'), value: 'not an object' }) + } + ]; + + test.each(testCases)('should return false for $name', ({ activity 
}) => { + const result = isVoiceActivity(activity()); + + expect(result).toBe(false); + }); + }); + + describe('Real-world voice event types', () => { + const voiceEventTypes = [ + 'input_audio_buffer.append', + 'input_audio_buffer.speech_started', + 'input_audio_buffer.speech_stopped', + 'conversation.item.input_audio_transcription.completed', + 'response.audio.delta', + 'response.audio_transcript.delta', + 'response.audio_transcript.done', + 'response.done', + 'session.update', + 'response.cancel' + ]; + + test.each(voiceEventTypes)('should return true for voice event type: %s', eventType => { + const activity = createMockVoiceActivity(eventType); + + const result = isVoiceActivity(activity); + + expect(result).toBe(true); + }); + }); +}); diff --git a/packages/core/src/utils/voiceActivity/isVoiceActivity.ts b/packages/core/src/utils/voiceActivity/isVoiceActivity.ts new file mode 100644 index 0000000000..e16154e590 --- /dev/null +++ b/packages/core/src/utils/voiceActivity/isVoiceActivity.ts @@ -0,0 +1,14 @@ +import { WebChatActivity } from '../../types/WebChatActivity'; + +// This is interim type guard until activity protocol is ratified. +const isVoiceActivity = ( + activity: WebChatActivity +): activity is WebChatActivity & { + value: { voiceLiveEvent: any }; +} => + activity.type === 'event' && + activity.value && + typeof activity.value === 'object' && + 'voiceLiveEvent' in activity.value; + +export default isVoiceActivity; From a98245729479f03422a3a2c04ff33222d3368b56 Mon Sep 17 00:00:00 2001 From: Pranav Joshi Date: Fri, 12 Dec 2025 13:41:48 +0000 Subject: [PATCH 2/4] minor --- packages/core/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index c0580223d8..a81d494a07 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -42,6 +42,7 @@ import getOrgSchemaMessage from './utils/getOrgSchemaMessage'; import isForbiddenPropertyName from './utils/isForbiddenPropertyName'; import onErrorResumeNext from './utils/onErrorResumeNext'; import singleToArray from './utils/singleToArray'; +import isVoiceActivity from './utils/voiceActivity/isVoiceActivity'; export { CLEAR_SUGGESTED_ACTIONS, @@ -81,7 +82,6 @@ import type { DefinedTerm as OrgSchemaDefinedTerm } from './types/external/OrgSc import type { Project as OrgSchemaProject } from './types/external/OrgSchema/Project'; import type { Thing as OrgSchemaThing } from './types/external/OrgSchema/Thing'; import type { UserReview as OrgSchemaUserReview } from './types/external/OrgSchema/UserReview'; -import isVoiceActivity from './utils/voiceActivity/isVoiceActivity'; const Constants = { ActivityClientState, DictateState }; From 9ddc63c8878a8d8eefda425688ec69d5fae74492 Mon Sep 17 00:00:00 2001 From: Pranav Joshi Date: Wed, 7 Jan 2026 17:10:14 +0000 Subject: [PATCH 3/4] refactor to align close to activity structure --- packages/api/src/hooks/index.ts | 4 +- packages/api/src/hooks/useVoiceActivities.ts | 9 +- .../SpeechToSpeech/SpeechToSpeechComposer.tsx | 100 +++++------ .../SpeechToSpeech/private/useAudioPlayer.ts | 12 +- .../private/useRecorder.spec.tsx | 6 +- .../SpeechToSpeech/private/useRecorder.ts | 83 +++++---- .../activities/combineActivitiesReducer.ts | 15 +- .../createGroupedActivitiesReducer.ts | 24 ++- .../src/reducers/activities/sort/types.ts | 1 + .../src/reducers/activities/sort/upsert.ts | 16 +- .../voiceActivity/isVoiceActivity.spec.ts | 84 +++++---- .../utils/voiceActivity/isVoiceActivity.ts | 11 +- 
.../isVoiceTranscriptActivity.spec.ts         | 164 ++++++++++++++++++
 .../isVoiceTranscriptActivity.ts              |  18 ++
 14 files changed, 404 insertions(+), 143 deletions(-)
 create mode 100644 packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.spec.ts
 create mode 100644 packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts

diff --git a/packages/api/src/hooks/index.ts b/packages/api/src/hooks/index.ts
index f5a1a959d7..b1d027ee3a 100644
--- a/packages/api/src/hooks/index.ts
+++ b/packages/api/src/hooks/index.ts
@@ -71,6 +71,7 @@ import useUIState from './useUIState';
 import useUserID from './useUserID';
 import useUsername from './useUsername';
 import useVoiceSelector from './useVoiceSelector';
+import useVoiceActivities from './useVoiceActivities';
 
 export { useBuildRenderActivityCallback } from '@msinternal/botframework-webchat-api-middleware';
 export { useSuggestedActionsHooks } from '@msinternal/botframework-webchat-redux-store';
@@ -148,5 +149,6 @@ export {
   useUIState,
   useUserID,
   useUsername,
-  useVoiceSelector
+  useVoiceSelector,
+  useVoiceActivities
 };
diff --git a/packages/api/src/hooks/useVoiceActivities.ts b/packages/api/src/hooks/useVoiceActivities.ts
index d65e142b17..0abff2229f 100644
--- a/packages/api/src/hooks/useVoiceActivities.ts
+++ b/packages/api/src/hooks/useVoiceActivities.ts
@@ -1,11 +1,6 @@
-import { isVoiceActivity, type WebChatActivity } from 'botframework-webchat-core';
+import { type WebChatActivity } from 'botframework-webchat-core';
 import { useSelector } from './internal/WebChatReduxContext';
 
-const activitiesSelector = (state: { activities: WebChatActivity[] }) => state.activities;
-
-const of = (predicate: (activity: WebChatActivity) => boolean) => (state: { activities: WebChatActivity[] }) =>
-  activitiesSelector(state).filter(predicate);
-
 export default function useVoiceActivities(): [WebChatActivity[]] {
-  return [useSelector(of(activity => isVoiceActivity(activity)))];
+  return [useSelector(({ voiceActivities }) => voiceActivities)];
 }
diff --git a/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
index 0ccf1a6f32..b1f978e42c 100644
--- a/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
+++ b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
@@ -10,36 +10,28 @@ export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }>
   const [voiceActivities] = useVoiceActivities();
   const postActivity = usePostActivity();
   const [{ connectivitystatus }] = useDebouncedNotifications();
-  const { playAudio, stopAudio, isPlaying } = useAudioPlayer();
-
   const lastProcessedIndexRef = useRef(0);
-
-  // Remove when the activity protocol changes land; we would then get this as part of a signal activity.
   const [speechState, setSpeechState] = useState<SpeechState>('idle');
 
+  // Config received from the server on session start; for now, ccv2 and mmrt run on different sample rates and chunk intervals.
+  // We read that config into a free-form object, as we are unsure what session config will be needed in the future.
+  const [serverConfig, setServerConfig] = useState<Record<string, unknown> | null>(null);
+  const { playAudio, stopAudio, isPlaying } = useAudioPlayer(serverConfig);
+
   const isConnected = useMemo(() => connectivitystatus?.message === 'connected', [connectivitystatus]);
 
   const sendAudioChunk = useCallback(
-    (base64: string) => {
+    (base64: string, timestamp: string) => {
       postActivity({
         type: 'event',
         name: 'stream.chunk',
-        value: { voiceLiveEvent: { type: 'input_audio_buffer.append', audio: base64 } }
+        value: { voice: { contentUrl: base64, timestamp } }
       } as any);
     },
     [postActivity]
   );
 
-  const { recording, setRecording: baseSetRecording } = useRecorder(sendAudioChunk);
-
-  const cancelActiveResponse = useCallback(() => {
-    if (isPlaying) {
-      postActivity({
-        type: 'event',
-        value: { voiceLiveEvent: { type: 'response.cancel' } }
-      } as any);
-    }
-  }, [isPlaying, postActivity]);
+  const { recording, setRecording: baseSetRecording } = useRecorder(sendAudioChunk, serverConfig);
 
   const handleVoiceActivity = useCallback(
@@ -47,50 +39,51 @@ export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }>
         return;
       }
 
-      const { voiceLiveEvent } = activity.value;
-
-      switch (voiceLiveEvent.type) {
-        case 'input_audio_buffer.speech_started':
-          stopAudio();
-          setSpeechState('listening');
-          break;
-        case 'input_audio_buffer.speech_stopped':
-          setSpeechState('processing');
-          break;
-        case 'response.audio.delta':
-          if (voiceLiveEvent.delta && recording) {
-            playAudio(voiceLiveEvent.delta);
-          }
-          break;
-        case 'response.done':
-          if (!isPlaying) {
+      const { name, value } = activity;
+      const { voice } = value;
+
+      // TODO: this will be a commandResult activity rather than an event; we need to think through handling of command and commandResult activities.
+      if (name === 'session.init' && value.session?.config) {
+        setServerConfig(value.session.config as Record<string, unknown>);
+      } else if (name === 'session.update') {
+        switch (voice.bot_state) {
+          case 'voice.request.detected':
+            stopAudio();
             setSpeechState('listening');
-          }
-          break;
-        default:
-          break;
+            break;
+          case 'voice.request.processing':
+            setSpeechState('processing');
+            break;
+          default:
+            break;
+        }
+      } else if (name === 'stream.chunk' && voice.contentUrl) {
+        playAudio(voice.contentUrl);
       }
     },
-    [isPlaying, playAudio, recording, stopAudio]
+    [playAudio, stopAudio]
   );
-
   useEffect(() => {
     const startIndex = lastProcessedIndexRef.current;
-
     if (!voiceActivities.length || startIndex >= voiceActivities.length) {
       return;
     }
-
-    // If not recording, skip processing voice activities but update ref
-    // so next time we start recording, we only process new activities.
-    if (!recording) {
-      lastProcessedIndexRef.current = voiceActivities.length;
-      return;
-    }
-
     for (let i = startIndex; i < voiceActivities.length; i++) {
       // eslint-disable-next-line security/detect-object-injection
-      handleVoiceActivity(voiceActivities[i]);
+      const activity = voiceActivities[i];
+
+      // Skip the activity if it's from the user, as we want to process only incoming voice activities.
+      // We may receive an (optional) config from the server as soon as the socket is established;
+      // at that time recording would be off, but we still want to process it to read the config and act on it.
+ if ( + activity.from?.role === 'user' || + (!recording && isVoiceActivity(activity) && activity.name !== 'session.init') + ) { + continue; + } + + handleVoiceActivity(activity); } if (isPlaying && speechState !== 'bot_speaking') { @@ -100,25 +93,24 @@ export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }> } lastProcessedIndexRef.current = voiceActivities.length; - }, [voiceActivities, recording, postActivity, isPlaying, playAudio, speechState, stopAudio, handleVoiceActivity]); + }, [voiceActivities, recording, isPlaying, speechState, handleVoiceActivity]); const setRecording = useCallback( - (shouldRecord: boolean) => { + async (shouldRecord: boolean) => { if (!isConnected) { return; } - if (!recording) { + if (shouldRecord) { setSpeechState('listening'); } else { stopAudio(); - cancelActiveResponse(); setSpeechState('idle'); } - baseSetRecording(shouldRecord); + await baseSetRecording(shouldRecord); }, - [isConnected, recording, baseSetRecording, stopAudio, cancelActiveResponse] + [isConnected, baseSetRecording, stopAudio] ); const contextValue = useMemo( diff --git a/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts index 6216932a8c..f9b8405387 100644 --- a/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts +++ b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts @@ -1,18 +1,20 @@ import { useRef, useCallback } from 'react'; -const SAMPLE_RATE = 24000; +const DEFAULT_SAMPLE_RATE = 24000; const INT16_SCALE = 32768; -export function useAudioPlayer() { +export function useAudioPlayer(config?: Record | null) { const audioCtxRef = useRef(null); const nextPlayTimeRef = useRef(0); + const { sampleRate = DEFAULT_SAMPLE_RATE } = config || {}; + const initAudio = useCallback(() => { if (!audioCtxRef.current) { - audioCtxRef.current = new AudioContext({ sampleRate: SAMPLE_RATE }); + audioCtxRef.current = new AudioContext({ sampleRate: sampleRate as number }); } return audioCtxRef.current; - }, []); + }, [sampleRate]); const playAudio = useCallback( (base64: string) => { @@ -29,7 +31,7 @@ export function useAudioPlayer() { float32[i] = int16[i] / INT16_SCALE; } - const buffer = audioCtx.createBuffer(1, float32.length, SAMPLE_RATE); + const buffer = audioCtx.createBuffer(1, float32.length, audioCtx.sampleRate); buffer.getChannelData(0).set(float32); const src = audioCtx.createBufferSource(); diff --git a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx index 01368ceda2..f2b01ca6b5 100644 --- a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx +++ b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx @@ -7,6 +7,8 @@ import { useRecorder } from './useRecorder'; // --- Mocks --- +jest.mock('../../Ponyfill/usePonyfill', () => ({ __esModule: true, default: jest.fn(() => [{ Date: global.Date }]) })); + const mockTrack = { stop: jest.fn() }; @@ -96,7 +98,9 @@ describe('useRecorder', () => { expect(navigator.mediaDevices.getUserMedia).toHaveBeenCalledTimes(1); expect(global.AudioContext).toHaveBeenCalledTimes(1); expect(mockAudioContext.audioWorklet.addModule).toHaveBeenCalledTimes(1); - expect(global.AudioWorkletNode).toHaveBeenCalledWith(expect.anything(), 'audio-recorder'); + expect(global.AudioWorkletNode).toHaveBeenCalledWith(expect.anything(), 'audio-recorder', { + processorOptions: { bufferSize: 2400 } + }); 
expect(mockWorkletNode.connect).toHaveBeenCalledTimes(1); expect(mockWorkletPort.postMessage).toHaveBeenCalledWith({ command: 'START' }); }); diff --git a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts index b9930cada1..7ba19ba244 100644 --- a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts +++ b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts @@ -1,54 +1,64 @@ import { useRef, useState, useCallback } from 'react'; +import usePonyfill from '../../Ponyfill/usePonyfill'; const audioProcessorCode = ` -class AudioRecorderProcessor extends AudioWorkletProcessor { - constructor() { - super() - this.recording = false - this.buffer = [] - this.port.onmessage = e => { - if (e.data.command === 'START') this.recording = true - else if (e.data.command === 'STOP') { - this.recording = false - if (this.buffer.length) this.sendBuffer() + class AudioRecorderProcessor extends AudioWorkletProcessor { + constructor(options) { + super() + this.recording = false + this.buffer = [] + this.bufferSize = options.processorOptions.bufferSize + this.port.onmessage = e => { + if (e.data.command === 'START') this.recording = true + else if (e.data.command === 'STOP') { + this.recording = false + this.buffer = [] + } } } - } - sendBuffer() { - if (this.buffer.length) { - this.port.postMessage({ - eventType: 'audio', - audioData: new Float32Array(this.buffer) - }) - this.buffer = [] + sendBuffer() { + while (this.buffer.length >= this.bufferSize) { + const chunk = this.buffer.splice(0, this.bufferSize) + this.port.postMessage({ + eventType: 'audio', + audioData: new Float32Array(chunk) + }) + } } - } - process(inputs) { - if (inputs[0]?.length && this.recording) { - this.buffer.push(...inputs[0][0]) - if (this.buffer.length >= 2400) this.sendBuffer() + process(inputs) { + if (inputs[0]?.length && this.recording) { + this.buffer.push(...inputs[0][0]) + if (this.buffer.length >= this.bufferSize) this.sendBuffer() + } + return true } - return true } -} -registerProcessor('audio-recorder', AudioRecorderProcessor) -`; + registerProcessor('audio-recorder', AudioRecorderProcessor)`; const INT16_MIN = -32768; const INT16_MAX = 32767; const INT16_SCALE = 32767; +const DEFAULT_SAMPLE_RATE = 24000; +const DEFAULT_CHUNK_SIZE_IN_MS = 100; +const MS_IN_SECOND = 1000; -export function useRecorder(onAudioChunk: (base64: string) => void) { +export function useRecorder( + onAudioChunk: (base64: string, timestamp: string) => void, + config?: Record | null +) { const [recording, setRecordingInternal] = useState(false); const audioCtxRef = useRef(null); const workletRef = useRef(null); const streamRef = useRef(null); + const [{ Date }] = usePonyfill(); + + const { sampleRate = DEFAULT_SAMPLE_RATE, chunkIntervalMs = DEFAULT_CHUNK_SIZE_IN_MS } = config || {}; const initAudio = useCallback(async () => { if (audioCtxRef.current) { return; } - const audioCtx = new AudioContext({ sampleRate: 24000 }); + const audioCtx = new AudioContext({ sampleRate: sampleRate as number }); const blob = new Blob([audioProcessorCode], { type: 'application/javascript' }); @@ -58,7 +68,7 @@ export function useRecorder(onAudioChunk: (base64: string) => void) { URL.revokeObjectURL(url); // eslint-disable-next-line require-atomic-updates audioCtxRef.current = audioCtx; - }, []); + }, [sampleRate]); const startRecording = useCallback(async () => { await initAudio(); @@ -69,16 +79,21 @@ export function useRecorder(onAudioChunk: (base64: string) => void) 
{ const stream = await navigator.mediaDevices.getUserMedia({ audio: { channelCount: 1, - sampleRate: 24000, + sampleRate, echoCancellation: true } }); streamRef.current = stream; const source = audioCtx.createMediaStreamSource(stream); - const worklet = new AudioWorkletNode(audioCtx, 'audio-recorder'); + const worklet = new AudioWorkletNode(audioCtx, 'audio-recorder', { + processorOptions: { + bufferSize: ((sampleRate as number) * (chunkIntervalMs as number)) / MS_IN_SECOND + } + }); worklet.port.onmessage = e => { if (e.data.eventType === 'audio') { + const timestamp = new Date().toISOString(); const float32 = e.data.audioData; const int16 = new Int16Array(float32.length); for (let i = 0; i < float32.length; i++) { @@ -86,7 +101,7 @@ export function useRecorder(onAudioChunk: (base64: string) => void) { int16[i] = Math.max(INT16_MIN, Math.min(INT16_MAX, float32[i] * INT16_SCALE)); } const base64 = btoa(String.fromCharCode(...new Uint8Array(int16.buffer))); - onAudioChunk(base64); + onAudioChunk(base64, timestamp); } }; @@ -95,7 +110,7 @@ export function useRecorder(onAudioChunk: (base64: string) => void) { worklet.port.postMessage({ command: 'START' }); workletRef.current = worklet; setRecordingInternal(true); - }, [initAudio, onAudioChunk]); + }, [Date, chunkIntervalMs, initAudio, onAudioChunk, sampleRate]); const stopRecording = useCallback(() => { if (workletRef.current) { diff --git a/packages/core/src/reducers/activities/combineActivitiesReducer.ts b/packages/core/src/reducers/activities/combineActivitiesReducer.ts index 488df1be62..49ee3cee3d 100644 --- a/packages/core/src/reducers/activities/combineActivitiesReducer.ts +++ b/packages/core/src/reducers/activities/combineActivitiesReducer.ts @@ -10,6 +10,7 @@ import createGroupedActivitiesReducer, { type ActivitiesState = { activities: readonly WebChatActivity[]; + voiceActivities: readonly WebChatActivity[]; groupedActivities: GroupedActivitiesState; }; @@ -34,7 +35,12 @@ export default function combineActivitiesReducer( state: (ExistingState & ActivitiesState) | undefined, action: ExistingAction & GroupedActivitiesAction ): ExistingState & ActivitiesState { - const { activities: _activities, groupedActivities, ...existingState } = state ?? {}; + const { + activities: _activities, + voiceActivities: _voiceActivities, + groupedActivities, + ...existingState + } = state ?? {}; const nextState = existingSlicedReducer(existingState as ExistingState, action); const nextGroupedActivities = groupedActivitiesReducer(groupedActivities, action); @@ -52,7 +58,12 @@ export default function combineActivitiesReducer( ); return hasChanged - ? { ...nextState, activities: nextGroupedActivities.sortedActivities, groupedActivities: nextGroupedActivities } + ? 
{
+        ...nextState,
+        activities: nextGroupedActivities.sortedActivities,
+        voiceActivities: nextGroupedActivities.voiceActivities,
+        groupedActivities: nextGroupedActivities
+      }
     : state;
 };
 }
diff --git a/packages/core/src/reducers/activities/createGroupedActivitiesReducer.ts b/packages/core/src/reducers/activities/createGroupedActivitiesReducer.ts
index f333c67af1..e7179ac44e 100644
--- a/packages/core/src/reducers/activities/createGroupedActivitiesReducer.ts
+++ b/packages/core/src/reducers/activities/createGroupedActivitiesReducer.ts
@@ -32,7 +32,7 @@ import type { WebChatActivity } from '../../types/WebChatActivity';
 import patchActivity from './patchActivity';
 import deleteActivityByLocalId from './sort/deleteActivityByLocalId';
 import { generateLocalIdInActivity, getLocalIdFromActivity, setLocalIdInActivity } from './sort/property/LocalId';
-import { getPositionFromActivity, setPositionInActivity } from './sort/property/Position';
+import { getPositionFromActivity, queryPositionFromActivity, setPositionInActivity } from './sort/property/Position';
 import { setReceivedAtInActivity } from './sort/property/ReceivedAt';
 import { querySendStatusFromOutgoingActivity, setSendStatusInOutgoingActivity } from './sort/property/SendStatus';
 import queryLocalIdAByActivityId from './sort/queryLocalIdByActivityId';
@@ -42,6 +42,8 @@ import updateActivityChannelData, {
   updateActivityChannelDataInternalSkipNameCheck
 } from './sort/updateActivityChannelData';
 import upsert, { INITIAL_STATE } from './sort/upsert';
+import isVoiceActivity from '../../utils/voiceActivity/isVoiceActivity';
+import isVoiceTranscriptActivity from '../../utils/voiceActivity/isVoiceTranscriptActivity';
 
 type GroupedActivitiesAction =
   | DeleteActivityAction
@@ -100,6 +102,13 @@ function createGroupedActivitiesReducer(
         payload: { activity }
       } = action;
 
+      // Non-transcript voice activities do not render in the UI and are mostly fire-and-forget, as we don't have replay, etc.;
+      // hence we don't want to process them and simply pass them through.
+      if (isVoiceActivity(activity) && !isVoiceTranscriptActivity(activity)) {
+        state = upsert(ponyfill, state, activity);
+        break;
+      }
+
       // Patch activity so the outgoing blob: URL is not re-downloadable.
       // Related to /__tests__/html2/accessibility/liveRegion/attachment/file.
@@ -151,6 +160,12 @@ function createGroupedActivitiesReducer(
     }
 
     case POST_ACTIVITY_FULFILLED: {
+      // Non-transcript voice activities do not render in the UI and are mostly fire-and-forget, as we don't have replay, etc.;
+      // hence we don't want to process them and simply pass them through.
+      if (isVoiceActivity(action.payload.activity) && !isVoiceTranscriptActivity(action.payload.activity)) {
+        state = upsert(ponyfill, state, action.payload.activity);
+        break;
+      }
       const localId = queryLocalIdAByClientActivityId(state, action.meta.clientActivityID);
       const existingActivity = localId && state.activityMap.get(localId)?.activity;
 
@@ -175,8 +190,11 @@
         activity = setSendStatusInOutgoingActivity(activity, SENT);
         activity = setLocalIdInActivity(activity, localId);
 
-        // Keep existing position.
-        activity = setPositionInActivity(activity, getPositionFromActivity(existingActivity));
+        // Keep existing position (if it exists - voice activities don't have positions)
+        const existingPosition = queryPositionFromActivity(existingActivity);
+        if (typeof existingPosition !== 'undefined') {
+          activity = setPositionInActivity(activity, getPositionFromActivity(existingActivity));
+        }
 
         // Compare the INCOMING_ACTIVITY below:
         // - POST_ACTIVITY_FULFILLED will mark send status as SENT
diff --git a/packages/core/src/reducers/activities/sort/types.ts b/packages/core/src/reducers/activities/sort/types.ts
index 286711854f..28be6d1149 100644
--- a/packages/core/src/reducers/activities/sort/types.ts
+++ b/packages/core/src/reducers/activities/sort/types.ts
@@ -57,6 +57,7 @@ type State = {
   readonly livestreamSessionMap: LivestreamSessionMap;
   readonly sortedChatHistoryList: SortedChatHistory;
   readonly sortedActivities: readonly Activity[];
+  readonly voiceActivities: readonly Activity[];
 };
 
 export {
diff --git a/packages/core/src/reducers/activities/sort/upsert.ts b/packages/core/src/reducers/activities/sort/upsert.ts
index c917d77568..9528d1c956 100644
--- a/packages/core/src/reducers/activities/sort/upsert.ts
+++ b/packages/core/src/reducers/activities/sort/upsert.ts
@@ -18,6 +18,8 @@ import {
   type SortedChatHistoryEntry,
   type State
 } from './types';
+import isVoiceActivity from '../../../utils/voiceActivity/isVoiceActivity';
+import isVoiceTranscriptActivity from '../../../utils/voiceActivity/isVoiceTranscriptActivity';
 
 // Honoring timestamp or not:
 //
@@ -48,7 +50,8 @@ const INITIAL_STATE = Object.freeze({
   livestreamSessionMap: Object.freeze(new Map()),
   howToGroupingMap: Object.freeze(new Map()),
   sortedActivities: Object.freeze([]),
-  sortedChatHistoryList: Object.freeze([])
+  sortedChatHistoryList: Object.freeze([]),
+  voiceActivities: Object.freeze([])
 } satisfies State);
 
 // Question: Why insertion sort works but not quick sort?
@@ -58,6 +61,14 @@ const INITIAL_STATE = Object.freeze({
 // - Duplicate timestamps: activities without timestamp can't be sort deterministically with quick sort
 
 function upsert(ponyfill: Pick<GlobalScopePonyfill, 'Date'>, state: State, activity: Activity): State {
+  // We only want to process transcript voice activities through this path, as those will be rendered.
+  // All other voice activities are stored in a separate slice, and we don't want to perform any operation on them.
+ if (isVoiceActivity(activity) && !isVoiceTranscriptActivity(activity)) { + return Object.freeze({ + ...state, + voiceActivities: Object.freeze([...state.voiceActivities, activity]) + } satisfies State); + } const nextActivityIdToLocalIdMap = new Map(state.activityIdToLocalIdMap); const nextActivityMap = new Map(state.activityMap); const nextClientActivityIdToLocalIdMap = new Map(state.clientActivityIdToLocalIdMap); @@ -336,7 +347,8 @@ function upsert(ponyfill: Pick, state: State, activ howToGroupingMap: Object.freeze(nextHowToGroupingMap), livestreamSessionMap: Object.freeze(nextLivestreamSessionMap), sortedActivities: Object.freeze(nextSortedActivities), - sortedChatHistoryList: Object.freeze(nextSortedChatHistoryList) + sortedChatHistoryList: Object.freeze(nextSortedChatHistoryList), + voiceActivities: state.voiceActivities } satisfies State); } diff --git a/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts b/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts index c8d744595e..1b2c3abf77 100644 --- a/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts +++ b/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts @@ -2,36 +2,34 @@ import isVoiceActivity from './isVoiceActivity'; import { WebChatActivity } from '../../types/WebChatActivity'; // Mock activity factory for testing -const createMockActivity = (type: string = 'event', value?: any): WebChatActivity => ({ +const createMockActivity = (type: string = 'event', name?: string, value?: any): WebChatActivity => ({ type: type as any, id: 'test-activity-id', from: { id: 'test-user' }, channelData: { 'webchat:sequence-id': 1 }, + ...(name && { name }), ...(value && { value }) }); -const createMockVoiceActivity = (voiceEventType: string, additionalProps?: any): WebChatActivity => - createMockActivity('event', { - voiceLiveEvent: { - type: voiceEventType, - ...additionalProps - } +const createMockVoiceActivity = (name: string, voiceProps: Record): WebChatActivity => + createMockActivity('event', name, { + voice: voiceProps }); describe('isVoiceActivity', () => { describe('Valid voice activities', () => { - test('should return true for event activity with voiceLiveEvent', () => { - const activity = createMockVoiceActivity('response.audio.delta', { delta: 'audiodata' }); + test('should return true for event activity with voice', () => { + const activity = createMockVoiceActivity('stream.chunk', { contentUrl: 'base64' }); const result = isVoiceActivity(activity); expect(result).toBe(true); }); - test('should return true for voice activity with minimal voiceLiveEvent', () => { - const activity = createMockActivity('event', { voiceLiveEvent: {} }); + test('should return true for voice activity with minimal voice', () => { + const activity = createMockActivity('event', 'stream.chunk', { voice: {} }); const result = isVoiceActivity(activity); @@ -41,18 +39,29 @@ describe('isVoiceActivity', () => { describe('Invalid activities', () => { const testCases = [ - // Invalid by activity type { - name: 'message activity with voiceLiveEvent', - activity: () => createMockActivity('message', { voiceLiveEvent: { type: 'response.audio.delta' } }) + name: 'message activity with voice', + activity: () => createMockActivity('message', 'stream.chunk', { voice: { contentUrl: 'base64' } }) }, { name: 'typing activity', activity: () => createMockActivity('typing') }, { - name: 'event activity with value', - activity: () => ({ ...createMockActivity('event'), value: 'not an object' }) + name: 'event activity with non-object 
+        activity: () => ({ ...createMockActivity('event', 'test'), value: 'not an object' })
+      },
+      {
+        name: 'event activity without voice property',
+        activity: () => createMockActivity('event', 'test', { someOtherProp: 'value' })
+      },
+      {
+        name: 'event activity with no value',
+        activity: () => createMockActivity('event', 'test')
+      },
+      {
+        name: 'event activity with no name',
+        activity: () => createMockActivity('event', undefined, { voice: {} })
       }
     ];

@@ -63,22 +72,37 @@ describe('isVoiceActivity', () => {
     });
   });

-  describe('Real-world voice event types', () => {
-    const voiceEventTypes = [
-      'input_audio_buffer.append',
-      'input_audio_buffer.speech_started',
-      'input_audio_buffer.speech_stopped',
-      'conversation.item.input_audio_transcription.completed',
-      'response.audio.delta',
-      'response.audio_transcript.delta',
-      'response.audio_transcript.done',
-      'response.done',
-      'session.update',
-      'response.cancel'
+  describe('Real-world voice activity scenarios', () => {
+    const voiceScenarios = [
+      {
+        name: 'session.update with speech detected state',
+        eventName: 'session.update',
+        voiceProps: { bot_state: 'voice.request.detected', message: 'Your request is identified' }
+      },
+      {
+        name: 'session.update with processing state',
+        eventName: 'session.update',
+        voiceProps: { bot_state: 'voice.request.processing', message: 'Your request is being processed' }
+      },
+      {
+        name: 'stream.end with user transcription',
+        eventName: 'stream.end',
+        voiceProps: { transcription: 'My destination is Bangalore', origin: 'user' }
+      },
+      {
+        name: 'stream.chunk with server audio response',
+        eventName: 'stream.chunk',
+        voiceProps: { contentUrl: 'base64chunk' }
+      },
+      {
+        name: 'stream.end with bot transcription',
+        eventName: 'stream.end',
+        voiceProps: { transcription: 'Your destination is at 1000m above sea level', origin: 'bot' }
+      }
     ];

-    test.each(voiceEventTypes)('should return true for voice event type: %s', eventType => {
-      const activity = createMockVoiceActivity(eventType);
+    test.each(voiceScenarios)('should return true for $name', ({ eventName, voiceProps }) => {
+      const activity = createMockVoiceActivity(eventName, voiceProps);

       const result = isVoiceActivity(activity);

diff --git a/packages/core/src/utils/voiceActivity/isVoiceActivity.ts b/packages/core/src/utils/voiceActivity/isVoiceActivity.ts
index e16154e590..a17937d8ba 100644
--- a/packages/core/src/utils/voiceActivity/isVoiceActivity.ts
+++ b/packages/core/src/utils/voiceActivity/isVoiceActivity.ts
@@ -1,14 +1,17 @@
 import { WebChatActivity } from '../../types/WebChatActivity';

-// This is interim type guard until activity protocol is ratified.
+// This is an interim type guard until the activity protocol is ratified.
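+//
+// A hypothetical example of the accepted shape, matching the specs in this patch:
+//   { type: 'event', name: 'stream.chunk', value: { voice: { contentUrl: '<base64>' } } }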
 const isVoiceActivity = (
   activity: WebChatActivity
 ): activity is WebChatActivity & {
-  value: { voiceLiveEvent: any };
+  name: string;
+  type: 'event';
+  value: { voice: any };
 } =>
   activity.type === 'event' &&
-  activity.value &&
+  !!activity.name &&
+  !!activity.value &&
   typeof activity.value === 'object' &&
-  'voiceLiveEvent' in activity.value;
+  'voice' in activity.value;

 export default isVoiceActivity;
diff --git a/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.spec.ts b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.spec.ts
new file mode 100644
index 0000000000..e061e24813
--- /dev/null
+++ b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.spec.ts
@@ -0,0 +1,164 @@
+import isVoiceTranscriptActivity from './isVoiceTranscriptActivity';
+import { WebChatActivity } from '../../types/WebChatActivity';
+
+// Mock activity factory for testing
+const createMockActivity = (type: string = 'event', name?: string, value?: any): WebChatActivity => ({
+  type: type as any,
+  id: 'test-activity-id',
+  from: { id: 'test-user' },
+  channelData: {
+    'webchat:sequence-id': 1
+  },
+  ...(name && { name }),
+  ...(value && { value })
+});
+
+const createMockVoiceActivity = (name: string, voiceProps: Record<string, unknown>): WebChatActivity =>
+  createMockActivity('event', name, {
+    voice: voiceProps
+  });
+
+describe('isVoiceTranscriptActivity', () => {
+  describe('Valid transcript activities', () => {
+    test('should return true for stream.end with user transcription', () => {
+      const activity = createMockVoiceActivity('stream.end', {
+        transcription: 'Hello world',
+        origin: 'user'
+      });
+
+      const result = isVoiceTranscriptActivity(activity);
+
+      expect(result).toBe(true);
+    });
+
+    test('should return true for stream.end with bot transcription', () => {
+      const activity = createMockVoiceActivity('stream.end', {
+        transcription: 'Hi there!',
+        origin: 'bot'
+      });
+
+      const result = isVoiceTranscriptActivity(activity);
+
+      expect(result).toBe(true);
+    });
+
+    test('should return true for stream.end with empty transcription string', () => {
+      const activity = createMockVoiceActivity('stream.end', {
+        transcription: '',
+        origin: 'user'
+      });
+
+      const result = isVoiceTranscriptActivity(activity);
+
+      expect(result).toBe(true);
+    });
+  });
+
+  describe('Invalid activities', () => {
+    const testCases = [
+      {
+        name: 'stream.chunk voice activity',
+        activity: () => createMockVoiceActivity('stream.chunk', { contentUrl: 'base64' })
+      },
+      {
+        name: 'session.update voice activity',
+        activity: () => createMockVoiceActivity('session.update', { bot_state: 'voice.request.detected' })
+      },
+      {
+        name: 'stream.end without transcription',
+        activity: () => createMockVoiceActivity('stream.end', { origin: 'user' })
+      },
+      {
+        name: 'stream.end with non-string transcription',
+        activity: () => createMockVoiceActivity('stream.end', { transcription: 123, origin: 'user' })
+      },
+      {
+        name: 'stream.end with null transcription',
+        activity: () => createMockVoiceActivity('stream.end', { transcription: null, origin: 'user' })
+      },
+      {
+        name: 'regular message activity',
+        activity: () => createMockActivity('message', 'test')
+      },
+      {
+        name: 'typing activity',
+        activity: () => createMockActivity('typing')
+      },
+      {
+        name: 'event activity without voice data',
+        activity: () => createMockActivity('event', 'stream.end', { someData: 'test' })
+      },
+      {
+        name: 'event activity with null value',
+        activity: () => ({ ...createMockActivity('event', 'stream.end'), value: null })
+      },
+      {
+        name: 'event
activity without value', + activity: () => createMockActivity('event', 'stream.end') + }, + { + name: 'event activity without name', + activity: () => createMockActivity('event', undefined, { voice: { transcription: 'test' } }) + } + ]; + + test.each(testCases)('should return false for $name', ({ activity }) => { + const result = isVoiceTranscriptActivity(activity()); + + expect(result).toBe(false); + }); + }); + + describe('Real-world scenarios', () => { + test('should identify user transcript in conversation flow', () => { + const conversationActivities = [ + createMockVoiceActivity('session.update', { bot_state: 'voice.request.detected' }), + createMockVoiceActivity('session.update', { bot_state: 'voice.request.processing' }), + createMockVoiceActivity('stream.end', { + transcription: 'What is the weather today?', + origin: 'user' + }) + ]; + + const transcriptResults = conversationActivities.map(activity => isVoiceTranscriptActivity(activity)); + + expect(transcriptResults).toEqual([false, false, true]); + }); + + test('should identify bot transcript in response flow', () => { + const responseActivities = [ + createMockVoiceActivity('session.update', { bot_state: 'voice.response.available' }), + createMockVoiceActivity('stream.chunk', { contentUrl: 'chunk1' }), + createMockVoiceActivity('stream.chunk', { contentUrl: 'chunk2' }), + createMockVoiceActivity('stream.end', { + transcription: 'Today will be sunny with a high of 75 degrees.', + origin: 'bot' + }) + ]; + + const transcriptResults = responseActivities.map(activity => isVoiceTranscriptActivity(activity)); + + expect(transcriptResults).toEqual([false, false, false, true]); + }); + + test('should handle complete conversation with mixed activities', () => { + const mixedActivities = [ + createMockActivity('message', 'test'), + createMockVoiceActivity('stream.end', { + transcription: 'Hello', + origin: 'user' + }), + createMockVoiceActivity('stream.chunk', { contentUrl: 'audio' }), + createMockVoiceActivity('stream.end', { + transcription: 'Hi there!', + origin: 'bot' + }), + createMockActivity('typing') + ]; + + const transcriptResults = mixedActivities.map(activity => isVoiceTranscriptActivity(activity)); + + expect(transcriptResults).toEqual([false, true, false, true, false]); + }); + }); +}); diff --git a/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts new file mode 100644 index 0000000000..f6da7a746a --- /dev/null +++ b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts @@ -0,0 +1,18 @@ +import isVoiceActivity from './isVoiceActivity'; +import { WebChatActivity } from '../../types/WebChatActivity'; + +const isVoiceTranscriptActivity = ( + activity: WebChatActivity +): activity is WebChatActivity & { + value: { + voice: { + transcription: string; + origin: 'user' | 'bot'; + }; + }; +} => + isVoiceActivity(activity) && + activity.name === 'stream.end' && + typeof activity.value?.voice?.transcription === 'string'; + +export default isVoiceTranscriptActivity; From 0838e44d585e450425b04990ceea415d754981fe Mon Sep 17 00:00:00 2001 From: Pranav Joshi Date: Thu, 8 Jan 2026 10:54:44 +0000 Subject: [PATCH 4/4] refactor composer to not use direct state inside effect --- .../api/src/hooks/internal/useStateRef.ts | 31 +++++++++ .../SpeechToSpeech/SpeechToSpeechComposer.tsx | 64 ++++++++++++------- .../isVoiceTranscriptActivity.ts | 2 +- 3 files changed, 73 insertions(+), 24 deletions(-) create mode 100644 
packages/api/src/hooks/internal/useStateRef.ts
diff --git a/packages/api/src/hooks/internal/useStateRef.ts b/packages/api/src/hooks/internal/useStateRef.ts
new file mode 100644
index 0000000000..a6f517fcc3
--- /dev/null
+++ b/packages/api/src/hooks/internal/useStateRef.ts
@@ -0,0 +1,31 @@
+import { useCallback, useRef, useState } from 'react';
+
+import type { Dispatch, MutableRefObject, SetStateAction } from 'react';
+
+// Like useState, but also returns a mutable ref that always holds the latest
+// value, so effects and callbacks can read the current state without
+// re-subscribing to it.
+export default function useStateRef<T>(
+  initialValue?: T
+): readonly [T, Dispatch<SetStateAction<T>>, MutableRefObject<T>] {
+  const [_, forceRender] = useState<object>();
+  const valueRef: MutableRefObject<T> = useRef(initialValue as T);
+
+  const setter: Dispatch<SetStateAction<T>> = useCallback(
+    (value: SetStateAction<T>) => {
+      const { current } = valueRef;
+
+      value = value instanceof Function ? value(current) : value;
+
+      if (current !== value) {
+        valueRef.current = value;
+
+        forceRender({});
+      }
+    },
+    [forceRender, valueRef]
+  );
+
+  return Object.freeze([valueRef.current, setter, valueRef]) as readonly [
+    T,
+    Dispatch<SetStateAction<T>>,
+    MutableRefObject<T>
+  ];
+}
diff --git a/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
index b1f978e42c..70d9c3aa4a 100644
--- a/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
+++ b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
@@ -1,21 +1,22 @@
-import React, { useCallback, useEffect, useMemo, useRef, useState, type ReactNode } from 'react';
+import React, { useCallback, useEffect, useMemo, useRef, type ReactNode } from 'react';
 import { isVoiceActivity, WebChatActivity } from 'botframework-webchat-core';
 import { useAudioPlayer } from './private/useAudioPlayer';
 import { useRecorder } from './private/useRecorder';
 import { useDebouncedNotifications, usePostActivity, useVoiceActivities } from '../../hooks';
 import SpeechToSpeechContext from './private/Context';
 import { SpeechState } from './types/SpeechState';
+import useStateRef from '../../hooks/internal/useStateRef';

 export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }> = ({ children }) => {
   const [voiceActivities] = useVoiceActivities();
   const postActivity = usePostActivity();
   const [{ connectivitystatus }] = useDebouncedNotifications();
   const lastProcessedIndexRef = useRef(0);
-  const [speechState, setSpeechState] = useState('idle');
+  const [speechState, setSpeechState] = useStateRef<SpeechState>('idle');

-  // config received from server on session start, for now ccv2 and mmrt runs on different sample rate and chunk interval.
+  // config received from server on session init (only once); for now ccv2 and mmrt run on different sample rates and chunk intervals.
   // we will read that config as a free-form object, since we are unsure what session config will be needed in the future.
-  const [serverConfig, setServerConfig] = useState<Record<string, unknown> | null>(null);
+  const [serverConfig, setServerConfig] = useStateRef<Record<string, unknown> | null>(null);
   const { playAudio, stopAudio, isPlaying } = useAudioPlayer(serverConfig);

   const isConnected = useMemo(() => connectivitystatus?.message === 'connected', [connectivitystatus]);
@@ -42,27 +43,44 @@ export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }>
       const { name, value } = activity;
       const { voice } = value;

-      // TODO - this will be commandResult activity and not event, need to think on handling of command and commandResult activities.
-      if (name === 'session.init' && value.session?.config) {
-        setServerConfig(value.session.config as Record<string, unknown>);
-      } else if (name === 'session.update') {
-        switch (voice.bot_state) {
-          case 'voice.request.detected':
-            stopAudio();
-            setSpeechState('listening');
-            break;
-          case 'voice.request.processing':
-            setSpeechState('processing');
-            break;
-          default:
-            break;
+      switch (name) {
+        // TODO - this will be a commandResult activity rather than an event; need to think through handling of command and commandResult activities.
+        case 'session.init': {
+          if (value.session?.config) {
+            setServerConfig(value.session.config as Record<string, unknown>);
+          }
+
+          break;
         }
-      } else if (name === 'stream.chunk' && voice.contentUrl) {
-        playAudio(voice.contentUrl);
+
+        case 'session.update': {
+          switch (voice.bot_state) {
+            case 'voice.request.detected':
+              stopAudio();
+              setSpeechState('listening');
+              break;
+
+            case 'voice.request.processing':
+              setSpeechState('processing');
+              break;
+
+            default:
+              break;
+          }
+          break;
+        }
+
+        case 'stream.chunk': {
+          if (voice.contentUrl) {
+            playAudio(voice.contentUrl);
+          }
+          break;
+        }
+
+        default:
+          break;
       }
     },
-    [playAudio, stopAudio]
+    [playAudio, setServerConfig, setSpeechState, stopAudio]
   );
+
   useEffect(() => {
     const startIndex = lastProcessedIndexRef.current;

     if (!voiceActivities.length || startIndex >= voiceActivities.length) {
       return;
     }
@@ -93,7 +111,7 @@ export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }>
     }

     lastProcessedIndexRef.current = voiceActivities.length;
-  }, [voiceActivities, recording, isPlaying, speechState, handleVoiceActivity]);
+  }, [handleVoiceActivity, isPlaying, recording, setSpeechState, speechState, voiceActivities]);

   const setRecording = useCallback(
     async (shouldRecord: boolean) => {
@@ -110,7 +128,7 @@ export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }>

       await baseSetRecording(shouldRecord);
     },
-    [isConnected, baseSetRecording, stopAudio]
+    [isConnected, baseSetRecording, setSpeechState, stopAudio]
   );

   const contextValue = useMemo(
diff --git a/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts
index f6da7a746a..c6ae5bd742 100644
--- a/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts
+++ b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts
@@ -7,7 +7,7 @@ const isVoiceTranscriptActivity = (
   value: {
     voice: {
       transcription: string;
-      origin: 'user' | 'bot';
+      origin: 'user' | 'agent';
     };
   };
 } =>