From d132592ddb57a1f7c3d9ee35b94bf88679408048 Mon Sep 17 00:00:00 2001 From: Pranav Joshi Date: Fri, 12 Dec 2025 13:26:30 +0000 Subject: [PATCH 1/4] initial no-op s2s core implementation --- packages/api/src/hooks/useSpeechToSpeech.ts | 3 + packages/api/src/hooks/useVoiceActivities.ts | 11 + .../SpeechToSpeech/SpeechToSpeechComposer.tsx | 134 +++++++++ .../SpeechToSpeech/private/Context.ts | 14 + .../private/useAudioPlayer.spec.tsx | 279 ++++++++++++++++++ .../SpeechToSpeech/private/useAudioPlayer.ts | 69 +++++ .../SpeechToSpeech/private/useContext.ts | 15 + .../private/useRecorder.spec.tsx | 160 ++++++++++ .../SpeechToSpeech/private/useRecorder.ts | 128 ++++++++ .../SpeechToSpeech/types/SpeechState.ts | 1 + .../SpeechToSpeech/useSpeechToSpeech.ts | 6 + packages/core/src/index.ts | 2 + .../voiceActivity/isVoiceActivity.spec.ts | 88 ++++++ .../utils/voiceActivity/isVoiceActivity.ts | 14 + 14 files changed, 924 insertions(+) create mode 100644 packages/api/src/hooks/useSpeechToSpeech.ts create mode 100644 packages/api/src/hooks/useVoiceActivities.ts create mode 100644 packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx create mode 100644 packages/api/src/providers/SpeechToSpeech/private/Context.ts create mode 100644 packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx create mode 100644 packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts create mode 100644 packages/api/src/providers/SpeechToSpeech/private/useContext.ts create mode 100644 packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx create mode 100644 packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts create mode 100644 packages/api/src/providers/SpeechToSpeech/types/SpeechState.ts create mode 100644 packages/api/src/providers/SpeechToSpeech/useSpeechToSpeech.ts create mode 100644 packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts create mode 100644 packages/core/src/utils/voiceActivity/isVoiceActivity.ts diff --git a/packages/api/src/hooks/useSpeechToSpeech.ts b/packages/api/src/hooks/useSpeechToSpeech.ts new file mode 100644 index 0000000000..4f529a2c08 --- /dev/null +++ b/packages/api/src/hooks/useSpeechToSpeech.ts @@ -0,0 +1,3 @@ +import useSpeechToSpeech from '../providers/SpeechToSpeech/useSpeechToSpeech'; + +export default useSpeechToSpeech; diff --git a/packages/api/src/hooks/useVoiceActivities.ts b/packages/api/src/hooks/useVoiceActivities.ts new file mode 100644 index 0000000000..d65e142b17 --- /dev/null +++ b/packages/api/src/hooks/useVoiceActivities.ts @@ -0,0 +1,11 @@ +import { isVoiceActivity, type WebChatActivity } from 'botframework-webchat-core'; +import { useSelector } from './internal/WebChatReduxContext'; + +const activitiesSelector = (state: { activities: WebChatActivity[] }) => state.activities; + +const of = (predicate: (activity: WebChatActivity) => boolean) => (state: { activities: WebChatActivity[] }) => + activitiesSelector(state).filter(predicate); + +export default function useVoiceActivities(): [WebChatActivity[]] { + return [useSelector(of(activity => isVoiceActivity(activity)))]; +} diff --git a/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx new file mode 100644 index 0000000000..0ccf1a6f32 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx @@ -0,0 +1,134 @@ +import React, { useCallback, useEffect, useMemo, useRef, useState, type ReactNode } from 'react'; +import { 
isVoiceActivity, WebChatActivity } from 'botframework-webchat-core';
+import { useAudioPlayer } from './private/useAudioPlayer';
+import { useRecorder } from './private/useRecorder';
+import { useDebouncedNotifications, usePostActivity, useVoiceActivities } from '../../hooks';
+import SpeechToSpeechContext from './private/Context';
+import { SpeechState } from './types/SpeechState';
+
+export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }> = ({ children }) => {
+  const [voiceActivities] = useVoiceActivities();
+  const postActivity = usePostActivity();
+  const [{ connectivitystatus }] = useDebouncedNotifications();
+  const { playAudio, stopAudio, isPlaying } = useAudioPlayer();
+
+  const lastProcessedIndexRef = useRef(0);
+
+  // Remove when the activity protocol changes land; we would then get this as part of a signal activity.
+  const [speechState, setSpeechState] = useState<SpeechState>('idle');
+
+  const isConnected = useMemo(() => connectivitystatus?.message === 'connected', [connectivitystatus]);
+
+  const sendAudioChunk = useCallback(
+    (base64: string) => {
+      postActivity({
+        type: 'event',
+        name: 'stream.chunk',
+        value: { voiceLiveEvent: { type: 'input_audio_buffer.append', audio: base64 } }
+      } as any);
+    },
+    [postActivity]
+  );
+
+  const { recording, setRecording: baseSetRecording } = useRecorder(sendAudioChunk);
+
+  const cancelActiveResponse = useCallback(() => {
+    if (isPlaying) {
+      postActivity({
+        type: 'event',
+        value: { voiceLiveEvent: { type: 'response.cancel' } }
+      } as any);
+    }
+  }, [isPlaying, postActivity]);
+
+  const handleVoiceActivity = useCallback(
+    (activity: WebChatActivity) => {
+      if (!isVoiceActivity(activity)) {
+        return;
+      }
+
+      const { voiceLiveEvent } = activity.value;
+
+      switch (voiceLiveEvent.type) {
+        case 'input_audio_buffer.speech_started':
+          stopAudio();
+          setSpeechState('listening');
+          break;
+        case 'input_audio_buffer.speech_stopped':
+          setSpeechState('processing');
+          break;
+        case 'response.audio.delta':
+          if (voiceLiveEvent.delta && recording) {
+            playAudio(voiceLiveEvent.delta);
+          }
+          break;
+        case 'response.done':
+          if (!isPlaying) {
+            setSpeechState('listening');
+          }
+          break;
+        default:
+          break;
+      }
+    },
+    [isPlaying, playAudio, recording, stopAudio]
+  );
+
+  useEffect(() => {
+    const startIndex = lastProcessedIndexRef.current;
+
+    if (!voiceActivities.length || startIndex >= voiceActivities.length) {
+      return;
+    }
+
+    // If not recording, skip processing voice activities but update ref
+    // so next time we start recording, we only process new activities.
+    if (!recording) {
+      lastProcessedIndexRef.current = voiceActivities.length;
+      return;
+    }
+
+    for (let i = startIndex; i < voiceActivities.length; i++) {
+      // eslint-disable-next-line security/detect-object-injection
+      handleVoiceActivity(voiceActivities[i]);
+    }
+
+    if (isPlaying && speechState !== 'bot_speaking') {
+      setSpeechState('bot_speaking');
+    } else if (!isPlaying && speechState === 'bot_speaking') {
+      setSpeechState('listening');
+    }
+
+    lastProcessedIndexRef.current = voiceActivities.length;
+  }, [voiceActivities, recording, postActivity, isPlaying, playAudio, speechState, stopAudio, handleVoiceActivity]);
+
+  const setRecording = useCallback(
+    (shouldRecord: boolean) => {
+      if (!isConnected) {
+        return;
+      }
+
+      if (!recording) {
+        setSpeechState('listening');
+      } else {
+        stopAudio();
+        cancelActiveResponse();
+        setSpeechState('idle');
+      }
+
+      baseSetRecording(shouldRecord);
+    },
+    [isConnected, recording, baseSetRecording, stopAudio, cancelActiveResponse]
+  );
+
+  const contextValue = useMemo(
+    () => ({
+      recording,
+      setRecording,
+      speechState
+    }),
+    [recording, setRecording, speechState]
+  );
+
+  return <SpeechToSpeechContext.Provider value={contextValue}>{children}</SpeechToSpeechContext.Provider>;
+};
diff --git a/packages/api/src/providers/SpeechToSpeech/private/Context.ts b/packages/api/src/providers/SpeechToSpeech/private/Context.ts
new file mode 100644
index 0000000000..ce85310246
--- /dev/null
+++ b/packages/api/src/providers/SpeechToSpeech/private/Context.ts
@@ -0,0 +1,14 @@
+import { createContext } from 'react';
+import { SpeechState } from '../types/SpeechState';
+
+type SpeechToSpeechContextType = {
+  recording: boolean;
+  setRecording: (recording: boolean) => void;
+  speechState: SpeechState;
+};
+
+const SpeechToSpeechContext = createContext<SpeechToSpeechContextType>(undefined!);
+
+export default SpeechToSpeechContext;
+
+export type { SpeechToSpeechContextType };
diff --git a/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx
new file mode 100644
index 0000000000..8c1d42cb08
--- /dev/null
+++ b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx
@@ -0,0 +1,279 @@
+/** @jest-environment @happy-dom/jest-environment */
+///
+///
+
+import { render, type RenderResult } from '@testing-library/react';
+import React, { type ComponentType } from 'react';
+import { useAudioPlayer } from './useAudioPlayer';
+
+// Mock AudioContext and related APIs
+const mockAudioContext = {
+  sampleRate: 24000,
+  currentTime: 0,
+  destination: {},
+  state: 'running',
+  resume: jest.fn().mockResolvedValue(undefined),
+  close: jest.fn().mockResolvedValue(undefined),
+  createBuffer: jest.fn(),
+  createBufferSource: jest.fn()
+};
+
+const mockAudioBuffer = {
+  duration: 0.1, // 100ms
+  getChannelData: jest.fn().mockReturnValue(new Float32Array(2400))
+};
+
+const mockBufferSource = {
+  buffer: null,
+  connect: jest.fn(),
+  start: jest.fn(),
+  stop: jest.fn(),
+  disconnect: jest.fn(),
+  onended: null
+};
+
+// Mock global AudioContext
+global.AudioContext = jest.fn(() => mockAudioContext) as any;
+global.atob = jest.fn(str => str); // Simple mock for base64 decode
+
+type UseAudioPlayerReturn = ReturnType<typeof useAudioPlayer>;
+
+describe('setup', () => {
+  let HookApp: ComponentType;
+  let hookData: UseAudioPlayerReturn | undefined;
+  let renderResult: RenderResult;
+  const originalAudioContext = global.AudioContext;
+
+  beforeEach(() => {
+    jest.clearAllMocks();
+    mockAudioContext.currentTime = 0;
+    mockAudioContext.createBuffer.mockReturnValue(mockAudioBuffer);
+
mockAudioContext.createBufferSource.mockReturnValue(mockBufferSource); + mockBufferSource.buffer = null; + mockBufferSource.onended = null; + + HookApp = () => { + hookData = useAudioPlayer(); + return null; + }; + }); + + afterEach(() => { + global.AudioContext = originalAudioContext; + }); + + describe('Initialization', () => { + test('should initialize with correct default values', () => { + render(); + + expect(hookData?.isPlaying).toBe(false); + expect(typeof hookData?.playAudio).toBe('function'); + expect(typeof hookData?.stopAudio).toBe('function'); + }); + + test('should create AudioContext on first playAudio call', () => { + render(); + + hookData?.playAudio('dGVzdA=='); // base64 for 'test' + + expect(AudioContext).toHaveBeenCalledWith({ sampleRate: 24000 }); + }); + + test('should reuse existing AudioContext on subsequent calls', () => { + render(); + + hookData?.playAudio('dGVzdA=='); + hookData?.playAudio('dGVzdDI='); + + expect(AudioContext).toHaveBeenCalledTimes(1); + }); + }); + + describe('Audio playback', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should process base64 audio data correctly', () => { + hookData?.playAudio('dGVzdA=='); + + expect(global.atob).toHaveBeenCalledWith('dGVzdA=='); + expect(mockAudioContext.createBuffer).toHaveBeenCalledWith(1, expect.any(Number), 24000); + expect(mockAudioContext.createBufferSource).toHaveBeenCalled(); + }); + + test('should set up audio buffer source correctly', () => { + hookData?.playAudio('dGVzdA=='); + + expect(mockBufferSource.connect).toHaveBeenCalledWith(mockAudioContext.destination); + expect(mockBufferSource.start).toHaveBeenCalled(); + expect(mockBufferSource.buffer).toBe(mockAudioBuffer); + }); + + test('should resume AudioContext if needed', () => { + hookData?.playAudio('dGVzdA=='); + + expect(mockAudioContext.resume).toHaveBeenCalled(); + }); + + test('should queue multiple audio chunks correctly', () => { + mockAudioBuffer.duration = 0.1; // 100ms + + hookData?.playAudio('dGVzdA=='); + hookData?.playAudio('dGVzdDI='); + + expect(mockBufferSource.start).toHaveBeenCalledTimes(2); + // First chunk starts at currentTime (0), second at 0.1 + expect(mockBufferSource.start).toHaveBeenNthCalledWith(1, 0); + expect(mockBufferSource.start).toHaveBeenNthCalledWith(2, 0.1); + }); + }); + + describe('isPlaying state', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should return true when audio is queued for playback', () => { + mockAudioContext.currentTime = 0; + mockAudioBuffer.duration = 0.1; + + hookData?.playAudio('dGVzdA=='); + renderResult.rerender(); + + expect(hookData?.isPlaying).toBe(true); + }); + + test('should return false when no audio is queued', () => { + expect(hookData?.isPlaying).toBe(false); + }); + + test('should handle multiple chunks and playing state', () => { + mockAudioContext.currentTime = 0.05; // In the middle of first chunk + mockAudioBuffer.duration = 0.1; + + hookData?.playAudio('dGVzdA=='); // 0 - 0.1 + hookData?.playAudio('dGVzdDI='); // 0.1 - 0.2 + renderResult.rerender(); + + expect(hookData?.isPlaying).toBe(true); + }); + }); + + describe('Audio cleanup', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should clean up buffer source on ended', () => { + hookData?.playAudio('dGVzdA=='); + + // Simulate audio ended + if (mockBufferSource.onended) { + mockBufferSource.onended(); + } + + expect(mockBufferSource.disconnect).toHaveBeenCalled(); + expect(mockBufferSource.buffer).toBeNull(); + }); + + test('should stop 
all audio and close context', () => { + hookData?.playAudio('dGVzdA=='); + + hookData?.stopAudio(); + renderResult.rerender(); + + expect(mockAudioContext.close).toHaveBeenCalled(); + expect(hookData?.isPlaying).toBe(false); + }); + }); + + describe('Error handling', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should handle invalid base64 data gracefully', () => { + expect(() => { + hookData?.playAudio('invalid-base64!@#'); + }).not.toThrow(); + }); + + test('should handle AudioContext creation failure', () => { + global.AudioContext = jest.fn(() => { + throw new Error('AudioContext not supported'); + }) as any; + + expect(() => { + hookData?.playAudio('dGVzdA=='); + }).toThrow('AudioContext not supported'); + }); + + test('should handle missing audio context in isPlaying', () => { + // Before any audio is played, audioCtxRef should be null + expect(hookData?.isPlaying).toBe(false); + }); + }); + + describe('Real-world scenarios', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should handle streaming audio chunks', () => { + mockAudioBuffer.duration = 0.05; // 50ms chunks + + // Simulate streaming 5 chunks + for (let i = 0; i < 5; i++) { + hookData?.playAudio(`chunk${i}`); + } + + expect(mockBufferSource.start).toHaveBeenCalledTimes(5); + renderResult.rerender(); + expect(hookData?.isPlaying).toBe(true); + }); + + test('should handle playback interruption', () => { + hookData?.playAudio('dGVzdA=='); + renderResult.rerender(); + expect(hookData?.isPlaying).toBe(true); + + hookData?.stopAudio(); + renderResult.rerender(); + expect(hookData?.isPlaying).toBe(false); + expect(mockAudioContext.close).toHaveBeenCalled(); + }); + + test('should handle resume after stop', () => { + // Play, stop, then play again + hookData?.playAudio('dGVzdA=='); + hookData?.stopAudio(); + hookData?.playAudio('dGVzdDI='); + + expect(AudioContext).toHaveBeenCalledTimes(2); // New context after stop + }); + }); + + describe('Performance considerations', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should handle large audio data', () => { + const largeBase64 = 'A'.repeat(10000); + + expect(() => { + hookData?.playAudio(largeBase64); + }).not.toThrow(); + }); + + test('should handle rapid successive calls', () => { + for (let i = 0; i < 100; i++) { + // Ensure the mock "base64" data has an even length as Int16Array (which represents 16-bit audio samples) requires the underlying data to be in multiples of 2 bytes + hookData?.playAudio(`chunk${i}`.padEnd(8, ' ')); + } + + expect(mockBufferSource.start).toHaveBeenCalledTimes(100); + }); + }); +}); diff --git a/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts new file mode 100644 index 0000000000..6216932a8c --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts @@ -0,0 +1,69 @@ +import { useRef, useCallback } from 'react'; + +const SAMPLE_RATE = 24000; +const INT16_SCALE = 32768; + +export function useAudioPlayer() { + const audioCtxRef = useRef(null); + const nextPlayTimeRef = useRef(0); + + const initAudio = useCallback(() => { + if (!audioCtxRef.current) { + audioCtxRef.current = new AudioContext({ sampleRate: SAMPLE_RATE }); + } + return audioCtxRef.current; + }, []); + + const playAudio = useCallback( + (base64: string) => { + const audioCtx = initAudio(); + audioCtx.resume?.(); + + try { + const bytes = Uint8Array.from(atob(base64), c => c.charCodeAt(0)); + const 
int16 = new Int16Array(bytes.buffer); + const float32 = new Float32Array(int16.length); + + for (let i = 0; i < int16.length; i++) { + // eslint-disable-next-line security/detect-object-injection + float32[i] = int16[i] / INT16_SCALE; + } + + const buffer = audioCtx.createBuffer(1, float32.length, SAMPLE_RATE); + buffer.getChannelData(0).set(float32); + + const src = audioCtx.createBufferSource(); + src.buffer = buffer; + src.connect(audioCtx.destination); + + // Clear buffer when finished + src.onended = () => { + src.disconnect(); + src.buffer = null; + }; + + nextPlayTimeRef.current = Math.max(nextPlayTimeRef.current, audioCtx.currentTime); + src.start(nextPlayTimeRef.current); + nextPlayTimeRef.current += buffer.duration; + } catch (error) { + console.warn('botframework-webchat: Error during audio playback in useAudioPlayer:', error); + } + }, + [initAudio] + ); + + const stopAudio = useCallback(() => { + nextPlayTimeRef.current = 0; + + if (audioCtxRef.current) { + audioCtxRef.current.close(); + audioCtxRef.current = null; + } + }, []); + + return { + playAudio, + stopAudio, + isPlaying: audioCtxRef.current ? audioCtxRef.current.currentTime < nextPlayTimeRef.current : false + }; +} diff --git a/packages/api/src/providers/SpeechToSpeech/private/useContext.ts b/packages/api/src/providers/SpeechToSpeech/private/useContext.ts new file mode 100644 index 0000000000..50926b0a12 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/useContext.ts @@ -0,0 +1,15 @@ +import { useContext } from 'react'; + +import SpeechToSpeechContext from './Context'; + +import type { SpeechToSpeechContextType } from './Context'; + +export default function useSpeechToSpeechContext(thrownOnUndefined = true): SpeechToSpeechContextType { + const contextValue = useContext(SpeechToSpeechContext); + + if (thrownOnUndefined && !contextValue) { + throw new Error('botframework-webchat internal: This hook can only be used under .'); + } + + return contextValue; +} diff --git a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx new file mode 100644 index 0000000000..01368ceda2 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx @@ -0,0 +1,160 @@ +/** @jest-environment @happy-dom/jest-environment */ +/// + +import { act, render, waitFor, type RenderResult } from '@testing-library/react'; +import React, { type ComponentType } from 'react'; +import { useRecorder } from './useRecorder'; + +// --- Mocks --- + +const mockTrack = { + stop: jest.fn() +}; + +const mockMediaStream = { + getTracks: jest.fn(() => [mockTrack]) +}; + +const mockMediaDevices = { + getUserMedia: jest.fn().mockResolvedValue(mockMediaStream) +}; + +const mockWorkletPort = { + postMessage: jest.fn(), + onmessage: null as ((event: { data: any }) => void) | null +}; + +const mockWorkletNode = { + connect: jest.fn(), + disconnect: jest.fn(), + port: mockWorkletPort +}; + +const mockAudioContext = { + state: 'running', + resume: jest.fn().mockResolvedValue(undefined), + createMediaStreamSource: jest.fn(() => ({ + connect: jest.fn() + })), + destination: {}, + audioWorklet: { + addModule: jest.fn().mockResolvedValue(undefined) + } +}; + +// --- Global Mocks Setup --- + +Object.defineProperty(global.navigator, 'mediaDevices', { + value: mockMediaDevices, + writable: true +}); + +global.AudioContext = jest.fn(() => mockAudioContext as any); +global.AudioWorkletNode = jest.fn(() => mockWorkletNode as any); +global.Blob = 
jest.fn(parts => ({ parts, type: parts[1]?.type })) as any; +global.URL.createObjectURL = jest.fn(() => 'blob:http://localhost/mock-url'); +global.URL.revokeObjectURL = jest.fn(); +global.btoa = jest.fn(str => `btoa(${str})`); + +// --- Tests --- + +describe('useRecorder', () => { + let onAudioChunk: jest.Mock; + let HookApp: ComponentType<{ onAudioChunk: (base64: string) => void }>; + let hookData: ReturnType | undefined; + // eslint-disable-next-line @typescript-eslint/no-unused-vars + let renderResult: RenderResult; + + beforeEach(() => { + // Clear all mocks before each test + jest.clearAllMocks(); + onAudioChunk = jest.fn(); + hookData = undefined; + mockWorkletPort.onmessage = null; + (mockAudioContext.state as any) = 'running'; + + HookApp = ({ onAudioChunk }) => { + hookData = useRecorder(onAudioChunk); + return null; + }; + }); + + test('should be initially not recording', () => { + render(); + expect(hookData?.recording).toBe(false); + }); + + test('should start recording when setRecording(true) is called', async () => { + renderResult = render(); + + act(() => { + hookData?.setRecording(true); + }); + + await waitFor(() => expect(hookData?.recording).toBe(true)); + + expect(navigator.mediaDevices.getUserMedia).toHaveBeenCalledTimes(1); + expect(global.AudioContext).toHaveBeenCalledTimes(1); + expect(mockAudioContext.audioWorklet.addModule).toHaveBeenCalledTimes(1); + expect(global.AudioWorkletNode).toHaveBeenCalledWith(expect.anything(), 'audio-recorder'); + expect(mockWorkletNode.connect).toHaveBeenCalledTimes(1); + expect(mockWorkletPort.postMessage).toHaveBeenCalledWith({ command: 'START' }); + }); + + test('should stop recording when setRecording(false) is called', async () => { + renderResult = render(); + + // Start recording + act(() => { + hookData?.setRecording(true); + }); + + await waitFor(() => expect(hookData?.recording).toBe(true)); + + // Stop recording + act(() => { + hookData?.setRecording(false); + }); + + await waitFor(() => expect(hookData?.recording).toBe(false)); + + expect(mockWorkletPort.postMessage).toHaveBeenCalledWith({ command: 'STOP' }); + expect(mockWorkletNode.disconnect).toHaveBeenCalledTimes(1); + expect(mockTrack.stop).toHaveBeenCalledTimes(1); + }); + + test('should process audio chunks sent from the worklet', async () => { + render(); + + act(() => { + hookData?.setRecording(true); + }); + + await waitFor(() => expect(mockWorkletPort.onmessage).not.toBeNull()); + + // Simulate a message from the audio worklet + const mockAudioData = new Float32Array([0.1, 0.2, -0.1]); + act(() => { + mockWorkletPort.onmessage!({ + data: { + eventType: 'audio', + audioData: mockAudioData + } + }); + }); + + await waitFor(() => expect(onAudioChunk).toHaveBeenCalledTimes(1)); + expect(global.btoa).toHaveBeenCalled(); + }); + + test('should handle suspended audio context by resuming it', async () => { + (mockAudioContext.state as any) = 'suspended'; + render(); + + act(() => { + hookData?.setRecording(true); + }); + + await waitFor(() => expect(mockAudioContext.resume).toHaveBeenCalledTimes(1)); + }); +}); diff --git a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts new file mode 100644 index 0000000000..b9930cada1 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts @@ -0,0 +1,128 @@ +import { useRef, useState, useCallback } from 'react'; + +const audioProcessorCode = ` +class AudioRecorderProcessor extends AudioWorkletProcessor { + constructor() { 
+ super() + this.recording = false + this.buffer = [] + this.port.onmessage = e => { + if (e.data.command === 'START') this.recording = true + else if (e.data.command === 'STOP') { + this.recording = false + if (this.buffer.length) this.sendBuffer() + } + } + } + sendBuffer() { + if (this.buffer.length) { + this.port.postMessage({ + eventType: 'audio', + audioData: new Float32Array(this.buffer) + }) + this.buffer = [] + } + } + process(inputs) { + if (inputs[0]?.length && this.recording) { + this.buffer.push(...inputs[0][0]) + if (this.buffer.length >= 2400) this.sendBuffer() + } + return true + } +} +registerProcessor('audio-recorder', AudioRecorderProcessor) +`; + +const INT16_MIN = -32768; +const INT16_MAX = 32767; +const INT16_SCALE = 32767; + +export function useRecorder(onAudioChunk: (base64: string) => void) { + const [recording, setRecordingInternal] = useState(false); + const audioCtxRef = useRef(null); + const workletRef = useRef(null); + const streamRef = useRef(null); + + const initAudio = useCallback(async () => { + if (audioCtxRef.current) { + return; + } + const audioCtx = new AudioContext({ sampleRate: 24000 }); + const blob = new Blob([audioProcessorCode], { + type: 'application/javascript' + }); + // eslint-disable-next-line no-restricted-properties + const url = URL.createObjectURL(blob); + await audioCtx.audioWorklet.addModule(url); + URL.revokeObjectURL(url); + // eslint-disable-next-line require-atomic-updates + audioCtxRef.current = audioCtx; + }, []); + + const startRecording = useCallback(async () => { + await initAudio(); + const audioCtx = audioCtxRef.current!; + if (audioCtx.state === 'suspended') { + await audioCtx.resume(); + } + const stream = await navigator.mediaDevices.getUserMedia({ + audio: { + channelCount: 1, + sampleRate: 24000, + echoCancellation: true + } + }); + streamRef.current = stream; + const source = audioCtx.createMediaStreamSource(stream); + const worklet = new AudioWorkletNode(audioCtx, 'audio-recorder'); + + worklet.port.onmessage = e => { + if (e.data.eventType === 'audio') { + const float32 = e.data.audioData; + const int16 = new Int16Array(float32.length); + for (let i = 0; i < float32.length; i++) { + // eslint-disable-next-line security/detect-object-injection + int16[i] = Math.max(INT16_MIN, Math.min(INT16_MAX, float32[i] * INT16_SCALE)); + } + const base64 = btoa(String.fromCharCode(...new Uint8Array(int16.buffer))); + onAudioChunk(base64); + } + }; + + source.connect(worklet); + worklet.connect(audioCtx.destination); + worklet.port.postMessage({ command: 'START' }); + workletRef.current = worklet; + setRecordingInternal(true); + }, [initAudio, onAudioChunk]); + + const stopRecording = useCallback(() => { + if (workletRef.current) { + workletRef.current.port.postMessage({ command: 'STOP' }); + workletRef.current.disconnect(); + workletRef.current = null; + } + if (streamRef.current) { + streamRef.current.getTracks().forEach(track => track.stop()); + streamRef.current = null; + } + setRecordingInternal(false); + }, []); + + const setRecording = useCallback( + async (shouldRecord: boolean) => { + if (!shouldRecord && recording) { + stopRecording(); + } else if (shouldRecord && !recording) { + await startRecording(); + } + }, + [recording, startRecording, stopRecording] + ); + + return { + recording, + setRecording + }; +} diff --git a/packages/api/src/providers/SpeechToSpeech/types/SpeechState.ts b/packages/api/src/providers/SpeechToSpeech/types/SpeechState.ts new file mode 100644 index 0000000000..62d5cc8c13 --- /dev/null +++ 
b/packages/api/src/providers/SpeechToSpeech/types/SpeechState.ts @@ -0,0 +1 @@ +export type SpeechState = 'idle' | 'listening' | 'processing' | 'bot_speaking'; diff --git a/packages/api/src/providers/SpeechToSpeech/useSpeechToSpeech.ts b/packages/api/src/providers/SpeechToSpeech/useSpeechToSpeech.ts new file mode 100644 index 0000000000..d7ac3fac44 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/useSpeechToSpeech.ts @@ -0,0 +1,6 @@ +import { SpeechToSpeechContextType } from './private/Context'; +import useSpeechToSpeechContext from './private/useContext'; + +export default function useSpeechToSpeech(): readonly [SpeechToSpeechContextType] { + return [useSpeechToSpeechContext()]; +} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index e635e6a060..c0580223d8 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -81,6 +81,7 @@ import type { DefinedTerm as OrgSchemaDefinedTerm } from './types/external/OrgSc import type { Project as OrgSchemaProject } from './types/external/OrgSchema/Project'; import type { Thing as OrgSchemaThing } from './types/external/OrgSchema/Thing'; import type { UserReview as OrgSchemaUserReview } from './types/external/OrgSchema/UserReview'; +import isVoiceActivity from './utils/voiceActivity/isVoiceActivity'; const Constants = { ActivityClientState, DictateState }; @@ -96,6 +97,7 @@ export { getActivityLivestreamingMetadata, getOrgSchemaMessage, isForbiddenPropertyName, + isVoiceActivity, markActivity, onErrorResumeNext, parseAction, diff --git a/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts b/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts new file mode 100644 index 0000000000..c8d744595e --- /dev/null +++ b/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts @@ -0,0 +1,88 @@ +import isVoiceActivity from './isVoiceActivity'; +import { WebChatActivity } from '../../types/WebChatActivity'; + +// Mock activity factory for testing +const createMockActivity = (type: string = 'event', value?: any): WebChatActivity => ({ + type: type as any, + id: 'test-activity-id', + from: { id: 'test-user' }, + channelData: { + 'webchat:sequence-id': 1 + }, + ...(value && { value }) +}); + +const createMockVoiceActivity = (voiceEventType: string, additionalProps?: any): WebChatActivity => + createMockActivity('event', { + voiceLiveEvent: { + type: voiceEventType, + ...additionalProps + } + }); + +describe('isVoiceActivity', () => { + describe('Valid voice activities', () => { + test('should return true for event activity with voiceLiveEvent', () => { + const activity = createMockVoiceActivity('response.audio.delta', { delta: 'audiodata' }); + + const result = isVoiceActivity(activity); + + expect(result).toBe(true); + }); + + test('should return true for voice activity with minimal voiceLiveEvent', () => { + const activity = createMockActivity('event', { voiceLiveEvent: {} }); + + const result = isVoiceActivity(activity); + + expect(result).toBe(true); + }); + }); + + describe('Invalid activities', () => { + const testCases = [ + // Invalid by activity type + { + name: 'message activity with voiceLiveEvent', + activity: () => createMockActivity('message', { voiceLiveEvent: { type: 'response.audio.delta' } }) + }, + { + name: 'typing activity', + activity: () => createMockActivity('typing') + }, + { + name: 'event activity with value', + activity: () => ({ ...createMockActivity('event'), value: 'not an object' }) + } + ]; + + test.each(testCases)('should return false for $name', ({ activity 
}) => { + const result = isVoiceActivity(activity()); + + expect(result).toBe(false); + }); + }); + + describe('Real-world voice event types', () => { + const voiceEventTypes = [ + 'input_audio_buffer.append', + 'input_audio_buffer.speech_started', + 'input_audio_buffer.speech_stopped', + 'conversation.item.input_audio_transcription.completed', + 'response.audio.delta', + 'response.audio_transcript.delta', + 'response.audio_transcript.done', + 'response.done', + 'session.update', + 'response.cancel' + ]; + + test.each(voiceEventTypes)('should return true for voice event type: %s', eventType => { + const activity = createMockVoiceActivity(eventType); + + const result = isVoiceActivity(activity); + + expect(result).toBe(true); + }); + }); +}); diff --git a/packages/core/src/utils/voiceActivity/isVoiceActivity.ts b/packages/core/src/utils/voiceActivity/isVoiceActivity.ts new file mode 100644 index 0000000000..e16154e590 --- /dev/null +++ b/packages/core/src/utils/voiceActivity/isVoiceActivity.ts @@ -0,0 +1,14 @@ +import { WebChatActivity } from '../../types/WebChatActivity'; + +// This is interim type guard until activity protocol is ratified. +const isVoiceActivity = ( + activity: WebChatActivity +): activity is WebChatActivity & { + value: { voiceLiveEvent: any }; +} => + activity.type === 'event' && + activity.value && + typeof activity.value === 'object' && + 'voiceLiveEvent' in activity.value; + +export default isVoiceActivity; From a98245729479f03422a3a2c04ff33222d3368b56 Mon Sep 17 00:00:00 2001 From: Pranav Joshi Date: Fri, 12 Dec 2025 13:41:48 +0000 Subject: [PATCH 2/4] minor --- packages/core/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index c0580223d8..a81d494a07 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -42,6 +42,7 @@ import getOrgSchemaMessage from './utils/getOrgSchemaMessage'; import isForbiddenPropertyName from './utils/isForbiddenPropertyName'; import onErrorResumeNext from './utils/onErrorResumeNext'; import singleToArray from './utils/singleToArray'; +import isVoiceActivity from './utils/voiceActivity/isVoiceActivity'; export { CLEAR_SUGGESTED_ACTIONS, @@ -81,7 +82,6 @@ import type { DefinedTerm as OrgSchemaDefinedTerm } from './types/external/OrgSc import type { Project as OrgSchemaProject } from './types/external/OrgSchema/Project'; import type { Thing as OrgSchemaThing } from './types/external/OrgSchema/Thing'; import type { UserReview as OrgSchemaUserReview } from './types/external/OrgSchema/UserReview'; -import isVoiceActivity from './utils/voiceActivity/isVoiceActivity'; const Constants = { ActivityClientState, DictateState }; From 9ddc63c8878a8d8eefda425688ec69d5fae74492 Mon Sep 17 00:00:00 2001 From: Pranav Joshi Date: Wed, 7 Jan 2026 17:10:14 +0000 Subject: [PATCH 3/4] refactor to align close to activity structure --- packages/api/src/hooks/index.ts | 4 +- packages/api/src/hooks/useVoiceActivities.ts | 9 +- .../SpeechToSpeech/SpeechToSpeechComposer.tsx | 100 +++++------ .../SpeechToSpeech/private/useAudioPlayer.ts | 12 +- .../private/useRecorder.spec.tsx | 6 +- .../SpeechToSpeech/private/useRecorder.ts | 83 +++++---- .../activities/combineActivitiesReducer.ts | 15 +- .../createGroupedActivitiesReducer.ts | 24 ++- .../src/reducers/activities/sort/types.ts | 1 + .../src/reducers/activities/sort/upsert.ts | 16 +- .../voiceActivity/isVoiceActivity.spec.ts | 84 +++++---- .../utils/voiceActivity/isVoiceActivity.ts | 11 +- 
.../isVoiceTranscriptActivity.spec.ts         | 164 ++++++++++++++++++
 .../isVoiceTranscriptActivity.ts              |  18 ++
 14 files changed, 404 insertions(+), 143 deletions(-)
 create mode 100644 packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.spec.ts
 create mode 100644 packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts

diff --git a/packages/api/src/hooks/index.ts b/packages/api/src/hooks/index.ts
index f5a1a959d7..b1d027ee3a 100644
--- a/packages/api/src/hooks/index.ts
+++ b/packages/api/src/hooks/index.ts
@@ -71,6 +71,7 @@ import useUIState from './useUIState';
 import useUserID from './useUserID';
 import useUsername from './useUsername';
 import useVoiceSelector from './useVoiceSelector';
+import useVoiceActivities from './useVoiceActivities';
 
 export { useBuildRenderActivityCallback } from '@msinternal/botframework-webchat-api-middleware';
 export { useSuggestedActionsHooks } from '@msinternal/botframework-webchat-redux-store';
@@ -148,5 +149,6 @@ export {
   useUIState,
   useUserID,
   useUsername,
-  useVoiceSelector
+  useVoiceSelector,
+  useVoiceActivities
 };
diff --git a/packages/api/src/hooks/useVoiceActivities.ts b/packages/api/src/hooks/useVoiceActivities.ts
index d65e142b17..0abff2229f 100644
--- a/packages/api/src/hooks/useVoiceActivities.ts
+++ b/packages/api/src/hooks/useVoiceActivities.ts
@@ -1,11 +1,6 @@
-import { isVoiceActivity, type WebChatActivity } from 'botframework-webchat-core';
+import { type WebChatActivity } from 'botframework-webchat-core';
 import { useSelector } from './internal/WebChatReduxContext';
 
-const activitiesSelector = (state: { activities: WebChatActivity[] }) => state.activities;
-
-const of = (predicate: (activity: WebChatActivity) => boolean) => (state: { activities: WebChatActivity[] }) =>
-  activitiesSelector(state).filter(predicate);
-
 export default function useVoiceActivities(): [WebChatActivity[]] {
-  return [useSelector(of(activity => isVoiceActivity(activity)))];
+  return [useSelector(({ voiceActivities }) => voiceActivities)];
 }
diff --git a/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
index 0ccf1a6f32..b1f978e42c 100644
--- a/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
+++ b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
@@ -10,36 +10,28 @@ export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }>
   const [voiceActivities] = useVoiceActivities();
   const postActivity = usePostActivity();
   const [{ connectivitystatus }] = useDebouncedNotifications();
-  const { playAudio, stopAudio, isPlaying } = useAudioPlayer();
-
   const lastProcessedIndexRef = useRef(0);
-
-  // Remove when the activity protocol changes land; we would then get this as part of a signal activity.
   const [speechState, setSpeechState] = useState<SpeechState>('idle');
 
+  // Config received from the server on session start; for now, ccv2 and mmrt run on different sample rates and chunk intervals.
+  // We read that config into a free-form object, as we are unsure what session config will be needed in the future.
+  const [serverConfig, setServerConfig] = useState<Record<string, unknown> | null>(null);
+  const { playAudio, stopAudio, isPlaying } = useAudioPlayer(serverConfig);
+
   const isConnected = useMemo(() => connectivitystatus?.message === 'connected', [connectivitystatus]);
 
   const sendAudioChunk = useCallback(
-    (base64: string) => {
+    (base64: string, timestamp: string) => {
       postActivity({
         type: 'event',
         name: 'stream.chunk',
-        value: { voiceLiveEvent: { type: 'input_audio_buffer.append', audio: base64 } }
+        value: { voice: { contentUrl: base64, timestamp } }
       } as any);
     },
     [postActivity]
   );
 
-  const { recording, setRecording: baseSetRecording } = useRecorder(sendAudioChunk);
-
-  const cancelActiveResponse = useCallback(() => {
-    if (isPlaying) {
-      postActivity({
-        type: 'event',
-        value: { voiceLiveEvent: { type: 'response.cancel' } }
-      } as any);
-    }
-  }, [isPlaying, postActivity]);
+  const { recording, setRecording: baseSetRecording } = useRecorder(sendAudioChunk, serverConfig);
 
   const handleVoiceActivity = useCallback(
@@ -47,50 +39,51 @@ export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }>
         return;
       }
 
-      const { voiceLiveEvent } = activity.value;
-
-      switch (voiceLiveEvent.type) {
-        case 'input_audio_buffer.speech_started':
-          stopAudio();
-          setSpeechState('listening');
-          break;
-        case 'input_audio_buffer.speech_stopped':
-          setSpeechState('processing');
-          break;
-        case 'response.audio.delta':
-          if (voiceLiveEvent.delta && recording) {
-            playAudio(voiceLiveEvent.delta);
-          }
-          break;
-        case 'response.done':
-          if (!isPlaying) {
+      const { name, value } = activity;
+      const { voice } = value;
+
+      // TODO: this will be a commandResult activity rather than an event; we need to think through handling of command and commandResult activities.
+      if (name === 'session.init' && value.session?.config) {
+        setServerConfig(value.session.config as Record<string, unknown>);
+      } else if (name === 'session.update') {
+        switch (voice.bot_state) {
+          case 'voice.request.detected':
+            stopAudio();
             setSpeechState('listening');
-          }
-          break;
-        default:
-          break;
+            break;
+          case 'voice.request.processing':
+            setSpeechState('processing');
+            break;
+          default:
+            break;
+        }
+      } else if (name === 'stream.chunk' && voice.contentUrl) {
+        playAudio(voice.contentUrl);
       }
     },
-    [isPlaying, playAudio, recording, stopAudio]
+    [playAudio, stopAudio]
   );
-
   useEffect(() => {
     const startIndex = lastProcessedIndexRef.current;
-
     if (!voiceActivities.length || startIndex >= voiceActivities.length) {
       return;
     }
-
-    // If not recording, skip processing voice activities but update ref
-    // so next time we start recording, we only process new activities.
-    if (!recording) {
-      lastProcessedIndexRef.current = voiceActivities.length;
-      return;
-    }
-
     for (let i = startIndex; i < voiceActivities.length; i++) {
       // eslint-disable-next-line security/detect-object-injection
-      handleVoiceActivity(voiceActivities[i]);
+      const activity = voiceActivities[i];
+
+      // Skip the activity if it's from the user, as we want to process only incoming voice activities.
+      // We may receive an (optional) config from the server as soon as the socket is established;
+      // at that time recording would be off, but we still want to process it to read the config and act on it.
+ if ( + activity.from?.role === 'user' || + (!recording && isVoiceActivity(activity) && activity.name !== 'session.init') + ) { + continue; + } + + handleVoiceActivity(activity); } if (isPlaying && speechState !== 'bot_speaking') { @@ -100,25 +93,24 @@ export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }> } lastProcessedIndexRef.current = voiceActivities.length; - }, [voiceActivities, recording, postActivity, isPlaying, playAudio, speechState, stopAudio, handleVoiceActivity]); + }, [voiceActivities, recording, isPlaying, speechState, handleVoiceActivity]); const setRecording = useCallback( - (shouldRecord: boolean) => { + async (shouldRecord: boolean) => { if (!isConnected) { return; } - if (!recording) { + if (shouldRecord) { setSpeechState('listening'); } else { stopAudio(); - cancelActiveResponse(); setSpeechState('idle'); } - baseSetRecording(shouldRecord); + await baseSetRecording(shouldRecord); }, - [isConnected, recording, baseSetRecording, stopAudio, cancelActiveResponse] + [isConnected, baseSetRecording, stopAudio] ); const contextValue = useMemo( diff --git a/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts index 6216932a8c..f9b8405387 100644 --- a/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts +++ b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts @@ -1,18 +1,20 @@ import { useRef, useCallback } from 'react'; -const SAMPLE_RATE = 24000; +const DEFAULT_SAMPLE_RATE = 24000; const INT16_SCALE = 32768; -export function useAudioPlayer() { +export function useAudioPlayer(config?: Record | null) { const audioCtxRef = useRef(null); const nextPlayTimeRef = useRef(0); + const { sampleRate = DEFAULT_SAMPLE_RATE } = config || {}; + const initAudio = useCallback(() => { if (!audioCtxRef.current) { - audioCtxRef.current = new AudioContext({ sampleRate: SAMPLE_RATE }); + audioCtxRef.current = new AudioContext({ sampleRate: sampleRate as number }); } return audioCtxRef.current; - }, []); + }, [sampleRate]); const playAudio = useCallback( (base64: string) => { @@ -29,7 +31,7 @@ export function useAudioPlayer() { float32[i] = int16[i] / INT16_SCALE; } - const buffer = audioCtx.createBuffer(1, float32.length, SAMPLE_RATE); + const buffer = audioCtx.createBuffer(1, float32.length, audioCtx.sampleRate); buffer.getChannelData(0).set(float32); const src = audioCtx.createBufferSource(); diff --git a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx index 01368ceda2..f2b01ca6b5 100644 --- a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx +++ b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx @@ -7,6 +7,8 @@ import { useRecorder } from './useRecorder'; // --- Mocks --- +jest.mock('../../Ponyfill/usePonyfill', () => ({ __esModule: true, default: jest.fn(() => [{ Date: global.Date }]) })); + const mockTrack = { stop: jest.fn() }; @@ -96,7 +98,9 @@ describe('useRecorder', () => { expect(navigator.mediaDevices.getUserMedia).toHaveBeenCalledTimes(1); expect(global.AudioContext).toHaveBeenCalledTimes(1); expect(mockAudioContext.audioWorklet.addModule).toHaveBeenCalledTimes(1); - expect(global.AudioWorkletNode).toHaveBeenCalledWith(expect.anything(), 'audio-recorder'); + expect(global.AudioWorkletNode).toHaveBeenCalledWith(expect.anything(), 'audio-recorder', { + processorOptions: { bufferSize: 2400 } + }); 
expect(mockWorkletNode.connect).toHaveBeenCalledTimes(1); expect(mockWorkletPort.postMessage).toHaveBeenCalledWith({ command: 'START' }); }); diff --git a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts index b9930cada1..7ba19ba244 100644 --- a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts +++ b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts @@ -1,54 +1,64 @@ import { useRef, useState, useCallback } from 'react'; +import usePonyfill from '../../Ponyfill/usePonyfill'; const audioProcessorCode = ` -class AudioRecorderProcessor extends AudioWorkletProcessor { - constructor() { - super() - this.recording = false - this.buffer = [] - this.port.onmessage = e => { - if (e.data.command === 'START') this.recording = true - else if (e.data.command === 'STOP') { - this.recording = false - if (this.buffer.length) this.sendBuffer() + class AudioRecorderProcessor extends AudioWorkletProcessor { + constructor(options) { + super() + this.recording = false + this.buffer = [] + this.bufferSize = options.processorOptions.bufferSize + this.port.onmessage = e => { + if (e.data.command === 'START') this.recording = true + else if (e.data.command === 'STOP') { + this.recording = false + this.buffer = [] + } } } - } - sendBuffer() { - if (this.buffer.length) { - this.port.postMessage({ - eventType: 'audio', - audioData: new Float32Array(this.buffer) - }) - this.buffer = [] + sendBuffer() { + while (this.buffer.length >= this.bufferSize) { + const chunk = this.buffer.splice(0, this.bufferSize) + this.port.postMessage({ + eventType: 'audio', + audioData: new Float32Array(chunk) + }) + } } - } - process(inputs) { - if (inputs[0]?.length && this.recording) { - this.buffer.push(...inputs[0][0]) - if (this.buffer.length >= 2400) this.sendBuffer() + process(inputs) { + if (inputs[0]?.length && this.recording) { + this.buffer.push(...inputs[0][0]) + if (this.buffer.length >= this.bufferSize) this.sendBuffer() + } + return true } - return true } -} -registerProcessor('audio-recorder', AudioRecorderProcessor) -`; + registerProcessor('audio-recorder', AudioRecorderProcessor)`; const INT16_MIN = -32768; const INT16_MAX = 32767; const INT16_SCALE = 32767; +const DEFAULT_SAMPLE_RATE = 24000; +const DEFAULT_CHUNK_SIZE_IN_MS = 100; +const MS_IN_SECOND = 1000; -export function useRecorder(onAudioChunk: (base64: string) => void) { +export function useRecorder( + onAudioChunk: (base64: string, timestamp: string) => void, + config?: Record | null +) { const [recording, setRecordingInternal] = useState(false); const audioCtxRef = useRef(null); const workletRef = useRef(null); const streamRef = useRef(null); + const [{ Date }] = usePonyfill(); + + const { sampleRate = DEFAULT_SAMPLE_RATE, chunkIntervalMs = DEFAULT_CHUNK_SIZE_IN_MS } = config || {}; const initAudio = useCallback(async () => { if (audioCtxRef.current) { return; } - const audioCtx = new AudioContext({ sampleRate: 24000 }); + const audioCtx = new AudioContext({ sampleRate: sampleRate as number }); const blob = new Blob([audioProcessorCode], { type: 'application/javascript' }); @@ -58,7 +68,7 @@ export function useRecorder(onAudioChunk: (base64: string) => void) { URL.revokeObjectURL(url); // eslint-disable-next-line require-atomic-updates audioCtxRef.current = audioCtx; - }, []); + }, [sampleRate]); const startRecording = useCallback(async () => { await initAudio(); @@ -69,16 +79,21 @@ export function useRecorder(onAudioChunk: (base64: string) => void) 
{ const stream = await navigator.mediaDevices.getUserMedia({ audio: { channelCount: 1, - sampleRate: 24000, + sampleRate, echoCancellation: true } }); streamRef.current = stream; const source = audioCtx.createMediaStreamSource(stream); - const worklet = new AudioWorkletNode(audioCtx, 'audio-recorder'); + const worklet = new AudioWorkletNode(audioCtx, 'audio-recorder', { + processorOptions: { + bufferSize: ((sampleRate as number) * (chunkIntervalMs as number)) / MS_IN_SECOND + } + }); worklet.port.onmessage = e => { if (e.data.eventType === 'audio') { + const timestamp = new Date().toISOString(); const float32 = e.data.audioData; const int16 = new Int16Array(float32.length); for (let i = 0; i < float32.length; i++) { @@ -86,7 +101,7 @@ export function useRecorder(onAudioChunk: (base64: string) => void) { int16[i] = Math.max(INT16_MIN, Math.min(INT16_MAX, float32[i] * INT16_SCALE)); } const base64 = btoa(String.fromCharCode(...new Uint8Array(int16.buffer))); - onAudioChunk(base64); + onAudioChunk(base64, timestamp); } }; @@ -95,7 +110,7 @@ export function useRecorder(onAudioChunk: (base64: string) => void) { worklet.port.postMessage({ command: 'START' }); workletRef.current = worklet; setRecordingInternal(true); - }, [initAudio, onAudioChunk]); + }, [Date, chunkIntervalMs, initAudio, onAudioChunk, sampleRate]); const stopRecording = useCallback(() => { if (workletRef.current) { diff --git a/packages/core/src/reducers/activities/combineActivitiesReducer.ts b/packages/core/src/reducers/activities/combineActivitiesReducer.ts index 488df1be62..49ee3cee3d 100644 --- a/packages/core/src/reducers/activities/combineActivitiesReducer.ts +++ b/packages/core/src/reducers/activities/combineActivitiesReducer.ts @@ -10,6 +10,7 @@ import createGroupedActivitiesReducer, { type ActivitiesState = { activities: readonly WebChatActivity[]; + voiceActivities: readonly WebChatActivity[]; groupedActivities: GroupedActivitiesState; }; @@ -34,7 +35,12 @@ export default function combineActivitiesReducer( state: (ExistingState & ActivitiesState) | undefined, action: ExistingAction & GroupedActivitiesAction ): ExistingState & ActivitiesState { - const { activities: _activities, groupedActivities, ...existingState } = state ?? {}; + const { + activities: _activities, + voiceActivities: _voiceActivities, + groupedActivities, + ...existingState + } = state ?? {}; const nextState = existingSlicedReducer(existingState as ExistingState, action); const nextGroupedActivities = groupedActivitiesReducer(groupedActivities, action); @@ -52,7 +58,12 @@ export default function combineActivitiesReducer( ); return hasChanged - ? { ...nextState, activities: nextGroupedActivities.sortedActivities, groupedActivities: nextGroupedActivities } + ? 
{
+        ...nextState,
+        activities: nextGroupedActivities.sortedActivities,
+        voiceActivities: nextGroupedActivities.voiceActivities,
+        groupedActivities: nextGroupedActivities
+      }
     : state;
 };
 }
diff --git a/packages/core/src/reducers/activities/createGroupedActivitiesReducer.ts b/packages/core/src/reducers/activities/createGroupedActivitiesReducer.ts
index f333c67af1..e7179ac44e 100644
--- a/packages/core/src/reducers/activities/createGroupedActivitiesReducer.ts
+++ b/packages/core/src/reducers/activities/createGroupedActivitiesReducer.ts
@@ -32,7 +32,7 @@ import type { WebChatActivity } from '../../types/WebChatActivity';
 import patchActivity from './patchActivity';
 import deleteActivityByLocalId from './sort/deleteActivityByLocalId';
 import { generateLocalIdInActivity, getLocalIdFromActivity, setLocalIdInActivity } from './sort/property/LocalId';
-import { getPositionFromActivity, setPositionInActivity } from './sort/property/Position';
+import { getPositionFromActivity, queryPositionFromActivity, setPositionInActivity } from './sort/property/Position';
 import { setReceivedAtInActivity } from './sort/property/ReceivedAt';
 import { querySendStatusFromOutgoingActivity, setSendStatusInOutgoingActivity } from './sort/property/SendStatus';
 import queryLocalIdAByActivityId from './sort/queryLocalIdByActivityId';
@@ -42,6 +42,8 @@ import updateActivityChannelData, {
   updateActivityChannelDataInternalSkipNameCheck
 } from './sort/updateActivityChannelData';
 import upsert, { INITIAL_STATE } from './sort/upsert';
+import isVoiceActivity from '../../utils/voiceActivity/isVoiceActivity';
+import isVoiceTranscriptActivity from '../../utils/voiceActivity/isVoiceTranscriptActivity';
 
 type GroupedActivitiesAction =
   | DeleteActivityAction
@@ -100,6 +102,13 @@ function createGroupedActivitiesReducer(
         payload: { activity }
       } = action;
 
+      // Non-transcript voice activities do not render in the UI and are mostly fire-and-forget, as we don't have replay, etc.;
+      // hence we don't want to process them and simply pass them through.
+      if (isVoiceActivity(activity) && !isVoiceTranscriptActivity(activity)) {
+        state = upsert(ponyfill, state, activity);
+        break;
+      }
+
       // Patch activity so the outgoing blob: URL is not re-downloadable.
       // Related to /__tests__/html2/accessibility/liveRegion/attachment/file.
@@ -151,6 +160,12 @@ function createGroupedActivitiesReducer(
     }
 
     case POST_ACTIVITY_FULFILLED: {
+      // Non-transcript voice activities do not render in the UI and are mostly fire-and-forget, as we don't have replay, etc.;
+      // hence we don't want to process them and simply pass them through.
+      if (isVoiceActivity(action.payload.activity) && !isVoiceTranscriptActivity(action.payload.activity)) {
+        state = upsert(ponyfill, state, action.payload.activity);
+        break;
+      }
       const localId = queryLocalIdAByClientActivityId(state, action.meta.clientActivityID);
       const existingActivity = localId && state.activityMap.get(localId)?.activity;
 
@@ -175,8 +190,11 @@
         activity = setSendStatusInOutgoingActivity(activity, SENT);
         activity = setLocalIdInActivity(activity, localId);
 
-        // Keep existing position.
-        activity = setPositionInActivity(activity, getPositionFromActivity(existingActivity));
+        // Keep existing position (if it exists - voice activities don't have positions)
+        const existingPosition = queryPositionFromActivity(existingActivity);
+        if (typeof existingPosition !== 'undefined') {
+          activity = setPositionInActivity(activity, getPositionFromActivity(existingActivity));
+        }
 
         // Compare the INCOMING_ACTIVITY below:
         // - POST_ACTIVITY_FULFILLED will mark send status as SENT
diff --git a/packages/core/src/reducers/activities/sort/types.ts b/packages/core/src/reducers/activities/sort/types.ts
index 286711854f..28be6d1149 100644
--- a/packages/core/src/reducers/activities/sort/types.ts
+++ b/packages/core/src/reducers/activities/sort/types.ts
@@ -57,6 +57,7 @@ type State = {
   readonly livestreamSessionMap: LivestreamSessionMap;
   readonly sortedChatHistoryList: SortedChatHistory;
   readonly sortedActivities: readonly Activity[];
+  readonly voiceActivities: readonly Activity[];
 };
 
 export {
diff --git a/packages/core/src/reducers/activities/sort/upsert.ts b/packages/core/src/reducers/activities/sort/upsert.ts
index c917d77568..9528d1c956 100644
--- a/packages/core/src/reducers/activities/sort/upsert.ts
+++ b/packages/core/src/reducers/activities/sort/upsert.ts
@@ -18,6 +18,8 @@ import {
   type SortedChatHistoryEntry,
   type State
 } from './types';
+import isVoiceActivity from '../../../utils/voiceActivity/isVoiceActivity';
+import isVoiceTranscriptActivity from '../../../utils/voiceActivity/isVoiceTranscriptActivity';
 
 // Honoring timestamp or not:
 //
@@ -48,7 +50,8 @@ const INITIAL_STATE = Object.freeze({
   livestreamSessionMap: Object.freeze(new Map()),
   howToGroupingMap: Object.freeze(new Map()),
   sortedActivities: Object.freeze([]),
-  sortedChatHistoryList: Object.freeze([])
+  sortedChatHistoryList: Object.freeze([]),
+  voiceActivities: Object.freeze([])
 } satisfies State);
 
 // Question: Why insertion sort works but not quick sort?
@@ -58,6 +61,14 @@ const INITIAL_STATE = Object.freeze({
 // - Duplicate timestamps: activities without timestamp can't be sort deterministically with quick sort
 
 function upsert(ponyfill: Pick<GlobalScopePonyfill, 'Date'>, state: State, activity: Activity): State {
+  // We only want to process transcript voice activities through this path, as those will be rendered.
+  // All other voice activities are stored in a separate slice, and we don't want to perform any operation on them.
+ if (isVoiceActivity(activity) && !isVoiceTranscriptActivity(activity)) { + return Object.freeze({ + ...state, + voiceActivities: Object.freeze([...state.voiceActivities, activity]) + } satisfies State); + } const nextActivityIdToLocalIdMap = new Map(state.activityIdToLocalIdMap); const nextActivityMap = new Map(state.activityMap); const nextClientActivityIdToLocalIdMap = new Map(state.clientActivityIdToLocalIdMap); @@ -336,7 +347,8 @@ function upsert(ponyfill: Pick, state: State, activ howToGroupingMap: Object.freeze(nextHowToGroupingMap), livestreamSessionMap: Object.freeze(nextLivestreamSessionMap), sortedActivities: Object.freeze(nextSortedActivities), - sortedChatHistoryList: Object.freeze(nextSortedChatHistoryList) + sortedChatHistoryList: Object.freeze(nextSortedChatHistoryList), + voiceActivities: state.voiceActivities } satisfies State); } diff --git a/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts b/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts index c8d744595e..1b2c3abf77 100644 --- a/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts +++ b/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts @@ -2,36 +2,34 @@ import isVoiceActivity from './isVoiceActivity'; import { WebChatActivity } from '../../types/WebChatActivity'; // Mock activity factory for testing -const createMockActivity = (type: string = 'event', value?: any): WebChatActivity => ({ +const createMockActivity = (type: string = 'event', name?: string, value?: any): WebChatActivity => ({ type: type as any, id: 'test-activity-id', from: { id: 'test-user' }, channelData: { 'webchat:sequence-id': 1 }, + ...(name && { name }), ...(value && { value }) }); -const createMockVoiceActivity = (voiceEventType: string, additionalProps?: any): WebChatActivity => - createMockActivity('event', { - voiceLiveEvent: { - type: voiceEventType, - ...additionalProps - } +const createMockVoiceActivity = (name: string, voiceProps: Record): WebChatActivity => + createMockActivity('event', name, { + voice: voiceProps }); describe('isVoiceActivity', () => { describe('Valid voice activities', () => { - test('should return true for event activity with voiceLiveEvent', () => { - const activity = createMockVoiceActivity('response.audio.delta', { delta: 'audiodata' }); + test('should return true for event activity with voice', () => { + const activity = createMockVoiceActivity('stream.chunk', { contentUrl: 'base64' }); const result = isVoiceActivity(activity); expect(result).toBe(true); }); - test('should return true for voice activity with minimal voiceLiveEvent', () => { - const activity = createMockActivity('event', { voiceLiveEvent: {} }); + test('should return true for voice activity with minimal voice', () => { + const activity = createMockActivity('event', 'stream.chunk', { voice: {} }); const result = isVoiceActivity(activity); @@ -41,18 +39,29 @@ describe('isVoiceActivity', () => { describe('Invalid activities', () => { const testCases = [ - // Invalid by activity type { - name: 'message activity with voiceLiveEvent', - activity: () => createMockActivity('message', { voiceLiveEvent: { type: 'response.audio.delta' } }) + name: 'message activity with voice', + activity: () => createMockActivity('message', 'stream.chunk', { voice: { contentUrl: 'base64' } }) }, { name: 'typing activity', activity: () => createMockActivity('typing') }, { - name: 'event activity with value', - activity: () => ({ ...createMockActivity('event'), value: 'not an object' }) + name: 'event activity with non-object 
+        activity: () => ({ ...createMockActivity('event', 'test'), value: 'not an object' })
+      },
+      {
+        name: 'event activity without voice property',
+        activity: () => createMockActivity('event', 'test', { someOtherProp: 'value' })
+      },
+      {
+        name: 'event activity with no value',
+        activity: () => createMockActivity('event', 'test')
+      },
+      {
+        name: 'event activity with no name',
+        activity: () => createMockActivity('event', undefined, { voice: {} })
       }
     ];

@@ -63,22 +72,37 @@ describe('isVoiceActivity', () => {
     });
   });

-  describe('Real-world voice event types', () => {
-    const voiceEventTypes = [
-      'input_audio_buffer.append',
-      'input_audio_buffer.speech_started',
-      'input_audio_buffer.speech_stopped',
-      'conversation.item.input_audio_transcription.completed',
-      'response.audio.delta',
-      'response.audio_transcript.delta',
-      'response.audio_transcript.done',
-      'response.done',
-      'session.update',
-      'response.cancel'
+  describe('Real-world voice activity scenarios', () => {
+    const voiceScenarios = [
+      {
+        name: 'session.update with speech detected state',
+        eventName: 'session.update',
+        voiceProps: { bot_state: 'voice.request.detected', message: 'Your request is identified' }
+      },
+      {
+        name: 'session.update with processing state',
+        eventName: 'session.update',
+        voiceProps: { bot_state: 'voice.request.processing', message: 'Your request is being processed' }
+      },
+      {
+        name: 'stream.end with user transcription',
+        eventName: 'stream.end',
+        voiceProps: { transcription: 'My destination is Bangalore', origin: 'user' }
+      },
+      {
+        name: 'stream.chunk with server audio response',
+        eventName: 'stream.chunk',
+        voiceProps: { contentUrl: 'base64chunk' }
+      },
+      {
+        name: 'stream.end with bot transcription',
+        eventName: 'stream.end',
+        voiceProps: { transcription: 'Your destination is at 1000m above sea level', origin: 'bot' }
+      }
     ];

-    test.each(voiceEventTypes)('should return true for voice event type: %s', eventType => {
-      const activity = createMockVoiceActivity(eventType);
+    test.each(voiceScenarios)('should return true for $name', ({ eventName, voiceProps }) => {
+      const activity = createMockVoiceActivity(eventName, voiceProps);

       const result = isVoiceActivity(activity);

diff --git a/packages/core/src/utils/voiceActivity/isVoiceActivity.ts b/packages/core/src/utils/voiceActivity/isVoiceActivity.ts
index e16154e590..a17937d8ba 100644
--- a/packages/core/src/utils/voiceActivity/isVoiceActivity.ts
+++ b/packages/core/src/utils/voiceActivity/isVoiceActivity.ts
@@ -1,14 +1,17 @@
 import { WebChatActivity } from '../../types/WebChatActivity';

-// This is interim type guard until activity protocol is ratified.
+// This is an interim type guard until the activity protocol is ratified.
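+//
+// A hypothetical example of the accepted shape, matching the specs in this patch:
+//   { type: 'event', name: 'stream.chunk', value: { voice: { contentUrl: '<base64>' } } }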
 const isVoiceActivity = (
   activity: WebChatActivity
 ): activity is WebChatActivity & {
-  value: { voiceLiveEvent: any };
+  name: string;
+  type: 'event';
+  value: { voice: any };
 } =>
   activity.type === 'event' &&
-  activity.value &&
+  !!activity.name &&
+  !!activity.value &&
   typeof activity.value === 'object' &&
-  'voiceLiveEvent' in activity.value;
+  'voice' in activity.value;

 export default isVoiceActivity;
diff --git a/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.spec.ts b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.spec.ts
new file mode 100644
index 0000000000..e061e24813
--- /dev/null
+++ b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.spec.ts
@@ -0,0 +1,164 @@
+import isVoiceTranscriptActivity from './isVoiceTranscriptActivity';
+import { WebChatActivity } from '../../types/WebChatActivity';
+
+// Mock activity factory for testing
+const createMockActivity = (type: string = 'event', name?: string, value?: any): WebChatActivity => ({
+  type: type as any,
+  id: 'test-activity-id',
+  from: { id: 'test-user' },
+  channelData: {
+    'webchat:sequence-id': 1
+  },
+  ...(name && { name }),
+  ...(value && { value })
+});
+
+const createMockVoiceActivity = (name: string, voiceProps: Record<string, unknown>): WebChatActivity =>
+  createMockActivity('event', name, {
+    voice: voiceProps
+  });
+
+describe('isVoiceTranscriptActivity', () => {
+  describe('Valid transcript activities', () => {
+    test('should return true for stream.end with user transcription', () => {
+      const activity = createMockVoiceActivity('stream.end', {
+        transcription: 'Hello world',
+        origin: 'user'
+      });
+
+      const result = isVoiceTranscriptActivity(activity);
+
+      expect(result).toBe(true);
+    });
+
+    test('should return true for stream.end with bot transcription', () => {
+      const activity = createMockVoiceActivity('stream.end', {
+        transcription: 'Hi there!',
+        origin: 'bot'
+      });
+
+      const result = isVoiceTranscriptActivity(activity);
+
+      expect(result).toBe(true);
+    });
+
+    test('should return true for stream.end with empty transcription string', () => {
+      const activity = createMockVoiceActivity('stream.end', {
+        transcription: '',
+        origin: 'user'
+      });
+
+      const result = isVoiceTranscriptActivity(activity);
+
+      expect(result).toBe(true);
+    });
+  });
+
+  describe('Invalid activities', () => {
+    const testCases = [
+      {
+        name: 'stream.chunk voice activity',
+        activity: () => createMockVoiceActivity('stream.chunk', { contentUrl: 'base64' })
+      },
+      {
+        name: 'session.update voice activity',
+        activity: () => createMockVoiceActivity('session.update', { bot_state: 'voice.request.detected' })
+      },
+      {
+        name: 'stream.end without transcription',
+        activity: () => createMockVoiceActivity('stream.end', { origin: 'user' })
+      },
+      {
+        name: 'stream.end with non-string transcription',
+        activity: () => createMockVoiceActivity('stream.end', { transcription: 123, origin: 'user' })
+      },
+      {
+        name: 'stream.end with null transcription',
+        activity: () => createMockVoiceActivity('stream.end', { transcription: null, origin: 'user' })
+      },
+      {
+        name: 'regular message activity',
+        activity: () => createMockActivity('message', 'test')
+      },
+      {
+        name: 'typing activity',
+        activity: () => createMockActivity('typing')
+      },
+      {
+        name: 'event activity without voice data',
+        activity: () => createMockActivity('event', 'stream.end', { someData: 'test' })
+      },
+      {
+        name: 'event activity with null value',
+        activity: () => ({ ...createMockActivity('event', 'stream.end'), value: null })
+      },
+      {
+        name: 'event
activity without value', + activity: () => createMockActivity('event', 'stream.end') + }, + { + name: 'event activity without name', + activity: () => createMockActivity('event', undefined, { voice: { transcription: 'test' } }) + } + ]; + + test.each(testCases)('should return false for $name', ({ activity }) => { + const result = isVoiceTranscriptActivity(activity()); + + expect(result).toBe(false); + }); + }); + + describe('Real-world scenarios', () => { + test('should identify user transcript in conversation flow', () => { + const conversationActivities = [ + createMockVoiceActivity('session.update', { bot_state: 'voice.request.detected' }), + createMockVoiceActivity('session.update', { bot_state: 'voice.request.processing' }), + createMockVoiceActivity('stream.end', { + transcription: 'What is the weather today?', + origin: 'user' + }) + ]; + + const transcriptResults = conversationActivities.map(activity => isVoiceTranscriptActivity(activity)); + + expect(transcriptResults).toEqual([false, false, true]); + }); + + test('should identify bot transcript in response flow', () => { + const responseActivities = [ + createMockVoiceActivity('session.update', { bot_state: 'voice.response.available' }), + createMockVoiceActivity('stream.chunk', { contentUrl: 'chunk1' }), + createMockVoiceActivity('stream.chunk', { contentUrl: 'chunk2' }), + createMockVoiceActivity('stream.end', { + transcription: 'Today will be sunny with a high of 75 degrees.', + origin: 'bot' + }) + ]; + + const transcriptResults = responseActivities.map(activity => isVoiceTranscriptActivity(activity)); + + expect(transcriptResults).toEqual([false, false, false, true]); + }); + + test('should handle complete conversation with mixed activities', () => { + const mixedActivities = [ + createMockActivity('message', 'test'), + createMockVoiceActivity('stream.end', { + transcription: 'Hello', + origin: 'user' + }), + createMockVoiceActivity('stream.chunk', { contentUrl: 'audio' }), + createMockVoiceActivity('stream.end', { + transcription: 'Hi there!', + origin: 'bot' + }), + createMockActivity('typing') + ]; + + const transcriptResults = mixedActivities.map(activity => isVoiceTranscriptActivity(activity)); + + expect(transcriptResults).toEqual([false, true, false, true, false]); + }); + }); +}); diff --git a/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts new file mode 100644 index 0000000000..f6da7a746a --- /dev/null +++ b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts @@ -0,0 +1,18 @@ +import isVoiceActivity from './isVoiceActivity'; +import { WebChatActivity } from '../../types/WebChatActivity'; + +const isVoiceTranscriptActivity = ( + activity: WebChatActivity +): activity is WebChatActivity & { + value: { + voice: { + transcription: string; + origin: 'user' | 'bot'; + }; + }; +} => + isVoiceActivity(activity) && + activity.name === 'stream.end' && + typeof activity.value?.voice?.transcription === 'string'; + +export default isVoiceTranscriptActivity; From 0838e44d585e450425b04990ceea415d754981fe Mon Sep 17 00:00:00 2001 From: Pranav Joshi Date: Thu, 8 Jan 2026 10:54:44 +0000 Subject: [PATCH 4/4] refactor composer to not use direct state inside effect --- .../api/src/hooks/internal/useStateRef.ts | 31 +++++++++ .../SpeechToSpeech/SpeechToSpeechComposer.tsx | 64 ++++++++++++------- .../isVoiceTranscriptActivity.ts | 2 +- 3 files changed, 73 insertions(+), 24 deletions(-) create mode 100644 
packages/api/src/hooks/internal/useStateRef.ts
diff --git a/packages/api/src/hooks/internal/useStateRef.ts b/packages/api/src/hooks/internal/useStateRef.ts
new file mode 100644
index 0000000000..a6f517fcc3
--- /dev/null
+++ b/packages/api/src/hooks/internal/useStateRef.ts
@@ -0,0 +1,31 @@
+import { useCallback, useRef, useState } from 'react';
+
+import type { Dispatch, MutableRefObject, SetStateAction } from 'react';
+
+// Like useState, but also returns a mutable ref that always holds the latest
+// value, so effects and callbacks can read the current state without
+// re-subscribing to it.
+export default function useStateRef<T>(
+  initialValue?: T
+): readonly [T, Dispatch<SetStateAction<T>>, MutableRefObject<T>] {
+  const [_, forceRender] = useState<object>();
+  const valueRef: MutableRefObject<T> = useRef(initialValue as T);
+
+  const setter: Dispatch<SetStateAction<T>> = useCallback(
+    (value: SetStateAction<T>) => {
+      const { current } = valueRef;
+
+      value = value instanceof Function ? value(current) : value;
+
+      if (current !== value) {
+        valueRef.current = value;
+
+        forceRender({});
+      }
+    },
+    [forceRender, valueRef]
+  );
+
+  return Object.freeze([valueRef.current, setter, valueRef]) as readonly [
+    T,
+    Dispatch<SetStateAction<T>>,
+    MutableRefObject<T>
+  ];
+}
diff --git a/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
index b1f978e42c..70d9c3aa4a 100644
--- a/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
+++ b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
@@ -1,21 +1,22 @@
-import React, { useCallback, useEffect, useMemo, useRef, useState, type ReactNode } from 'react';
+import React, { useCallback, useEffect, useMemo, useRef, type ReactNode } from 'react';
 import { isVoiceActivity, WebChatActivity } from 'botframework-webchat-core';
 import { useAudioPlayer } from './private/useAudioPlayer';
 import { useRecorder } from './private/useRecorder';
 import { useDebouncedNotifications, usePostActivity, useVoiceActivities } from '../../hooks';
 import SpeechToSpeechContext from './private/Context';
 import { SpeechState } from './types/SpeechState';
+import useStateRef from '../../hooks/internal/useStateRef';

 export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }> = ({ children }) => {
   const [voiceActivities] = useVoiceActivities();
   const postActivity = usePostActivity();
   const [{ connectivitystatus }] = useDebouncedNotifications();
   const lastProcessedIndexRef = useRef(0);
-  const [speechState, setSpeechState] = useState('idle');
+  const [speechState, setSpeechState] = useStateRef<SpeechState>('idle');

-  // config received from server on session start, for now ccv2 and mmrt runs on different sample rate and chunk interval.
+  // config received from server on session init (only once); for now ccv2 and mmrt run on different sample rates and chunk intervals.
   // we will read that config as a free-form object, since we are unsure what session config will be needed in the future.
-  const [serverConfig, setServerConfig] = useState<Record<string, unknown> | null>(null);
+  const [serverConfig, setServerConfig] = useStateRef<Record<string, unknown> | null>(null);
   const { playAudio, stopAudio, isPlaying } = useAudioPlayer(serverConfig);

   const isConnected = useMemo(() => connectivitystatus?.message === 'connected', [connectivitystatus]);
@@ -42,27 +43,44 @@ export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }>
       const { name, value } = activity;
       const { voice } = value;

-      // TODO - this will be commandResult activity and not event, need to think on handling of command and commandResult activities.
-      if (name === 'session.init' && value.session?.config) {
-        setServerConfig(value.session.config as Record<string, unknown>);
-      } else if (name === 'session.update') {
-        switch (voice.bot_state) {
-          case 'voice.request.detected':
-            stopAudio();
-            setSpeechState('listening');
-            break;
-          case 'voice.request.processing':
-            setSpeechState('processing');
-            break;
-          default:
-            break;
+      switch (name) {
+        // TODO - this will be a commandResult activity rather than an event; need to think through handling of command and commandResult activities.
+        case 'session.init': {
+          if (value.session?.config) {
+            setServerConfig(value.session.config as Record<string, unknown>);
+          }
+
+          break;
         }
-      } else if (name === 'stream.chunk' && voice.contentUrl) {
-        playAudio(voice.contentUrl);
+
+        case 'session.update': {
+          switch (voice.bot_state) {
+            case 'voice.request.detected':
+              stopAudio();
+              setSpeechState('listening');
+              break;
+
+            case 'voice.request.processing':
+              setSpeechState('processing');
+              break;
+
+            default:
+              break;
+          }
+          break;
+        }
+
+        case 'stream.chunk': {
+          if (voice.contentUrl) {
+            playAudio(voice.contentUrl);
+          }
+          break;
+        }
+
+        default:
+          break;
       }
     },
-    [playAudio, stopAudio]
+    [playAudio, setServerConfig, setSpeechState, stopAudio]
   );
+
   useEffect(() => {
     const startIndex = lastProcessedIndexRef.current;

     if (!voiceActivities.length || startIndex >= voiceActivities.length) {
       return;
     }
@@ -93,7 +111,7 @@ export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }>
     }

     lastProcessedIndexRef.current = voiceActivities.length;
-  }, [voiceActivities, recording, isPlaying, speechState, handleVoiceActivity]);
+  }, [handleVoiceActivity, isPlaying, recording, setSpeechState, speechState, voiceActivities]);

   const setRecording = useCallback(
     async (shouldRecord: boolean) => {
@@ -110,7 +128,7 @@ export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }>

       await baseSetRecording(shouldRecord);
     },
-    [isConnected, baseSetRecording, stopAudio]
+    [isConnected, baseSetRecording, setSpeechState, stopAudio]
   );

   const contextValue = useMemo(
diff --git a/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts
index f6da7a746a..c6ae5bd742 100644
--- a/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts
+++ b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts
@@ -7,7 +7,7 @@ const isVoiceTranscriptActivity = (
   value: {
     voice: {
       transcription: string;
-      origin: 'user' | 'bot';
+      origin: 'user' | 'agent';
     };
   };
 } =>