3 changes: 2 additions & 1 deletion frontend/src/components/InputAndControlsPanel.tsx
@@ -116,6 +116,7 @@ export function InputAndControlsPanel({

// Check if this pipeline supports multiple input modes
const isMultiMode = pipelineIsMultiMode(pipelineId);
const promptLabel = pipelineId === "vibevoice" ? "Text input" : "Prompts";

useEffect(() => {
if (videoRef.current && localStream) {
@@ -288,7 +289,7 @@ export function InputAndControlsPanel({
return (
<div>
<div className="flex items-center justify-between mb-2">
-            <h3 className="text-sm font-medium">Prompts</h3>
+            <h3 className="text-sm font-medium">{promptLabel}</h3>
{isEditMode && (
<Badge variant="secondary" className="text-xs">
Editing
93 changes: 90 additions & 3 deletions frontend/src/components/VideoOutput.tsx
@@ -14,6 +14,7 @@ interface VideoOutputProps {
onPlayPauseToggle?: () => void;
onStartStream?: () => void;
onVideoPlaying?: () => void;
isAudioOnly?: boolean;
}

export function VideoOutput({
@@ -27,17 +28,60 @@
onPlayPauseToggle,
onStartStream,
onVideoPlaying,
isAudioOnly = false,
}: VideoOutputProps) {
const videoRef = useRef<HTMLVideoElement>(null);
const audioRef = useRef<HTMLAudioElement>(null);
const [showOverlay, setShowOverlay] = useState(false);
const [isFadingOut, setIsFadingOut] = useState(false);
const [isAudioPlaying, setIsAudioPlaying] = useState(false);
const overlayTimeoutRef = useRef<number | null>(null);

useEffect(() => {
-    if (videoRef.current && remoteStream) {
+    if (!isAudioOnly && videoRef.current && remoteStream) {
videoRef.current.srcObject = remoteStream;
}
-  }, [remoteStream]);
+  }, [remoteStream, isAudioOnly]);

useEffect(() => {
if (isAudioOnly && audioRef.current && remoteStream) {
audioRef.current.srcObject = remoteStream;
}
}, [remoteStream, isAudioOnly]);

// Notify when audio starts playing (parity with video)
useEffect(() => {
if (!isAudioOnly) return;
const audio = audioRef.current;
if (!audio || !remoteStream) return;

const handlePlaying = () => {
setIsAudioPlaying(true);
onVideoPlaying?.();
};

const handlePause = () => {
setIsAudioPlaying(false);
};

const handleEnded = () => {
setIsAudioPlaying(false);
};

if (!audio.paused && audio.currentTime > 0 && !audio.ended) {
setIsAudioPlaying(true);
setTimeout(() => onVideoPlaying?.(), 0);
}

audio.addEventListener("playing", handlePlaying);
audio.addEventListener("pause", handlePause);
audio.addEventListener("ended", handleEnded);
return () => {
audio.removeEventListener("playing", handlePlaying);
audio.removeEventListener("pause", handlePause);
audio.removeEventListener("ended", handleEnded);
};
}, [isAudioOnly, onVideoPlaying, remoteStream]);

// Listen for video playing event to notify parent
useEffect(() => {
@@ -132,7 +176,50 @@ export function VideoOutput({
<CardTitle className="text-base font-medium">Video Output</CardTitle>
</CardHeader>
<CardContent className="flex-1 flex items-center justify-center min-h-0 p-4">
-        {remoteStream ? (
+        {remoteStream && isAudioOnly ? (
<div className="w-full h-full flex flex-col items-center justify-center">
{/* Hidden audio element for actual playback */}
<audio
ref={audioRef}
className="hidden"
autoPlay
playsInline
/>
{/* Audio visualization */}
<div className="relative flex items-center justify-center mb-6">
{/* Animated concentric circles */}
{isAudioPlaying && (
<>
<div className="absolute w-32 h-32 rounded-full border-2 border-purple-500/30 animate-ping" />
<div className="absolute w-40 h-40 rounded-full border-2 border-blue-500/20 animate-ping" style={{ animationDelay: "0.2s" }} />
<div className="absolute w-48 h-48 rounded-full border-2 border-purple-500/10 animate-ping" style={{ animationDelay: "0.4s" }} />
</>
)}
{/* Speaker icon */}
<div className="relative z-10 w-24 h-24 flex items-center justify-center">
<svg
className="w-full h-full text-white"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
strokeWidth={2}
>
<path
strokeLinecap="round"
strokeLinejoin="round"
d="M19.114 5.636a9 9 0 010 12.728M16.463 8.289a5 5 0 010 7.072M12 3v18M8 8l-4-4v12l4-4"
/>
</svg>
</div>
</div>
{/* Status text */}
<div className="text-center">
<p className="text-lg font-medium text-foreground">
{isAudioPlaying ? "Playing audio..." : "Audio ready"}
</p>
</div>
</div>
) : remoteStream ? (
<div
className="relative w-full h-full cursor-pointer flex items-center justify-center"
onClick={handleVideoClick}
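A note on the audio-playback detection added above: if the `<audio>` element began playing before the effect attached its listeners, the `playing` event has already fired and will not fire again, which is why the effect also checks `!audio.paused && audio.currentTime > 0 && !audio.ended` and defers the callback with a zero-delay `setTimeout`. A minimal sketch of the same pattern outside React; the function and element here are illustrative, not part of this PR:

```typescript
// Attach a MediaStream to a hidden <audio> element and report playback.
// Standalone sketch; assumes a browser DOM environment.
function attachAudioStream(stream: MediaStream, onPlaying: () => void): () => void {
  const audio = document.createElement("audio");
  audio.autoplay = true;
  audio.srcObject = stream;

  audio.addEventListener("playing", onPlaying);

  // If playback started before the listener was attached, "playing" will not
  // fire again for this play-through, so inspect the live state as well.
  if (!audio.paused && audio.currentTime > 0 && !audio.ended) {
    setTimeout(onPlaying, 0); // defer, as the component defers onVideoPlaying
  }

  // Cleanup mirrors the effect's return function.
  return () => audio.removeEventListener("playing", onPlaying);
}
```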
7 changes: 7 additions & 0 deletions frontend/src/data/pipelines.ts
@@ -98,6 +98,13 @@ export const PIPELINES: Record<string, PipelineInfo> = {
supportedModes: ["video"],
defaultMode: "video",
},
vibevoice: {
name: "VibeVoice",
about: "Streaming text-to-speech pipeline that outputs audio chunks.",
requiresModels: false,
supportedModes: ["text"],
defaultMode: "text",
},
};

export function pipelineSupportsLoRA(pipelineId: string): boolean {
15 changes: 10 additions & 5 deletions frontend/src/hooks/useWebRTC.ts
@@ -44,7 +44,11 @@ export function useWebRTC(options?: UseWebRTCOptions) {
const queuedCandidatesRef = useRef<RTCIceCandidate[]>([]);

const startStream = useCallback(
-    async (initialParameters?: InitialParameters, stream?: MediaStream) => {
+    async (
+      initialParameters?: InitialParameters,
+      stream?: MediaStream,
+      pipelineId?: string
+    ) => {
if (isConnecting || peerConnectionRef.current) return;

setIsConnecting(true);
@@ -127,7 +131,7 @@ export function useWebRTC(options?: UseWebRTCOptions) {
console.error("Data channel error:", error);
};

-      // Add video track for sending to server only if stream is provided
+      // Add media tracks based on pipeline type
if (stream) {
stream.getTracks().forEach(track => {
if (track.kind === "video") {
@@ -137,10 +141,11 @@
});
} else {
console.log(
"No video stream provided - adding video transceiver for no-input pipelines"
"No media stream provided - adding transceiver for no-input pipelines"
);
-        // For no-video-input pipelines, add a video transceiver to establish proper WebRTC connection
-        pc.addTransceiver("video");
+        const mediaKind = pipelineId === "vibevoice" ? "audio" : "video";
+        // For no-input pipelines, add a transceiver to establish proper WebRTC connection
+        pc.addTransceiver(mediaKind);
}

// Named event handlers
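The transceiver change is the core of audio support in this hook: an SDP offer that contains no media sections gives the server nothing to attach its generated track to, so no-input pipelines must reserve an m-line explicitly, and for vibevoice that m-line now needs to be audio rather than video. A standalone sketch of that negotiation step, assuming a bare `RTCPeerConnection` (the hook itself uses the default `sendrecv` direction; the `recvonly` shown here is an option for clients that never send media):

```typescript
// Reserve an audio m-line in the offer for a receive-only pipeline.
// Illustrative sketch, not the hook's actual code.
async function createAudioOnlyOffer(): Promise<RTCSessionDescriptionInit> {
  const pc = new RTCPeerConnection();

  // With no local tracks, addTransceiver() is what puts an audio section
  // into the SDP so the server's answer can carry its generated audio.
  pc.addTransceiver("audio", { direction: "recvonly" });

  pc.ontrack = event => {
    const [remoteStream] = event.streams;
    console.log("remote audio stream ready:", remoteStream.id);
  };

  const offer = await pc.createOffer();
  await pc.setLocalDescription(offer);
  return offer;
}
```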
7 changes: 7 additions & 0 deletions frontend/src/lib/api.ts
@@ -23,6 +23,7 @@ export interface WebRTCOfferRequest {
noise_controller?: boolean;
manage_cache?: boolean;
kv_cache_attention_bias?: number;
text?: string;
};
}

@@ -63,13 +64,19 @@ export interface KreaRealtimeVideoLoadParams extends PipelineLoadParams {
lora_merge_mode?: "permanent_merge" | "runtime_peft";
}

export interface VibeVoiceLoadParams extends PipelineLoadParams {
audio_path?: string;
chunk_size?: number;
}

export interface PipelineLoadRequest {
pipeline_id?: string;
load_params?:
| PassthroughLoadParams
| StreamDiffusionV2LoadParams
| LongLiveLoadParams
| KreaRealtimeVideoLoadParams
| VibeVoiceLoadParams
| null;
}

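Putting the new types together, a load request for vibevoice might look like the following. The concrete values are illustrative assumptions (the PR defines only the field names), and this assumes `PipelineLoadParams` adds no required fields:

```typescript
// Hypothetical load request built from the interfaces above.
const request: PipelineLoadRequest = {
  pipeline_id: "vibevoice",
  load_params: {
    audio_path: "voices/reference.wav", // assumed: a reference voice sample
    chunk_size: 3200, // assumed: audio samples per streamed chunk
  },
};
```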
8 changes: 7 additions & 1 deletion frontend/src/pages/StreamPage.tsx
@@ -722,6 +722,7 @@ export function StreamPage() {
kv_cache_attention_bias?: number;
spout_sender?: { enabled: boolean; name: string };
spout_receiver?: { enabled: boolean; name: string };
text?: string;
} = {
// Signal the intended input mode to the backend so it doesn't
// briefly fall back to text mode before video frames arrive
@@ -766,11 +767,15 @@
initialParameters.spout_receiver = settings.spoutReceiver;
}

if (pipelineIdToUse === "vibevoice") {
initialParameters.text = promptItems[0]?.text ?? "";
}

// Reset paused state when starting a fresh stream
updateSettings({ paused: false });

// Pipeline is loaded, now start WebRTC stream
-      startStream(initialParameters, streamToSend);
+      startStream(initialParameters, streamToSend, pipelineIdToUse);

return true; // Stream started successfully
} catch (error) {
@@ -849,6 +854,7 @@
pipelineError={pipelineError}
isPlaying={!settings.paused}
isDownloading={isDownloading}
isAudioOnly={settings.pipelineId === "vibevoice"}
onPlayPauseToggle={() => {
// Use timeline's play/pause handler instead of direct video toggle
if (timelinePlayPauseRef.current) {
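Because vibevoice takes no media input, StreamPage forwards the first prompt as `text` in the offer's initial parameters and starts the stream without a local `MediaStream`. A condensed sketch of that start path, with state handling and the other parameters elided:

```typescript
// Condensed sketch of the vibevoice start path from the diff above.
const initialParameters = {
  text: promptItems[0]?.text ?? "", // first prompt becomes the TTS input
};
startStream(initialParameters, undefined, "vibevoice"); // no local stream to send
```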
3 changes: 2 additions & 1 deletion frontend/src/types/index.ts
Expand Up @@ -3,7 +3,8 @@ export type PipelineId =
| "passthrough"
| "longlive"
| "krea-realtime-video"
| "reward-forcing";
| "reward-forcing"
| "vibevoice";

// Input mode for pipeline operation
export type InputMode = "text" | "video";
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -38,7 +38,8 @@ dependencies = [
"easydict>=1.13",
"diffusers>=0.31.0",
"ftfy>=6.3.1",
"transformers>=4.49.0",
"transformers==4.51.3", # Pinned for VibeVoice compatibility
"vibevoice @ git+https://github.com/microsoft/VibeVoice.git",
"einops>=0.8.1",
"lmdb>=1.7.3",
"omegaconf>=2.3.0",
@@ -149,6 +150,9 @@ known-first-party = ["src"]
requires = ["hatchling", "editables"]
build-backend = "hatchling.build"

[tool.hatch.metadata]
allow-direct-references = true

[tool.hatch.build.targets.wheel]
packages = ["src/scope"]

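One non-obvious pairing in this file: the new `[tool.hatch.metadata]` block exists because of the git dependency above. Hatchling rejects direct references (URL requirements such as `vibevoice @ git+https://...`) in project metadata by default, so `allow-direct-references = true` is required for the build to succeed.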
10 changes: 10 additions & 0 deletions src/scope/core/pipelines/__init__.py
@@ -24,6 +24,10 @@ def __getattr__(name):
from .passthrough.pipeline import PassthroughPipeline

return PassthroughPipeline
elif name == "VibeVoicePipeline":
from .vibevoice.pipeline import VibeVoicePipeline

return VibeVoicePipeline
# Config classes
elif name == "BasePipelineConfig":
from .schema import BasePipelineConfig
@@ -45,6 +49,10 @@ def __getattr__(name):
from .schema import PassthroughConfig

return PassthroughConfig
elif name == "VibeVoiceConfig":
from .schema import VibeVoiceConfig

return VibeVoiceConfig
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


@@ -55,10 +63,12 @@ def __getattr__(name):
"RewardForcingPipeline",
"StreamDiffusionV2Pipeline",
"PassthroughPipeline",
"VibeVoicePipeline",
# Config classes
"BasePipelineConfig",
"LongLiveConfig",
"StreamDiffusionV2Config",
"KreaRealtimeVideoConfig",
"PassthroughConfig",
"VibeVoiceConfig",
]
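The `__getattr__` additions follow the module-level lazy-import pattern (PEP 562) already used for the other pipelines: VibeVoice's dependencies are only imported when the name is first accessed, keeping package import cheap. For example:

```python
# Importing the package does not pull in vibevoice's dependencies...
import scope.core.pipelines as pipelines

# ...until the attribute is first accessed, which runs the __getattr__ branch.
VibeVoicePipeline = pipelines.VibeVoicePipeline
```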
2 changes: 2 additions & 0 deletions src/scope/core/pipelines/registry.py
@@ -73,6 +73,7 @@ def _register_pipelines():
from .passthrough.pipeline import PassthroughPipeline
from .reward_forcing.pipeline import RewardForcingPipeline
from .streamdiffusionv2.pipeline import StreamDiffusionV2Pipeline
from .vibevoice.pipeline import VibeVoicePipeline

# Register each pipeline with its ID from its config class
for pipeline_class in [
@@ -81,6 +82,7 @@ def _register_pipelines():
StreamDiffusionV2Pipeline,
PassthroughPipeline,
RewardForcingPipeline,
VibeVoicePipeline,
]:
config_class = pipeline_class.get_config_class()
PipelineRegistry.register(config_class.pipeline_id, pipeline_class)
17 changes: 17 additions & 0 deletions src/scope/core/pipelines/schema.py
@@ -418,13 +418,30 @@ class PassthroughConfig(BasePipelineConfig):
)


class VibeVoiceConfig(BasePipelineConfig):
"""Configuration for VibeVoice text-to-speech pipeline."""

pipeline_id: ClassVar[str] = "vibevoice"
pipeline_name: ClassVar[str] = "VibeVoice"
pipeline_description: ClassVar[str] = "Streamed text-to-speech generation"

# Text-only pipeline
supported_modes: ClassVar[list[InputMode]] = ["text"]
default_mode: ClassVar[InputMode] = "text"

# Keep minimal defaults (not used for audio generation but required by base model)
height: int = Field(default=256, ge=1, description="Placeholder height")
width: int = Field(default=256, ge=1, description="Placeholder width")


# Registry of pipeline config classes
PIPELINE_CONFIGS: dict[str, type[BasePipelineConfig]] = {
"longlive": LongLiveConfig,
"streamdiffusionv2": StreamDiffusionV2Config,
"krea-realtime-video": KreaRealtimeVideoConfig,
"reward-forcing": RewardForcingConfig,
"passthrough": PassthroughConfig,
"vibevoice": VibeVoiceConfig,
}


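The placeholder `height`/`width` fields keep `VibeVoiceConfig` compatible with the video-oriented base schema even though audio generation never reads them. A quick sanity check using only names defined in this diff, assuming the base config class has no required constructor arguments:

```python
from scope.core.pipelines.schema import PIPELINE_CONFIGS

config_cls = PIPELINE_CONFIGS["vibevoice"]
cfg = config_cls()

assert cfg.pipeline_id == "vibevoice"
assert cfg.supported_modes == ["text"]
assert (cfg.height, cfg.width) == (256, 256)  # placeholders, unused for audio
```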
3 changes: 3 additions & 0 deletions src/scope/core/pipelines/vibevoice/__init__.py
@@ -0,0 +1,3 @@
from .pipeline import VibeVoicePipeline

__all__ = ["VibeVoicePipeline"]
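The `pipeline.py` this imports from is not included in the diff. From how the other files use the class, it must at least expose `get_config_class()` for the registry; a hypothetical skeleton, with everything beyond that method being guesswork:

```python
# Hypothetical skeleton of src/scope/core/pipelines/vibevoice/pipeline.py.
# Only get_config_class() is implied by registry.py; the rest is not shown in the PR.
from ..schema import VibeVoiceConfig


class VibeVoicePipeline:
    @classmethod
    def get_config_class(cls) -> type[VibeVoiceConfig]:
        return VibeVoiceConfig
```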