3 changes: 2 additions & 1 deletion frontend/src/components/InputAndControlsPanel.tsx
@@ -116,6 +116,7 @@ export function InputAndControlsPanel({

// Check if this pipeline supports multiple input modes
const isMultiMode = pipelineIsMultiMode(pipelineId);
const promptLabel = pipelineId === "vibevoice" ? "Text input" : "Prompts";

useEffect(() => {
if (videoRef.current && localStream) {
@@ -288,7 +289,7 @@ export function InputAndControlsPanel({
return (
<div>
<div className="flex items-center justify-between mb-2">
-            <h3 className="text-sm font-medium">Prompts</h3>
+            <h3 className="text-sm font-medium">{promptLabel}</h3>
{isEditMode && (
<Badge variant="secondary" className="text-xs">
Editing
93 changes: 90 additions & 3 deletions frontend/src/components/VideoOutput.tsx
@@ -14,6 +14,7 @@ interface VideoOutputProps {
onPlayPauseToggle?: () => void;
onStartStream?: () => void;
onVideoPlaying?: () => void;
isAudioOnly?: boolean;
}

export function VideoOutput({
@@ -27,17 +28,60 @@
onPlayPauseToggle,
onStartStream,
onVideoPlaying,
isAudioOnly = false,
}: VideoOutputProps) {
const videoRef = useRef<HTMLVideoElement>(null);
const audioRef = useRef<HTMLAudioElement>(null);
const [showOverlay, setShowOverlay] = useState(false);
const [isFadingOut, setIsFadingOut] = useState(false);
const [isAudioPlaying, setIsAudioPlaying] = useState(false);
const overlayTimeoutRef = useRef<number | null>(null);

useEffect(() => {
-    if (videoRef.current && remoteStream) {
+    if (!isAudioOnly && videoRef.current && remoteStream) {
videoRef.current.srcObject = remoteStream;
}
-  }, [remoteStream]);
+  }, [remoteStream, isAudioOnly]);

useEffect(() => {
if (isAudioOnly && audioRef.current && remoteStream) {
audioRef.current.srcObject = remoteStream;
}
}, [remoteStream, isAudioOnly]);

// Notify when audio starts playing (parity with video)
useEffect(() => {
if (!isAudioOnly) return;
const audio = audioRef.current;
if (!audio || !remoteStream) return;

const handlePlaying = () => {
setIsAudioPlaying(true);
onVideoPlaying?.();
};

const handlePause = () => {
setIsAudioPlaying(false);
};

const handleEnded = () => {
setIsAudioPlaying(false);
};

if (!audio.paused && audio.currentTime > 0 && !audio.ended) {
setIsAudioPlaying(true);
setTimeout(() => onVideoPlaying?.(), 0);
}

audio.addEventListener("playing", handlePlaying);
audio.addEventListener("pause", handlePause);
audio.addEventListener("ended", handleEnded);
return () => {
audio.removeEventListener("playing", handlePlaying);
audio.removeEventListener("pause", handlePause);
audio.removeEventListener("ended", handleEnded);
};
}, [isAudioOnly, onVideoPlaying, remoteStream]);

// Listen for video playing event to notify parent
useEffect(() => {
@@ -132,7 +176,50 @@ export function VideoOutput({
<CardTitle className="text-base font-medium">Video Output</CardTitle>
</CardHeader>
<CardContent className="flex-1 flex items-center justify-center min-h-0 p-4">
-        {remoteStream ? (
+        {remoteStream && isAudioOnly ? (
<div className="w-full h-full flex flex-col items-center justify-center">
{/* Hidden audio element for actual playback */}
<audio
ref={audioRef}
className="hidden"
autoPlay
playsInline
/>
{/* Audio visualization */}
<div className="relative flex items-center justify-center mb-6">
{/* Animated concentric circles */}
{isAudioPlaying && (
<>
<div className="absolute w-32 h-32 rounded-full border-2 border-purple-500/30 animate-ping" />
<div className="absolute w-40 h-40 rounded-full border-2 border-blue-500/20 animate-ping" style={{ animationDelay: "0.2s" }} />
<div className="absolute w-48 h-48 rounded-full border-2 border-purple-500/10 animate-ping" style={{ animationDelay: "0.4s" }} />
</>
)}
{/* Speaker icon */}
<div className="relative z-10 w-24 h-24 flex items-center justify-center">
<svg
className="w-full h-full text-white"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
strokeWidth={2}
>
<path
strokeLinecap="round"
strokeLinejoin="round"
d="M19.114 5.636a9 9 0 010 12.728M16.463 8.289a5 5 0 010 7.072M12 3v18M8 8l-4-4v12l4-4"
/>
</svg>
</div>
</div>
{/* Status text */}
<div className="text-center">
<p className="text-lg font-medium text-foreground">
{isAudioPlaying ? "Playing audio..." : "Audio ready"}
</p>
</div>
</div>
) : remoteStream ? (
<div
className="relative w-full h-full cursor-pointer flex items-center justify-center"
onClick={handleVideoClick}
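A note on the audio-playback detection added above: if the `<audio>` element began playing before the effect attached its listeners, the `playing` event has already fired and will not fire again, which is why the effect also checks `!audio.paused && audio.currentTime > 0 && !audio.ended` and defers the callback with a zero-delay `setTimeout`. A minimal sketch of the same pattern outside React; the function and element here are illustrative, not part of this PR:

```typescript
// Attach a MediaStream to a hidden <audio> element and report playback.
// Standalone sketch; assumes a browser DOM environment.
function attachAudioStream(stream: MediaStream, onPlaying: () => void): () => void {
  const audio = document.createElement("audio");
  audio.autoplay = true;
  audio.srcObject = stream;

  audio.addEventListener("playing", onPlaying);

  // If playback started before the listener was attached, "playing" will not
  // fire again for this play-through, so inspect the live state as well.
  if (!audio.paused && audio.currentTime > 0 && !audio.ended) {
    setTimeout(onPlaying, 0); // defer, as the component defers onVideoPlaying
  }

  // Cleanup mirrors the effect's return function.
  return () => audio.removeEventListener("playing", onPlaying);
}
```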
7 changes: 7 additions & 0 deletions frontend/src/data/pipelines.ts
@@ -98,6 +98,13 @@ export const PIPELINES: Record<string, PipelineInfo> = {
supportedModes: ["video"],
defaultMode: "video",
},
vibevoice: {
name: "VibeVoice",
about: "Streaming text-to-speech pipeline that outputs audio chunks.",
requiresModels: false,
supportedModes: ["text"],
defaultMode: "text",
},
};

export function pipelineSupportsLoRA(pipelineId: string): boolean {
15 changes: 10 additions & 5 deletions frontend/src/hooks/useWebRTC.ts
@@ -44,7 +44,11 @@ export function useWebRTC(options?: UseWebRTCOptions) {
const queuedCandidatesRef = useRef<RTCIceCandidate[]>([]);

const startStream = useCallback(
-    async (initialParameters?: InitialParameters, stream?: MediaStream) => {
+    async (
+      initialParameters?: InitialParameters,
+      stream?: MediaStream,
+      pipelineId?: string
+    ) => {
if (isConnecting || peerConnectionRef.current) return;

setIsConnecting(true);
@@ -127,7 +131,7 @@ export function useWebRTC(options?: UseWebRTCOptions) {
console.error("Data channel error:", error);
};

-      // Add video track for sending to server only if stream is provided
+      // Add media tracks based on pipeline type
if (stream) {
stream.getTracks().forEach(track => {
if (track.kind === "video") {
@@ -137,10 +141,11 @@
});
} else {
console.log(
"No video stream provided - adding video transceiver for no-input pipelines"
"No media stream provided - adding transceiver for no-input pipelines"
);
-        // For no-video-input pipelines, add a video transceiver to establish proper WebRTC connection
-        pc.addTransceiver("video");
+        const mediaKind = pipelineId === "vibevoice" ? "audio" : "video";
+        // For no-input pipelines, add a transceiver to establish proper WebRTC connection
+        pc.addTransceiver(mediaKind);
}

// Named event handlers
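The transceiver change is the core of audio support in this hook: an SDP offer that contains no media sections gives the server nothing to attach its generated track to, so no-input pipelines must reserve an m-line explicitly, and for vibevoice that m-line now needs to be audio rather than video. A standalone sketch of that negotiation step, assuming a bare `RTCPeerConnection` (the hook itself uses the default `sendrecv` direction; the `recvonly` shown here is an option for clients that never send media):

```typescript
// Reserve an audio m-line in the offer for a receive-only pipeline.
// Illustrative sketch, not the hook's actual code.
async function createAudioOnlyOffer(): Promise<RTCSessionDescriptionInit> {
  const pc = new RTCPeerConnection();

  // With no local tracks, addTransceiver() is what puts an audio section
  // into the SDP so the server's answer can carry its generated audio.
  pc.addTransceiver("audio", { direction: "recvonly" });

  pc.ontrack = event => {
    const [remoteStream] = event.streams;
    console.log("remote audio stream ready:", remoteStream.id);
  };

  const offer = await pc.createOffer();
  await pc.setLocalDescription(offer);
  return offer;
}
```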
7 changes: 7 additions & 0 deletions frontend/src/lib/api.ts
@@ -23,6 +23,7 @@ export interface WebRTCOfferRequest {
noise_controller?: boolean;
manage_cache?: boolean;
kv_cache_attention_bias?: number;
text?: string;
};
}

@@ -63,13 +64,19 @@ export interface KreaRealtimeVideoLoadParams extends PipelineLoadParams {
lora_merge_mode?: "permanent_merge" | "runtime_peft";
}

export interface VibeVoiceLoadParams extends PipelineLoadParams {
audio_path?: string;
chunk_size?: number;
}

export interface PipelineLoadRequest {
pipeline_id?: string;
load_params?:
| PassthroughLoadParams
| StreamDiffusionV2LoadParams
| LongLiveLoadParams
| KreaRealtimeVideoLoadParams
| VibeVoiceLoadParams
| null;
}

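Putting the new types together, a load request for vibevoice might look like the following. The concrete values are illustrative assumptions (the PR defines only the field names), and this assumes `PipelineLoadParams` adds no required fields:

```typescript
// Hypothetical load request built from the interfaces above.
const request: PipelineLoadRequest = {
  pipeline_id: "vibevoice",
  load_params: {
    audio_path: "voices/reference.wav", // assumed: a reference voice sample
    chunk_size: 3200, // assumed: audio samples per streamed chunk
  },
};
```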
8 changes: 7 additions & 1 deletion frontend/src/pages/StreamPage.tsx
@@ -722,6 +722,7 @@ export function StreamPage() {
kv_cache_attention_bias?: number;
spout_sender?: { enabled: boolean; name: string };
spout_receiver?: { enabled: boolean; name: string };
text?: string;
} = {
// Signal the intended input mode to the backend so it doesn't
// briefly fall back to text mode before video frames arrive
@@ -766,11 +767,15 @@
initialParameters.spout_receiver = settings.spoutReceiver;
}

if (pipelineIdToUse === "vibevoice") {
initialParameters.text = promptItems[0]?.text ?? "";
}

// Reset paused state when starting a fresh stream
updateSettings({ paused: false });

// Pipeline is loaded, now start WebRTC stream
-      startStream(initialParameters, streamToSend);
+      startStream(initialParameters, streamToSend, pipelineIdToUse);

return true; // Stream started successfully
} catch (error) {
@@ -849,6 +854,7 @@
pipelineError={pipelineError}
isPlaying={!settings.paused}
isDownloading={isDownloading}
isAudioOnly={settings.pipelineId === "vibevoice"}
onPlayPauseToggle={() => {
// Use timeline's play/pause handler instead of direct video toggle
if (timelinePlayPauseRef.current) {
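Because vibevoice takes no media input, StreamPage forwards the first prompt as `text` in the offer's initial parameters and starts the stream without a local `MediaStream`. A condensed sketch of that start path, with state handling and the other parameters elided:

```typescript
// Condensed sketch of the vibevoice start path from the diff above.
const initialParameters = {
  text: promptItems[0]?.text ?? "", // first prompt becomes the TTS input
};
startStream(initialParameters, undefined, "vibevoice"); // no local stream to send
```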
3 changes: 2 additions & 1 deletion frontend/src/types/index.ts
Expand Up @@ -3,7 +3,8 @@ export type PipelineId =
| "passthrough"
| "longlive"
| "krea-realtime-video"
| "reward-forcing";
| "reward-forcing"
| "vibevoice";

// Input mode for pipeline operation
export type InputMode = "text" | "video";
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -38,7 +38,8 @@ dependencies = [
"easydict>=1.13",
"diffusers>=0.31.0",
"ftfy>=6.3.1",
"transformers>=4.49.0",
"transformers==4.51.3", # Pinned for VibeVoice compatibility
"vibevoice @ git+https://github.com/microsoft/VibeVoice.git",
"einops>=0.8.1",
"lmdb>=1.7.3",
"omegaconf>=2.3.0",
@@ -149,6 +150,9 @@ known-first-party = ["src"]
requires = ["hatchling", "editables"]
build-backend = "hatchling.build"

[tool.hatch.metadata]
allow-direct-references = true

[tool.hatch.build.targets.wheel]
packages = ["src/scope"]

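One non-obvious pairing in this file: the new `[tool.hatch.metadata]` block exists because of the git dependency above. Hatchling rejects direct references (URL requirements such as `vibevoice @ git+https://...`) in project metadata by default, so `allow-direct-references = true` is required for the build to succeed.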
10 changes: 10 additions & 0 deletions src/scope/core/pipelines/__init__.py
@@ -24,6 +24,10 @@ def __getattr__(name):
from .passthrough.pipeline import PassthroughPipeline

return PassthroughPipeline
elif name == "VibeVoicePipeline":
from .vibevoice.pipeline import VibeVoicePipeline

return VibeVoicePipeline
# Config classes
elif name == "BasePipelineConfig":
from .schema import BasePipelineConfig
@@ -45,6 +49,10 @@ def __getattr__(name):
from .schema import PassthroughConfig

return PassthroughConfig
elif name == "VibeVoiceConfig":
from .schema import VibeVoiceConfig

return VibeVoiceConfig
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


@@ -55,10 +63,12 @@ def __getattr__(name):
"RewardForcingPipeline",
"StreamDiffusionV2Pipeline",
"PassthroughPipeline",
"VibeVoicePipeline",
# Config classes
"BasePipelineConfig",
"LongLiveConfig",
"StreamDiffusionV2Config",
"KreaRealtimeVideoConfig",
"PassthroughConfig",
"VibeVoiceConfig",
]
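The `__getattr__` additions follow the module-level lazy-import pattern (PEP 562) already used for the other pipelines: VibeVoice's dependencies are only imported when the name is first accessed, keeping package import cheap. For example:

```python
# Importing the package does not pull in vibevoice's dependencies...
import scope.core.pipelines as pipelines

# ...until the attribute is first accessed, which runs the __getattr__ branch.
VibeVoicePipeline = pipelines.VibeVoicePipeline
```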
2 changes: 2 additions & 0 deletions src/scope/core/pipelines/registry.py
@@ -73,6 +73,7 @@ def _register_pipelines():
from .passthrough.pipeline import PassthroughPipeline
from .reward_forcing.pipeline import RewardForcingPipeline
from .streamdiffusionv2.pipeline import StreamDiffusionV2Pipeline
from .vibevoice.pipeline import VibeVoicePipeline

# Register each pipeline with its ID from its config class
for pipeline_class in [
@@ -81,6 +82,7 @@ def _register_pipelines():
StreamDiffusionV2Pipeline,
PassthroughPipeline,
RewardForcingPipeline,
VibeVoicePipeline,
]:
config_class = pipeline_class.get_config_class()
PipelineRegistry.register(config_class.pipeline_id, pipeline_class)
17 changes: 17 additions & 0 deletions src/scope/core/pipelines/schema.py
@@ -418,13 +418,30 @@ class PassthroughConfig(BasePipelineConfig):
)


class VibeVoiceConfig(BasePipelineConfig):
"""Configuration for VibeVoice text-to-speech pipeline."""

pipeline_id: ClassVar[str] = "vibevoice"
pipeline_name: ClassVar[str] = "VibeVoice"
pipeline_description: ClassVar[str] = "Streamed text-to-speech generation"

# Text-only pipeline
supported_modes: ClassVar[list[InputMode]] = ["text"]
default_mode: ClassVar[InputMode] = "text"

# Keep minimal defaults (not used for audio generation but required by base model)
height: int = Field(default=256, ge=1, description="Placeholder height")
width: int = Field(default=256, ge=1, description="Placeholder width")


# Registry of pipeline config classes
PIPELINE_CONFIGS: dict[str, type[BasePipelineConfig]] = {
"longlive": LongLiveConfig,
"streamdiffusionv2": StreamDiffusionV2Config,
"krea-realtime-video": KreaRealtimeVideoConfig,
"reward-forcing": RewardForcingConfig,
"passthrough": PassthroughConfig,
"vibevoice": VibeVoiceConfig,
}


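The placeholder `height`/`width` fields keep `VibeVoiceConfig` compatible with the video-oriented base schema even though audio generation never reads them. A quick sanity check using only names defined in this diff, assuming the base config class has no required constructor arguments:

```python
from scope.core.pipelines.schema import PIPELINE_CONFIGS

config_cls = PIPELINE_CONFIGS["vibevoice"]
cfg = config_cls()

assert cfg.pipeline_id == "vibevoice"
assert cfg.supported_modes == ["text"]
assert (cfg.height, cfg.width) == (256, 256)  # placeholders, unused for audio
```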
3 changes: 3 additions & 0 deletions src/scope/core/pipelines/vibevoice/__init__.py
@@ -0,0 +1,3 @@
from .pipeline import VibeVoicePipeline

__all__ = ["VibeVoicePipeline"]
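The `pipeline.py` this imports from is not included in the diff. From how the other files use the class, it must at least expose `get_config_class()` for the registry; a hypothetical skeleton, with everything beyond that method being guesswork:

```python
# Hypothetical skeleton of src/scope/core/pipelines/vibevoice/pipeline.py.
# Only get_config_class() is implied by registry.py; the rest is not shown in the PR.
from ..schema import VibeVoiceConfig


class VibeVoicePipeline:
    @classmethod
    def get_config_class(cls) -> type[VibeVoiceConfig]:
        return VibeVoiceConfig
```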