From ab3e8d5b4a256fea2533b7603b5d679df0c9d44e Mon Sep 17 00:00:00 2001 From: Prince Canuma Date: Sun, 18 Jan 2026 23:11:17 +0100 Subject: [PATCH 1/3] Update README.md to enhance documentation for LTX-2 model features, CLI options, and usage examples. Add sections for audio-video generation, image-to-video conditioning, and detailed configuration options. --- README.md | 420 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 399 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 190bdf7..76a323a 100644 --- a/README.md +++ b/README.md @@ -16,32 +16,47 @@ pip install git+https://github.com/Blaizzy/mlx-video.git uv pip install git+https://github.com/Blaizzy/mlx-video.git ``` -Supported models: +## Supported Models ### LTX-2 -[LTX-2](https://huggingface.co/Lightricks/LTX-Video) is 19B parameter video generation model from Lightricks +[LTX-2](https://huggingface.co/Lightricks/LTX-Video) is a 19B parameter video generation model from Lightricks. ## Features - Text-to-video generation with the LTX-2 19B DiT model -- Two-stage generation pipeline for high-quality output -- 2x spatial upscaling for images and videos +- Image-to-video (I2V) conditioning +- Synchronized audio-video generation +- Two-stage generation pipeline (distilled model) +- Single-stage generation with CFG (dev model) +- Prompt enhancement using Gemma +- Memory-efficient tiled decoding +- Streaming frame output - Optimized for Apple Silicon using MLX +--- -## Usage +## Quick Start -> **ℹ️ Info:** Currently, only the distilled variant is supported. Full LTX-2 feature support is coming soon. - -### Text-to-Video Generation +### Text-to-Video Generation (Distilled Model) ```bash -uv run mlx_video.generate --prompt "Two dogs of the poodle breed wearing sunglasses, close up, cinematic, sunset" -n 100 --width 768 +python -m mlx_video.generate \ + --prompt "Two dogs of the poodle breed wearing sunglasses, close up, cinematic, sunset" \ + --num-frames 100 \ + --width 768 ``` Poodles demo -With custom settings: +--- + +## CLI Reference + +### 1. Distilled Model - Two-Stage Generation + +**Command**: `python -m mlx_video.generate` + +Uses a two-stage pipeline: generate at half resolution, upsample, then refine. ```bash python -m mlx_video.generate \ @@ -53,29 +68,383 @@ python -m mlx_video.generate \ --output my_video.mp4 ``` -### CLI Options +#### CLI Options | Option | Default | Description | |--------|---------|-------------| | `--prompt`, `-p` | (required) | Text description of the video | -| `--height`, `-H` | 512 | Output height (must be divisible by 64) | -| `--width`, `-W` | 512 | Output width (must be divisible by 64) | -| `--num-frames`, `-n` | 100 | Number of frames | +| `--height`, `-H` | 512 | Output height (divisible by 64) | +| `--width`, `-W` | 512 | Output width (divisible by 64) | +| `--num-frames`, `-n` | 100 | Number of frames (1 + 8*k format) | | `--seed`, `-s` | 42 | Random seed for reproducibility | | `--fps` | 24 | Frames per second | -| `--output`, `-o` | output.mp4 | Output video path | +| `--output-path`, `-o` | output.mp4 | Output video path | | `--save-frames` | false | Save individual frames as images | | `--model-repo` | Lightricks/LTX-2 | HuggingFace model repository | +| `--text-encoder-repo` | - | Custom text encoder repository | +| `--verbose` | false | Enable verbose output | +| `--enhance-prompt` | false | Enhance prompt using Gemma | +| `--max-tokens` | 512 | Max tokens for prompt enhancement | +| `--temperature` | 0.7 | Temperature for prompt enhancement | +| `--image`, `-i` | - | Path to image for I2V conditioning | +| `--image-strength` | 1.0 | I2V conditioning strength (0.0-1.0) | +| `--image-frame-idx` | 0 | Frame index to condition on | +| `--tiling` | auto | Tiling mode (see below) | +| `--stream` | false | Stream frames as they're decoded | +| `--audio` | false | Generate synchronized audio | + +--- + +### 2. Dev Model - Single-Stage with CFG + +**Command**: `python -m mlx_video.generate_dev` + +Uses classifier-free guidance (CFG) for higher quality generation. + +```bash +python -m mlx_video.generate_dev \ + --prompt "A cinematic shot of a mountain landscape at golden hour" \ + --negative-prompt "blurry, low quality" \ + --height 512 \ + --width 768 \ + --num-frames 33 \ + --steps 40 \ + --cfg-scale 4.0 +``` + +#### CLI Options + +| Option | Default | Description | +|--------|---------|-------------| +| `--prompt`, `-p` | (required) | Text description of the video | +| `--negative-prompt` | (comprehensive default) | CFG negative prompt | +| `--height`, `-H` | 512 | Video height (divisible by 64) | +| `--width`, `-W` | 768 | Video width (divisible by 64) | +| `--num-frames`, `-n` | 33 | Number of frames | +| `--steps` | 40 | Number of inference steps | +| `--cfg-scale` | 4.0 | CFG guidance scale | +| `--seed`, `-s` | 42 | Random seed | +| `--fps` | 24 | Frames per second | +| `--output-path` | output_dev.mp4 | Output video path | +| `--output-audio` | - | Output audio path (if `--audio` enabled) | +| `--save-frames` | false | Save individual frames | +| `--audio` | false | Generate synchronized audio | +| `--verbose` | false | Enable verbose output | +| `--enhance-prompt` | false | Enhance prompt using Gemma | +| `--image`, `-i` | - | Path to image for I2V | +| `--image-strength` | 1.0 | I2V conditioning strength | +| `--image-frame-idx` | 0 | Frame index to condition | +| `--tiling` | none | Tiling mode | + +--- + +### 3. Audio-Video Generation (Distilled) + +**Command**: `python -m mlx_video.generate_av` + +Generates synchronized video and audio. + +```bash +python -m mlx_video.generate_av \ + --prompt "Ocean waves crashing on rocks, seagulls calling" \ + --height 512 \ + --width 512 \ + --num-frames 65 \ + --output-path output_av.mp4 \ + --output-audio output.wav +``` + +#### CLI Options + +| Option | Default | Description | +|--------|---------|-------------| +| `--prompt`, `-p` | (required) | Text description | +| `--height`, `-H` | 512 | Video height | +| `--width`, `-W` | 512 | Video width | +| `--num-frames`, `-n` | 65 | Number of frames | +| `--seed`, `-s` | 42 | Random seed | +| `--fps` | 24 | Frames per second | +| `--output-path` | output_av.mp4 | Output video path | +| `--output-audio` | - | Output audio path (.wav) | +| `--verbose` | false | Enable verbose output | +| `--enhance-prompt` | false | Enhance prompt | +| `--image`, `-i` | - | Path to image for I2V | +| `--image-strength` | 1.0 | I2V conditioning strength | +| `--image-frame-idx` | 0 | Frame index to condition | +| `--tiling` | auto | Tiling mode | + +--- + +## Tiling Modes + +Tiling enables memory-efficient decoding for large videos: + +| Mode | Description | +|------|-------------| +| `auto` | Automatically determines based on video size | +| `none` | Disable tiling (requires more memory) | +| `default` | 512px spatial, 64 frame temporal | +| `aggressive` | 256px spatial, 32 frame temporal (lowest memory) | +| `conservative` | 768px spatial, 96 frame temporal (faster) | +| `spatial` | Spatial tiling only | +| `temporal` | Temporal tiling only | + +--- + +## Image-to-Video (I2V) Conditioning + +Condition video generation on an input image: + +```bash +# First frame conditioning +python -m mlx_video.generate \ + --prompt "A cat walking across a sunny garden" \ + --image cat.jpg \ + --image-strength 1.0 \ + --image-frame-idx 0 + +# Middle frame conditioning +python -m mlx_video.generate_dev \ + --prompt "A person turning around" \ + --image person.jpg \ + --image-frame-idx 16 \ + --num-frames 33 +``` + +--- + +## Python API + +### Basic Video Generation + +```python +from mlx_video.generate import generate_video + +# Generate a video +generate_video( + model_repo="Lightricks/LTX-2", + prompt="A beautiful sunset over the ocean", + height=512, + width=768, + num_frames=65, + seed=42, + fps=24, + output_path="output.mp4", +) +``` + +### Dev Model with CFG + +```python +from mlx_video.generate_dev import generate_video_dev + +generate_video_dev( + model_repo="Lightricks/LTX-2", + prompt="Cinematic shot of a forest", + negative_prompt="blurry, low quality", + height=512, + width=768, + num_frames=33, + num_inference_steps=40, + cfg_scale=4.0, + output_path="output_dev.mp4", +) +``` + +### Audio-Video Generation + +```python +from mlx_video.generate_av import generate_video_with_audio + +generate_video_with_audio( + model_repo="Lightricks/LTX-2", + prompt="Thunder and lightning in a storm", + height=512, + width=512, + num_frames=65, + output_path="output_av.mp4", + output_audio_path="output.wav", +) +``` + +### Image-to-Video Conditioning + +```python +from mlx_video.generate import generate_video + +# Condition on first frame +generate_video( + model_repo="Lightricks/LTX-2", + prompt="A cat walking", + image="cat.jpg", + image_strength=1.0, + image_frame_idx=0, + output_path="output.mp4", +) +``` + +### Model Loading + +```python +from mlx_video.convert import ( + get_model_path, + load_transformer_weights, + load_vae_weights, +) +from mlx_video.models.ltx import LTXModel, LTXModelConfig + +# Get model path (downloads if needed) +model_path = get_model_path("Lightricks/LTX-2") + +# Load transformer +config = LTXModelConfig() +model = LTXModel(config) +weights = load_transformer_weights(model_path) +model.load_weights(list(weights.items())) + +# Load VAE weights +vae_weights = load_vae_weights(model_path) +``` + +### VAE Encoder/Decoder + +```python +from mlx_video.models.ltx.video_vae.encoder import load_vae_encoder +from mlx_video.models.ltx.video_vae.decoder import load_vae_decoder +from mlx_video.models.ltx.video_vae.tiling import TilingConfig + +# Load encoder +encoder = load_vae_encoder(model_path) +latents = encoder(image) # (B, 3, H, W) -> (B, 128, H/32, W/32) + +# Load decoder +decoder = load_vae_decoder(model_path) + +# Standard decode +video = decoder.decode(latents) + +# Memory-efficient tiled decode +video = decoder.decode_tiled( + latents, + tiling_config=TilingConfig.auto(height, width, num_frames), + tiling_mode="auto", +) + +# Streaming decode (callback per batch of frames) +def on_frames_ready(frames): + # Process frames as they're decoded + pass + +decoder.decode_tiled( + latents, + tiling_config=TilingConfig.auto(height, width, num_frames), + on_frames_ready=on_frames_ready, +) +``` + +### Audio VAE + +```python +from mlx_video.generate_av import load_audio_decoder, load_vocoder, save_audio + +# Load audio components +audio_decoder = load_audio_decoder(model_path) +vocoder = load_vocoder(model_path) + +# Decode audio latents to mel-spectrogram +mel_spectrogram = audio_decoder(audio_latents) + +# Convert mel to waveform +audio_waveform = vocoder(mel_spectrogram) + +# Save audio +save_audio(audio_waveform, "output.wav", sample_rate=24000) +``` + +### Text Encoder with Prompt Enhancement + +```python +from mlx_video.models.ltx.text_encoder import LTX2TextEncoder + +# Load text encoder +text_encoder = LTX2TextEncoder.load(model_path, text_encoder_path) + +# Get embeddings +video_embeddings, audio_embeddings = text_encoder( + prompt="A cat walking", + return_audio_embeddings=True, +) + +# Enhance prompt with Gemma +enhanced_prompt = text_encoder.enhance_t2v( + prompt="cat walking", + max_tokens=512, + temperature=0.7, + verbose=True, +) +``` + +### Configuration + +```python +from mlx_video.models.ltx.config import ( + LTXModelConfig, + LTXModelType, + LTXRopeType, +) + +# Create config for different model types +config = LTXModelConfig( + model_type=LTXModelType.AudioVideo, # or VideoOnly, AudioOnly + num_attention_heads=32, + attention_head_dim=128, + num_layers=48, + rope_type=LTXRopeType.SPLIT, + double_precision_rope=True, +) + +# Access derived properties +print(config.inner_dim) # Video inner dimension +print(config.audio_inner_dim) # Audio inner dimension +``` + +--- ## How It Works -The pipeline uses a two-stage generation process: +### Two-Stage Pipeline (Distilled Model) 1. **Stage 1**: Generate at half resolution (e.g., 384x384) with 8 denoising steps 2. **Upsample**: 2x spatial upsampling via LatentUpsampler 3. **Stage 2**: Refine at full resolution (e.g., 768x768) with 3 denoising steps 4. **Decode**: VAE decoder converts latents to RGB video +### Single-Stage Pipeline (Dev Model) + +1. **Denoise**: Full resolution with CFG guidance over N steps +2. **Decode**: VAE decoder converts latents to RGB video + +### Audio Generation + +Audio is generated in parallel with video using: +- Shared transformer backbone with modality-specific attention +- Audio VAE for latent encoding/decoding +- HiFi-GAN vocoder for mel-to-waveform conversion + +--- + +## Audio Configuration + +| Constant | Value | Description | +|----------|-------|-------------| +| `AUDIO_SAMPLE_RATE` | 24000 | Output audio sample rate | +| `AUDIO_LATENT_SAMPLE_RATE` | 16000 | VAE internal sample rate | +| `AUDIO_HOP_LENGTH` | 160 | Mel-spectrogram hop length | +| `AUDIO_LATENT_CHANNELS` | 8 | Audio latent channels | +| `AUDIO_MEL_BINS` | 16 | Mel frequency bins | + +--- + ## Requirements - macOS with Apple Silicon @@ -85,27 +454,36 @@ The pipeline uses a two-stage generation process: ## Model Specifications - **Transformer**: 48 layers, 32 attention heads, 128 dim per head -- **Latent channels**: 128 -- **Text encoder**: Gemma 3 with 3840-dim output +- **Latent channels**: 128 (video), 8 (audio) +- **Text encoder**: Gemma 3 with 3840-dim output (video), 2048-dim (audio) - **RoPE**: Split mode with double precision ## Project Structure ``` mlx_video/ -├── generate.py # Video generation pipeline +├── __init__.py +├── generate.py # Two-stage video generation (distilled) +├── generate_av.py # Audio-video generation (distilled) +├── generate_dev.py # Single-stage generation with CFG (dev) ├── convert.py # Weight conversion (PyTorch -> MLX) ├── postprocess.py # Video post-processing utilities ├── utils.py # Helper functions +├── components/ +│ └── patchifiers.py # Video/Audio patchifiers +├── conditioning/ +│ ├── keyframe.py # Keyframe conditioning +│ └── latent.py # Latent state & I2V conditioning └── models/ └── ltx/ ├── ltx.py # Main LTXModel (DiT transformer) ├── config.py # Model configuration ├── transformer.py # Transformer blocks ├── attention.py # Multi-head attention with RoPE - ├── text_encoder.py # Text encoder + ├── text_encoder.py # Text encoder with AV support ├── upsampler.py # 2x spatial upsampler - └── video_vae/ # VAE encoder/decoder + ├── video_vae/ # Video VAE encoder/decoder + └── audio_vae/ # Audio VAE encoder/decoder/vocoder ``` ## License From 32f6925c1e3994d18eb8112d507075bf913e6999 Mon Sep 17 00:00:00 2001 From: Prince Canuma Date: Sun, 18 Jan 2026 23:48:32 +0100 Subject: [PATCH 2/3] Enhance get_model_path function documentation with detailed argument and return descriptions for improved clarity. --- mlx_video/utils.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mlx_video/utils.py b/mlx_video/utils.py index cebbed7..990cde2 100644 --- a/mlx_video/utils.py +++ b/mlx_video/utils.py @@ -10,7 +10,19 @@ from PIL import Image def get_model_path(model_repo: str): - """Get or download LTX-2 model path.""" + """Get or download model path. + + Args: + model_repo: Either a Hugging Face repo ID (e.g., 'namespace/repo_name') + or a local path to the model directory. + + Returns: + Path to the model directory. + """ + local_path = Path(model_repo) + if local_path.exists() and local_path.is_dir(): + return local_path + try: return Path(snapshot_download(repo_id=model_repo, local_files_only=True)) except Exception: From 9f0457c31f8883007898d75f14538e431fa0f625 Mon Sep 17 00:00:00 2001 From: Prince Canuma Date: Mon, 19 Jan 2026 00:29:51 +0100 Subject: [PATCH 3/3] Update model repository defaults --- mlx_video/generate.py | 4 ++-- mlx_video/generate_av.py | 4 ++-- mlx_video/generate_dev.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mlx_video/generate.py b/mlx_video/generate.py index 9a72fe9..20dcb0e 100644 --- a/mlx_video/generate.py +++ b/mlx_video/generate.py @@ -636,8 +636,8 @@ def main(): parser.add_argument( "--model-repo", type=str, - default="Lightricks/LTX-2", - help="Model repository to use (default: Lightricks/LTX-2)" + default="mlx-community/LTX-2-distilled-bf16", + help="Model repository to use (default: mlx-community/LTX-2-distilled-bf16)" ) parser.add_argument( "--text-encoder-repo", diff --git a/mlx_video/generate_av.py b/mlx_video/generate_av.py index e0fb22b..bf241c7 100644 --- a/mlx_video/generate_av.py +++ b/mlx_video/generate_av.py @@ -769,8 +769,8 @@ def main(): help="Output video path (default: output_av.mp4)") parser.add_argument("--output-audio", type=str, default=None, help="Output audio path (default: same as video with .wav)") - parser.add_argument("--model-repo", type=str, default="Lightricks/LTX-2", - help="Model repository (default: Lightricks/LTX-2)") + parser.add_argument("--model-repo", type=str, default="mlx-community/LTX-2-distilled-bf16", + help="Model repository (default: mlx-community/LTX-2-distilled-bf16)") parser.add_argument("--text-encoder-repo", type=str, default=None, help="Text encoder repository") parser.add_argument("--verbose", action="store_true", diff --git a/mlx_video/generate_dev.py b/mlx_video/generate_dev.py index 791c9ba..6717abd 100644 --- a/mlx_video/generate_dev.py +++ b/mlx_video/generate_dev.py @@ -1180,8 +1180,8 @@ def main(): parser.add_argument( "--model-repo", type=str, - default="Lightricks/LTX-2", - help="Model repository to use (default: Lightricks/LTX-2)" + default="mlx-community/LTX-2-dev-bf16", + help="Model repository to use (default: mlx-community/LTX-2-dev-bf16)" ) parser.add_argument( "--text-encoder-repo",