From ab3e8d5b4a256fea2533b7603b5d679df0c9d44e Mon Sep 17 00:00:00 2001
From: Prince Canuma <prince.gdt@gmail.com>
Date: Sun, 18 Jan 2026 23:11:17 +0100
Subject: [PATCH 1/3] Update README.md to enhance documentation for LTX-2 model
 features, CLI options, and usage examples. Add sections for audio-video
 generation, image-to-video conditioning, and detailed configuration options.

---
 README.md | 420 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 399 insertions(+), 21 deletions(-)
diff --git a/README.md b/README.md
index 190bdf7..76a323a 100644
--- a/README.md
+++ b/README.md
@@ -16,32 +16,47 @@ pip install git+https://github.com/Blaizzy/mlx-video.git
 uv pip install git+https://github.com/Blaizzy/mlx-video.git
 ```
 
-Supported models:
+## Supported Models
 
 ### LTX-2
-[LTX-2](https://huggingface.co/Lightricks/LTX-Video) is 19B parameter video generation model from Lightricks
+[LTX-2](https://huggingface.co/Lightricks/LTX-Video) is a 19B parameter video generation model from Lightricks.
 
 ## Features
 
 - Text-to-video generation with the LTX-2 19B DiT model
-- Two-stage generation pipeline for high-quality output
-- 2x spatial upscaling for images and videos
+- Image-to-video (I2V) conditioning
+- Synchronized audio-video generation
+- Two-stage generation pipeline (distilled model)
+- Single-stage generation with CFG (dev model)
+- Prompt enhancement using Gemma
+- Memory-efficient tiled decoding
+- Streaming frame output
 - Optimized for Apple Silicon using MLX
 
+---
 
-## Usage
+## Quick Start
 
-> **ℹ️ Info:** Currently, only the distilled variant is supported. Full LTX-2 feature support is coming soon.
-
-### Text-to-Video Generation
+### Text-to-Video Generation (Distilled Model)
 
 ```bash
-uv run mlx_video.generate --prompt "Two dogs of the poodle breed wearing sunglasses, close up, cinematic, sunset" -n 100 --width 768
+python -m mlx_video.generate \
+    --prompt "Two dogs of the poodle breed wearing sunglasses, close up, cinematic, sunset" \
+    --num-frames 100 \
+    --width 768
 ```
 
 <img src="https://github.com/Blaizzy/mlx-video/raw/main/examples/poodles.gif" width="512" alt="Poodles demo">
 
-With custom settings:
+---
+
+## CLI Reference
+
+### 1. Distilled Model - Two-Stage Generation
+
+**Command**: `python -m mlx_video.generate`
+
+Uses a two-stage pipeline: generate at half resolution, upsample, then refine.
 
 ```bash
 python -m mlx_video.generate \
@@ -53,29 +68,383 @@ python -m mlx_video.generate \
     --output my_video.mp4
 ```
 
-### CLI Options
+#### CLI Options
 
 | Option | Default | Description |
 |--------|---------|-------------|
 | `--prompt`, `-p` | (required) | Text description of the video |
-| `--height`, `-H` | 512 | Output height (must be divisible by 64) |
-| `--width`, `-W` | 512 | Output width (must be divisible by 64) |
-| `--num-frames`, `-n` | 100 | Number of frames |
+| `--height`, `-H` | 512 | Output height (divisible by 64) |
+| `--width`, `-W` | 512 | Output width (divisible by 64) |
+| `--num-frames`, `-n` | 100 | Number of frames (of the form 1 + 8*k, e.g. 33, 65, 97) |
 | `--seed`, `-s` | 42 | Random seed for reproducibility |
 | `--fps` | 24 | Frames per second |
-| `--output`, `-o` | output.mp4 | Output video path |
+| `--output-path`, `-o` | output.mp4 | Output video path |
 | `--save-frames` | false | Save individual frames as images |
 | `--model-repo` | Lightricks/LTX-2 | HuggingFace model repository |
+| `--text-encoder-repo` | - | Custom text encoder repository |
+| `--verbose` | false | Enable verbose output |
+| `--enhance-prompt` | false | Enhance prompt using Gemma |
+| `--max-tokens` | 512 | Max tokens for prompt enhancement |
+| `--temperature` | 0.7 | Temperature for prompt enhancement |
+| `--image`, `-i` | - | Path to image for I2V conditioning |
+| `--image-strength` | 1.0 | I2V conditioning strength (0.0-1.0) |
+| `--image-frame-idx` | 0 | Frame index to condition on |
+| `--tiling` | auto | Tiling mode (see below) |
+| `--stream` | false | Stream frames as they're decoded |
+| `--audio` | false | Generate synchronized audio |
+
+---
+
+### 2. Dev Model - Single-Stage with CFG
+
+**Command**: `python -m mlx_video.generate_dev`
+
+Uses classifier-free guidance (CFG) for higher-quality generation.
+
+```bash
+python -m mlx_video.generate_dev \
+    --prompt "A cinematic shot of a mountain landscape at golden hour" \
+    --negative-prompt "blurry, low quality" \
+    --height 512 \
+    --width 768 \
+    --num-frames 33 \
+    --steps 40 \
+    --cfg-scale 4.0
+```
+
+#### CLI Options
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `--prompt`, `-p` | (required) | Text description of the video |
+| `--negative-prompt` | (comprehensive default) | CFG negative prompt |
+| `--height`, `-H` | 512 | Video height (divisible by 64) |
+| `--width`, `-W` | 768 | Video width (divisible by 64) |
+| `--num-frames`, `-n` | 33 | Number of frames |
+| `--steps` | 40 | Number of inference steps |
+| `--cfg-scale` | 4.0 | CFG guidance scale |
+| `--seed`, `-s` | 42 | Random seed |
+| `--fps` | 24 | Frames per second |
+| `--output-path` | output_dev.mp4 | Output video path |
+| `--output-audio` | - | Output audio path (if `--audio` enabled) |
+| `--save-frames` | false | Save individual frames |
+| `--audio` | false | Generate synchronized audio |
+| `--verbose` | false | Enable verbose output |
+| `--enhance-prompt` | false | Enhance prompt using Gemma |
+| `--image`, `-i` | - | Path to image for I2V |
+| `--image-strength` | 1.0 | I2V conditioning strength |
+| `--image-frame-idx` | 0 | Frame index to condition |
+| `--tiling` | none | Tiling mode |
+
+---
+
+### 3. Audio-Video Generation (Distilled)
+
+**Command**: `python -m mlx_video.generate_av`
+
+Generates synchronized video and audio.
+
+```bash
+python -m mlx_video.generate_av \
+    --prompt "Ocean waves crashing on rocks, seagulls calling" \
+    --height 512 \
+    --width 512 \
+    --num-frames 65 \
+    --output-path output_av.mp4 \
+    --output-audio output.wav
+```
+
+#### CLI Options
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `--prompt`, `-p` | (required) | Text description |
+| `--height`, `-H` | 512 | Video height |
+| `--width`, `-W` | 512 | Video width |
+| `--num-frames`, `-n` | 65 | Number of frames |
+| `--seed`, `-s` | 42 | Random seed |
+| `--fps` | 24 | Frames per second |
+| `--output-path` | output_av.mp4 | Output video path |
+| `--output-audio` | video path with `.wav` extension | Output audio path |
+| `--verbose` | false | Enable verbose output |
+| `--enhance-prompt` | false | Enhance prompt |
+| `--image`, `-i` | - | Path to image for I2V |
+| `--image-strength` | 1.0 | I2V conditioning strength |
+| `--image-frame-idx` | 0 | Frame index to condition |
+| `--tiling` | auto | Tiling mode |
+
+---
+
+## Tiling Modes
+
+Tiling enables memory-efficient decoding for large videos:
+
+| Mode | Description |
+|------|-------------|
+| `auto` | Automatically determines based on video size |
+| `none` | Disable tiling (requires more memory) |
+| `default` | 512px spatial tiles, 64-frame temporal tiles |
+| `aggressive` | 256px spatial tiles, 32-frame temporal tiles (lowest memory) |
+| `conservative` | 768px spatial tiles, 96-frame temporal tiles (faster) |
+| `spatial` | Spatial tiling only |
+| `temporal` | Temporal tiling only |
+
+---
+
+## Image-to-Video (I2V) Conditioning
+
+Condition video generation on an input image:
+
+```bash
+# First frame conditioning
+python -m mlx_video.generate \
+    --prompt "A cat walking across a sunny garden" \
+    --image cat.jpg \
+    --image-strength 1.0 \
+    --image-frame-idx 0
+
+# Middle frame conditioning
+python -m mlx_video.generate_dev \
+    --prompt "A person turning around" \
+    --image person.jpg \
+    --image-frame-idx 16 \
+    --num-frames 33
+```
+
+---
+
+## Python API
+
+### Basic Video Generation
+
+```python
+from mlx_video.generate import generate_video
+
+# Generate a video
+generate_video(
+    model_repo="Lightricks/LTX-2",
+    prompt="A beautiful sunset over the ocean",
+    height=512,
+    width=768,
+    num_frames=65,
+    seed=42,
+    fps=24,
+    output_path="output.mp4",
+)
+```
+
+### Dev Model with CFG
+
+```python
+from mlx_video.generate_dev import generate_video_dev
+
+generate_video_dev(
+    model_repo="Lightricks/LTX-2",
+    prompt="Cinematic shot of a forest",
+    negative_prompt="blurry, low quality",
+    height=512,
+    width=768,
+    num_frames=33,
+    num_inference_steps=40,
+    cfg_scale=4.0,
+    output_path="output_dev.mp4",
+)
+```
+
+### Audio-Video Generation
+
+```python
+from mlx_video.generate_av import generate_video_with_audio
+
+generate_video_with_audio(
+    model_repo="Lightricks/LTX-2",
+    prompt="Thunder and lightning in a storm",
+    height=512,
+    width=512,
+    num_frames=65,
+    output_path="output_av.mp4",
+    output_audio_path="output.wav",
+)
+```
+
+### Image-to-Video Conditioning
+
+```python
+from mlx_video.generate import generate_video
+
+# Condition on first frame
+generate_video(
+    model_repo="Lightricks/LTX-2",
+    prompt="A cat walking",
+    image="cat.jpg",
+    image_strength=1.0,
+    image_frame_idx=0,
+    output_path="output.mp4",
+)
+```
+
+### Model Loading
+
+```python
+from mlx_video.convert import (
+    load_transformer_weights,
+    load_vae_weights,
+)
+from mlx_video.utils import get_model_path
+from mlx_video.models.ltx import LTXModel, LTXModelConfig
+
+# Get model path (downloads if needed)
+model_path = get_model_path("Lightricks/LTX-2")
+
+# Load transformer
+config = LTXModelConfig()
+model = LTXModel(config)
+weights = load_transformer_weights(model_path)
+model.load_weights(list(weights.items()))
+
+# Load VAE weights
+vae_weights = load_vae_weights(model_path)
+```
+
+### VAE Encoder/Decoder
+
+```python
+from mlx_video.models.ltx.video_vae.encoder import load_vae_encoder
+from mlx_video.models.ltx.video_vae.decoder import load_vae_decoder
+from mlx_video.models.ltx.video_vae.tiling import TilingConfig
+
+# Load encoder
+encoder = load_vae_encoder(model_path)
+latents = encoder(image)  # (B, 3, H, W) -> (B, 128, H/32, W/32)
+
+# Load decoder
+decoder = load_vae_decoder(model_path)
+
+# Standard decode
+video = decoder.decode(latents)
+
+# Memory-efficient tiled decode
+video = decoder.decode_tiled(
+    latents,
+    tiling_config=TilingConfig.auto(height, width, num_frames),
+    tiling_mode="auto",
+)
+
+# Streaming decode (callback per batch of frames)
+def on_frames_ready(frames):
+    # Process frames as they're decoded
+    pass
+
+decoder.decode_tiled(
+    latents,
+    tiling_config=TilingConfig.auto(height, width, num_frames),
+    on_frames_ready=on_frames_ready,
+)
+```
+
+### Audio VAE
+
+```python
+from mlx_video.generate_av import load_audio_decoder, load_vocoder, save_audio
+
+# Load audio components
+audio_decoder = load_audio_decoder(model_path)
+vocoder = load_vocoder(model_path)
+
+# Decode audio latents to mel-spectrogram
+mel_spectrogram = audio_decoder(audio_latents)
+
+# Convert mel to waveform
+audio_waveform = vocoder(mel_spectrogram)
+
+# Save audio
+save_audio(audio_waveform, "output.wav", sample_rate=24000)
+```
+
+### Text Encoder with Prompt Enhancement
+
+```python
+from mlx_video.models.ltx.text_encoder import LTX2TextEncoder
+
+# Load text encoder
+text_encoder = LTX2TextEncoder.load(model_path, text_encoder_path)
+
+# Get embeddings
+video_embeddings, audio_embeddings = text_encoder(
+    prompt="A cat walking",
+    return_audio_embeddings=True,
+)
+
+# Enhance prompt with Gemma
+enhanced_prompt = text_encoder.enhance_t2v(
+    prompt="cat walking",
+    max_tokens=512,
+    temperature=0.7,
+    verbose=True,
+)
+```
+
+### Configuration
+
+```python
+from mlx_video.models.ltx.config import (
+    LTXModelConfig,
+    LTXModelType,
+    LTXRopeType,
+)
+
+# Create config for different model types
+config = LTXModelConfig(
+    model_type=LTXModelType.AudioVideo,  # or VideoOnly, AudioOnly
+    num_attention_heads=32,
+    attention_head_dim=128,
+    num_layers=48,
+    rope_type=LTXRopeType.SPLIT,
+    double_precision_rope=True,
+)
+
+# Access derived properties
+print(config.inner_dim)        # Video inner dimension
+print(config.audio_inner_dim)  # Audio inner dimension
+```
+
+---
 
 ## How It Works
 
-The pipeline uses a two-stage generation process:
+### Two-Stage Pipeline (Distilled Model)
 
 1. **Stage 1**: Generate at half resolution (e.g., 384x384) with 8 denoising steps
 2. **Upsample**: 2x spatial upsampling via LatentUpsampler
 3. **Stage 2**: Refine at full resolution (e.g., 768x768) with 3 denoising steps
 4. **Decode**: VAE decoder converts latents to RGB video
 
+### Single-Stage Pipeline (Dev Model)
+
+1. **Denoise**: Denoise at full resolution with classifier-free guidance (CFG) over N steps (`--steps`)
+2. **Decode**: VAE decoder converts latents to RGB video
+
+### Audio Generation
+
+Audio is generated in parallel with video using:
+- Shared transformer backbone with modality-specific attention
+- Audio VAE for latent encoding/decoding
+- HiFi-GAN vocoder for mel-to-waveform conversion
+
+---
+
+## Audio Configuration
+
+| Constant | Value | Description |
+|----------|-------|-------------|
+| `AUDIO_SAMPLE_RATE` | 24000 | Output audio sample rate |
+| `AUDIO_LATENT_SAMPLE_RATE` | 16000 | VAE internal sample rate |
+| `AUDIO_HOP_LENGTH` | 160 | Mel-spectrogram hop length |
+| `AUDIO_LATENT_CHANNELS` | 8 | Audio latent channels |
+| `AUDIO_MEL_BINS` | 16 | Mel frequency bins |
+
+---
+
 ## Requirements
 
 - macOS with Apple Silicon
@@ -85,27 +454,36 @@ The pipeline uses a two-stage generation process:
 ## Model Specifications
 
 - **Transformer**: 48 layers, 32 attention heads, 128 dim per head
-- **Latent channels**: 128
-- **Text encoder**: Gemma 3 with 3840-dim output
+- **Latent channels**: 128 (video), 8 (audio)
+- **Text encoder**: Gemma 3 with 3840-dim output (video), 2048-dim (audio)
 - **RoPE**: Split mode with double precision
 
 ## Project Structure
 
 ```
 mlx_video/
-├── generate.py             # Video generation pipeline
+├── __init__.py
+├── generate.py             # Two-stage video generation (distilled)
+├── generate_av.py          # Audio-video generation (distilled)
+├── generate_dev.py         # Single-stage generation with CFG (dev)
 ├── convert.py              # Weight conversion (PyTorch -> MLX)
 ├── postprocess.py          # Video post-processing utilities
 ├── utils.py                # Helper functions
+├── components/
+│   └── patchifiers.py      # Video/Audio patchifiers
+├── conditioning/
+│   ├── keyframe.py         # Keyframe conditioning
+│   └── latent.py           # Latent state & I2V conditioning
 └── models/
     └── ltx/
         ├── ltx.py          # Main LTXModel (DiT transformer)
         ├── config.py       # Model configuration
         ├── transformer.py  # Transformer blocks
         ├── attention.py    # Multi-head attention with RoPE
-        ├── text_encoder.py # Text encoder
+        ├── text_encoder.py # Text encoder with AV support
         ├── upsampler.py    # 2x spatial upsampler
-        └── video_vae/      # VAE encoder/decoder
+        ├── video_vae/      # Video VAE encoder/decoder
+        └── audio_vae/      # Audio VAE encoder/decoder/vocoder
 ```
 
 ## License

From 32f6925c1e3994d18eb8112d507075bf913e6999 Mon Sep 17 00:00:00 2001
From: Prince Canuma <prince.gdt@gmail.com>
Date: Sun, 18 Jan 2026 23:48:32 +0100
Subject: [PATCH 2/3] Let get_model_path accept a local model directory and
 document its arguments and return value.

---
 mlx_video/utils.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/mlx_video/utils.py b/mlx_video/utils.py
index cebbed7..990cde2 100644
--- a/mlx_video/utils.py
+++ b/mlx_video/utils.py
@@ -10,7 +10,19 @@
 from PIL import Image
 
 def get_model_path(model_repo: str):
-    """Get or download LTX-2 model path."""
+    """Get or download model path.
+    
+    Args:
+        model_repo: Either a Hugging Face repo ID (e.g., 'namespace/repo_name')
+                   or a local path to the model directory.
+    
+    Returns:
+        Path to the model directory.
+    """
+    local_path = Path(model_repo)
+    if local_path.exists() and local_path.is_dir():
+        return local_path
+    
     try:
         return Path(snapshot_download(repo_id=model_repo, local_files_only=True))
     except Exception:

From 9f0457c31f8883007898d75f14538e431fa0f625 Mon Sep 17 00:00:00 2001
From: Prince Canuma <prince.gdt@gmail.com>
Date: Mon, 19 Jan 2026 00:29:51 +0100
Subject: [PATCH 3/3] Update model repository defaults

---
 mlx_video/generate.py     | 4 ++--
 mlx_video/generate_av.py  | 4 ++--
 mlx_video/generate_dev.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/mlx_video/generate.py b/mlx_video/generate.py
index 9a72fe9..20dcb0e 100644
--- a/mlx_video/generate.py
+++ b/mlx_video/generate.py
@@ -636,8 +636,8 @@ def main():
     parser.add_argument(
         "--model-repo",
         type=str,
-        default="Lightricks/LTX-2",
-        help="Model repository to use (default: Lightricks/LTX-2)"
+        default="mlx-community/LTX-2-distilled-bf16",
+        help="Model repository to use (default: mlx-community/LTX-2-distilled-bf16)"
     )
     parser.add_argument(
         "--text-encoder-repo",
diff --git a/mlx_video/generate_av.py b/mlx_video/generate_av.py
index e0fb22b..bf241c7 100644
--- a/mlx_video/generate_av.py
+++ b/mlx_video/generate_av.py
@@ -769,8 +769,8 @@ def main():
                         help="Output video path (default: output_av.mp4)")
     parser.add_argument("--output-audio", type=str, default=None,
                         help="Output audio path (default: same as video with .wav)")
-    parser.add_argument("--model-repo", type=str, default="Lightricks/LTX-2",
-                        help="Model repository (default: Lightricks/LTX-2)")
+    parser.add_argument("--model-repo", type=str, default="mlx-community/LTX-2-distilled-bf16",
+                        help="Model repository (default: mlx-community/LTX-2-distilled-bf16)")
     parser.add_argument("--text-encoder-repo", type=str, default=None,
                         help="Text encoder repository")
     parser.add_argument("--verbose", action="store_true",
diff --git a/mlx_video/generate_dev.py b/mlx_video/generate_dev.py
index 791c9ba..6717abd 100644
--- a/mlx_video/generate_dev.py
+++ b/mlx_video/generate_dev.py
@@ -1180,8 +1180,8 @@ def main():
     parser.add_argument(
         "--model-repo",
         type=str,
-        default="Lightricks/LTX-2",
-        help="Model repository to use (default: Lightricks/LTX-2)"
+        default="mlx-community/LTX-2-dev-bf16",
+        help="Model repository to use (default: mlx-community/LTX-2-dev-bf16)"
     )
     parser.add_argument(
         "--text-encoder-repo",