demo_.mp4
This repo contains the code for controlnet module for Wan2.2.
Same approach as controlnet for Wan2.1.
Currently, checkerboard artifacts can be observed in 5B-model inference. This may be corrected in a future release.
You can also use this controlnet through the ComfyUI-WanVideoWrapper.

The latest version of diffusers is required.
You can install it like this:
pip install git+https://github.com/huggingface/diffusers.git

| Model | Processor | Huggingface Link |
|---|---|---|
| TI2V-5B | Tile | Link |
| TI2V-5B | Depth | Link |
| TI2V-5B | Canny | Link |
| TI2V-5B | Hed | Link |
| T2V-A14B | Depth | Link |
| T2V-A14B | Hed | Link |
Clone repo
git clone https://github.com/TheDenk/wan2.2-controlnet.git
cd wan2.2-controlnet

Create venv
python -m venv venv
source venv/bin/activate

Install requirements
pip install -r requirements.txt

python -m inference.cli_demo \
--video_path "resources/bubble.mp4" \
--prompt "Close-up shot with soft lighting, focusing sharply on the lower half of a young woman's face. Her lips are slightly parted as she blows an enormous bubblegum bubble. The bubble is semi-transparent, shimmering gently under the light, and surprisingly contains a miniature aquarium inside, where two orange-and-white goldfish slowly swim, their fins delicately fluttering as if in an aquatic universe. The background is a pure light blue color." \
--controlnet_type "depth" \
--base_model_path Wan-AI/Wan2.2-TI2V-5B-Diffusers \
--controlnet_model_path TheDenk/wan2.2-ti2v-5b-controlnet-depth-v1

python -m inference.cli_demo \
--video_path "resources/bubble.mp4" \
--prompt "Close-up shot with soft lighting, focusing sharply on the lower half of a young woman's face. Her lips are slightly parted as she blows an enormous bubblegum bubble. The bubble is semi-transparent, shimmering gently under the light, and surprisingly contains a miniature aquarium inside, where two orange-and-white goldfish slowly swim, their fins delicately fluttering as if in an aquatic universe. The background is a pure light blue color." \
--controlnet_type "depth" \
--base_model_path Wan-AI/Wan2.2-TI2V-5B-Diffusers \
--controlnet_model_path TheDenk/wan2.2-ti2v-5b-controlnet-depth-v1 \
--controlnet_weight 0.8 \
--controlnet_guidance_start 0.0 \
--controlnet_guidance_end 0.8 \
--controlnet_stride 3 \
--num_inference_steps 50 \
--guidance_scale 5.0 \
--video_height 480 \
--video_width 832 \
--num_frames 121 \
--negative_prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
--seed 42 \
--out_fps 24 \
--output_path "result.mp4" \
--teacache_treshold 0.6

import cv2
import numpy as np
from PIL import Image
def apply_gaussian_blur(image, ksize=5, sigmaX=1.0):
    """Return a blurred copy of a PIL image using cv2.GaussianBlur.

    An even ``ksize`` is bumped to the next odd value, since OpenCV only
    accepts odd Gaussian kernel sizes.
    """
    # Force an odd kernel size before handing off to OpenCV.
    odd_ksize = ksize if ksize % 2 else ksize + 1
    as_array = np.array(image)
    smoothed = cv2.GaussianBlur(as_array, (odd_ksize, odd_ksize), sigmaX=sigmaX)
    return Image.fromarray(smoothed)
ksize = 5
downscale_coef = 4

# Downscale each frame and blur it; this softens the control signal.
# The following line in the snippet resizes the frames back to full resolution.
small_size = (img_w // downscale_coef, img_h // downscale_coef)
controlnet_frames = []
for frame in video_frames:
    shrunk = frame.resize(small_size)
    controlnet_frames.append(apply_gaussian_blur(shrunk, ksize=ksize, sigmaX=ksize // 2))
controlnet_frames = [x.resize((img_w, img_h)) for x in controlnet_frames]

import os
# Pin the process to GPU 0 and silence tokenizer fork warnings.
# NOTE(review): these must be set before `import torch` below so that CUDA
# device enumeration picks them up.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from diffusers.utils import load_video, export_to_video
from diffusers import AutoencoderKLWan, UniPCMultistepScheduler
from controlnet_aux import MidasDetector

# Project-local modules from this repository.
from wan_controlnet import WanControlnet
from wan_transformer import CustomWanTransformer3DModel
from wan_t2v_controlnet_pipeline import WanTextToVideoControlnetPipeline

base_model_path = "Wan-AI/Wan2.2-TI2V-5B-Diffusers"
controlnet_model_path = "TheDenk/wan2.2-ti2v-5b-controlnet-depth-v1"

# The VAE is loaded in float32 while the transformer and controlnet run in
# bfloat16 (mixed precision as written; presumably for decode quality).
vae = AutoencoderKLWan.from_pretrained(base_model_path, subfolder="vae", torch_dtype=torch.float32)
transformer = CustomWanTransformer3DModel.from_pretrained(base_model_path, subfolder="transformer", torch_dtype=torch.bfloat16)
controlnet = WanControlnet.from_pretrained(controlnet_model_path, torch_dtype=torch.bfloat16)

# Assemble the controlnet pipeline around the base Wan2.2 checkpoint.
pipe = WanTextToVideoControlnetPipeline.from_pretrained(
    pretrained_model_name_or_path=base_model_path,
    controlnet=controlnet,
    transformer=transformer,
    vae=vae,
    torch_dtype=torch.bfloat16
)

# UniPC sampler with flow_shift=5.0 — presumably the value recommended for
# this model size; confirm against the repo defaults.
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)
# Offload idle submodules to CPU between calls to reduce peak GPU memory.
pipe.enable_model_cpu_offload()

# Depth annotator used to build the controlnet conditioning frames.
controlnet_processor = MidasDetector.from_pretrained('lllyasviel/Annotators')
# Output resolution and clip length (alternatives: 480x832; 81 or 49 frames).
img_h = 704
img_w = 1280
num_frames = 121

video_path = 'bubble.mp4'

# Load the reference clip, trim to num_frames, and bring every frame to the
# target resolution.
video_frames = []
for frame in load_video(video_path)[:num_frames]:
    video_frames.append(frame.resize((img_w, img_h)))

# Run the depth annotator on each frame to get the conditioning sequence.
controlnet_frames = []
for frame in video_frames:
    controlnet_frames.append(controlnet_processor(frame))

prompt = "Close-up shot with soft lighting, focusing sharply on the lower half of a young woman's face. Her lips are slightly parted as she blows an enormous bubblegum bubble. The bubble is semi-transparent, shimmering gently under the light, and surprisingly contains a miniature aquarium inside, where two orange-and-white goldfish slowly swim, their fins delicately fluttering as if in an aquatic universe. The background is a pure light blue color."
negative_prompt = "bad quality, worst quality"
# Fixed seed for reproducible sampling.
generator = torch.Generator(device="cuda").manual_seed(42)

result = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=img_h,
    width=img_w,
    num_frames=num_frames,
    guidance_scale=5,
    generator=generator,
    output_type="pil",
    controlnet_frames=controlnet_frames,
    controlnet_guidance_start=0.0,  # controlnet active from the first step
    controlnet_guidance_end=0.8,    # ...until 80% of the denoising schedule
    controlnet_weight=0.8,
    teacache_treshold=0.6,  # spelling matches the pipeline's parameter name
)
# First (and only) video in the batch, as a list of PIL frames.
output = result.frames[0]
export_to_video(output, "output.mp4", fps=16)

Original code and models: Wan2.2.
@misc{TheDenk,
title={Wan2.2 Controlnet},
author={Karachev Denis},
url={https://github.com/TheDenk/wan2.2-controlnet},
publisher={Github},
year={2025}
}
Issues should be raised directly in the repository. For professional support and recommendations, please contact welcomedenk@gmail.com.