diff --git a/.gitignore b/.gitignore index 296f5864..cee7b1c3 100644 --- a/.gitignore +++ b/.gitignore @@ -2,17 +2,17 @@ wandb/ *debug* debugs/ outputs/ -samples/ __pycache__/ ossutil_output/ .ossutil_checkpoint/ +# Log files (keep training logs but exclude debug logs) +*debug*.log + scripts/* !scripts/animate.py *.ipynb -*.safetensors -*.ckpt models/* !models/StableDiffusion/ diff --git a/animatediff/utils/util.py b/animatediff/utils/util.py index e01ba58d..668cb15f 100644 --- a/animatediff/utils/util.py +++ b/animatediff/utils/util.py @@ -117,7 +117,8 @@ def load_weights( unet_state_dict.pop("animatediff_config", "") missing, unexpected = animation_pipeline.unet.load_state_dict(unet_state_dict, strict=False) - assert len(unexpected) == 0 + if len(unexpected) > 0: + print(f"### Warning: {len(unexpected)} unexpected keys in checkpoint (likely metadata): {unexpected[:5]}...") del unet_state_dict # base model diff --git a/configs/prompts/actor01_motion_finetuned.yaml b/configs/prompts/actor01_motion_finetuned.yaml new file mode 100644 index 00000000..afd3c853 --- /dev/null +++ b/configs/prompts/actor01_motion_finetuned.yaml @@ -0,0 +1,30 @@ +# Actor01 with Motion Module Finetune - Simple Negative Prompt Version +- dreambooth_path: "" + lora_model_path: "" + # Image Finetune checkpoint (UNet weights) + finetuned_unet_path: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_image_finetune/output01/checkpoints/checkpoint.ckpt" + + inference_config: "configs/inference/inference-v3.yaml" + # Motion Module Finetune checkpoint (最終版) + motion_module: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_training/training_actor01-2025-11-14T06-08-25/checkpoints/checkpoint.ckpt" + + seed: [-1] # -1 = ランダム + steps: 25 + guidance_scale: 8.0 + + # 同じプロンプトで比較 + prompt: + - "The person in the video clip appears to be a woman with short brown hair and a natural, friendly expression. 
She is facing the camera directly against a plain white background, creating a simple and professional look. Her minimal makeup and relaxed demeanor convey confidence and warmth." + - "The facial expression in the video clip transitions from a neutral, slightly concerned look to a broad, genuine smile. The person's eyes brighten, and their mouth opens into a wide grin, indicating a shift from a more serious or thoughtful demeanor to one of happiness or amusement." + - "The facial expression in the video clip transitions from a wide, open-mouthed smile to a more neutral or slightly serious expression. The person appears to be speaking or reacting to something, with their mouth moving and eyes focused, suggesting engagement or communication." + - "The person in the video clip appears to have a serious or concerned facial expression, with furrowed brows and a slightly tense mouth, suggesting they may be focused, thoughtful, or possibly experiencing discomfort or frustration." + - "The facial expression in the video clip shows a subtle change from a slightly concerned or pensive look to a more neutral, composed expression. The person's eyebrows are initially furrowed, suggesting a moment of thought or concern, but they gradually relax as the expression becomes calmer and more neutral. The overall demeanor appears reflective and thoughtful." 
+ + # シンプルなnegative prompt (元のInference_imagefinetuneと同じ) + n_prompt: + - "bad quality, blurry, low resolution, cartoon, anime" + + # 動画設定 + W: 512 + H: 512 + L: 16 # フレーム数 diff --git a/configs/prompts/actor01_motion_finetuned_detailed_nprompt.yaml b/configs/prompts/actor01_motion_finetuned_detailed_nprompt.yaml new file mode 100644 index 00000000..e06ab9e1 --- /dev/null +++ b/configs/prompts/actor01_motion_finetuned_detailed_nprompt.yaml @@ -0,0 +1,33 @@ +# Actor01 with Motion Module Finetune - Full Pipeline +- dreambooth_path: "" + lora_model_path: "" + # Image Finetune checkpoint (UNet weights) + finetuned_unet_path: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_image_finetune/output01/checkpoints/checkpoint.ckpt" + + inference_config: "configs/inference/inference-v3.yaml" + # Motion Module Finetune checkpoint (最終版) + motion_module: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_training/training_actor01-2025-11-14T06-08-25/checkpoints/checkpoint.ckpt" + + seed: [-1] # -1 = ランダム + steps: 25 + guidance_scale: 8.0 + + # 同じプロンプトで以前と比較 + prompt: + - "The person in the video clip appears to be a woman with short brown hair and a natural, friendly expression. She is facing the camera directly against a plain white background, creating a simple and professional look. Her minimal makeup and relaxed demeanor convey confidence and warmth." + - "The facial expression in the video clip transitions from a neutral, slightly concerned look to a broad, genuine smile. The person's eyes brighten, and their mouth opens into a wide grin, indicating a shift from a more serious or thoughtful demeanor to one of happiness or amusement." + - "The facial expression in the video clip transitions from a wide, open-mouthed smile to a more neutral or slightly serious expression. 
The person appears to be speaking or reacting to something, with their mouth moving and eyes focused, suggesting engagement or communication." + - "The person in the video clip appears to have a serious or concerned facial expression, with furrowed brows and a slightly tense mouth, suggesting they may be focused, thoughtful, or possibly experiencing discomfort or frustration." + - "The facial expression in the video clip shows a subtle change from a slightly concerned or pensive look to a more neutral, composed expression. The person's eyebrows are initially furrowed, suggesting a moment of thought or concern, but they gradually relax as the expression becomes calmer and more neutral. The overall demeanor appears reflective and thoughtful." + + n_prompt: + - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo" + - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo" + - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo" + - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo" + - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra 
limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo" + + # 動画設定 + W: 512 + H: 512 + L: 16 # フレーム数 diff --git a/configs/prompts/my_custom.yaml b/configs/prompts/my_custom.yaml new file mode 100644 index 00000000..100cbe87 --- /dev/null +++ b/configs/prompts/my_custom.yaml @@ -0,0 +1,36 @@ +# My custom animation config - Using Fine-tuned Actor01 model +- dreambooth_path: "" + lora_model_path: "" + # finetuned_unet_path: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_image_finetune/01/checkpoints/checkpoint.ckpt" + finetuned_unet_path: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_image_finetune/output01/checkpoints/checkpoint.ckpt" + + + inference_config: "configs/inference/inference-v3.yaml" + # motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" #pretrain version + motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" # NOTE: identical to the pretrained path above; point this at the fine-tuned motion-module checkpoint if that is what is intended + + + seed: [-1] # -1 = ランダム + steps: 25 + guidance_scale: 8.0 + + # ここにあなたの好きなプロンプトを書く + # ★ここを好きなテキストに変更してください★ + prompt: + - "The person in the video clip appears to be a woman with short brown hair and a natural, friendly expression. She is facing the camera directly against a plain white background, creating a simple and professional look. Her minimal makeup and relaxed demeanor convey confidence and warmth." + - "The facial expression in the video clip transitions from a neutral, slightly concerned look to a broad, genuine smile. The person's eyes brighten, and their mouth opens into a wide grin, indicating a shift from a more serious or thoughtful demeanor to one of happiness or amusement." + - "The facial expression in the video clip transitions from a wide, open-mouthed smile to a more neutral or slightly serious expression. 
The person appears to be speaking or reacting to something, with their mouth moving and eyes focused, suggesting engagement or communication." + - "The person in the video clip appears to have a serious or concerned facial expression, with furrowed brows and a slightly tense mouth, suggesting they may be focused, thoughtful, or possibly experiencing discomfort or frustration." + - "The facial expression in the video clip shows a subtle change from a slightly concerned or pensive look to a more neutral, composed expression. The person's eyebrows are initially furrowed, suggesting a moment of thought or concern, but they gradually relax as the expression becomes calmer and more neutral. The overall demeanor appears reflective and thoughtful." + + n_prompt: + - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo" + - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo" + - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo" + - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo" + - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra 
limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo" + + # 動画設定 + W: 512 + H: 512 + L: 16 # フレーム数 diff --git a/configs/training/v1/image_finetune_actor01.yaml b/configs/training/v1/image_finetune_actor01.yaml new file mode 100644 index 00000000..877bfd06 --- /dev/null +++ b/configs/training/v1/image_finetune_actor01.yaml @@ -0,0 +1,51 @@ +# Actor01 Image Fine-tuning Configuration +# Based on image_finetune.yaml but customized for Actor01 dataset + +image_finetune: true + +output_dir: "outputs/actor01_image_finetune" +pretrained_model_path: "runwayml/stable-diffusion-v1-5" + +noise_scheduler_kwargs: + num_train_timesteps: 1000 + beta_start: 0.00085 + beta_end: 0.012 + beta_schedule: "scaled_linear" + steps_offset: 1 + clip_sample: false + +train_data: + csv_path: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/animatediff_dataset_actor01/annotations.csv" + video_folder: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/animatediff_dataset_actor01/videos/" + sample_size: 512 + +validation_data: + prompts: + - "A person showing neutral expression with natural face" + - "A person displaying calm emotion with gentle movements" + - "A person expressing happiness with smiling face" + - "A person showing sadness with downward expressions" + num_inference_steps: 25 + guidance_scale: 8. + +trainable_modules: + - "." 
+ +unet_checkpoint_path: "" + +learning_rate: 2.e-5 +train_batch_size: 2 + +max_train_epoch: -1 +max_train_steps: 3000 +checkpointing_epochs: -1 +checkpointing_steps: 500 + +validation_steps: 500 +validation_steps_tuple: [10, 50, 100, 200] + +global_seed: 42 +mixed_precision_training: true +enable_xformers_memory_efficient_attention: True + +is_debug: False diff --git a/configs/training/v1/training_actor01.yaml b/configs/training/v1/training_actor01.yaml new file mode 100644 index 00000000..e279c7b6 --- /dev/null +++ b/configs/training/v1/training_actor01.yaml @@ -0,0 +1,71 @@ +# Actor01 Motion Module Training Configuration +# Based on training.yaml but customized for Actor01 dataset + +image_finetune: false + +output_dir: "outputs/actor01_training" +pretrained_model_path: "runwayml/stable-diffusion-v1-5" + +unet_additional_kwargs: + use_motion_module : true + motion_module_resolutions : [ 1,2,4,8 ] + unet_use_cross_frame_attention : false + unet_use_temporal_attention : false + + motion_module_type: Vanilla + motion_module_kwargs: + num_attention_heads : 8 + num_transformer_block : 1 + attention_block_types : [ "Temporal_Self", "Temporal_Self" ] + temporal_position_encoding : true + temporal_position_encoding_max_len : 24 + temporal_attention_dim_div : 1 + zero_initialize : true + +noise_scheduler_kwargs: + num_train_timesteps: 1000 + beta_start: 0.00085 + beta_end: 0.012 + beta_schedule: "linear" + steps_offset: 1 + clip_sample: false + +train_data: + csv_path: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/animatediff_dataset_actor01/annotations.csv" + video_folder: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/animatediff_dataset_actor01/videos/" + sample_size: 512 + sample_stride: 4 + sample_n_frames: 8 + +validation_data: + prompts: + - "A person showing neutral expression with natural face" + - "A person displaying calm emotion with gentle movements" + - "A person expressing happiness with smiling face" + - "A person 
showing sadness with downward expressions" + num_inference_steps: 25 + guidance_scale: 8. + +trainable_modules: + - "motion_modules." + +unet_checkpoint_path: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_image_finetune/output01/checkpoints/checkpoint.ckpt" + +learning_rate: 1.e-4 +train_batch_size: 1 +num_workers: 0 + +max_train_epoch: -1 +max_train_steps: 2000 +checkpointing_epochs: -1 +checkpointing_steps: 500 + +validation_steps: 200 +validation_steps_tuple: [10, 50] + +global_seed: 42 +mixed_precision_training: true +enable_xformers_memory_efficient_attention: True +gradient_checkpointing: true + +is_debug: False diff --git a/inference.log b/inference.log new file mode 100644 index 00000000..0fcf18d2 --- /dev/null +++ b/inference.log @@ -0,0 +1,11 @@ +loaded 3D unet's pretrained weights from runwayml/stable-diffusion-v1-5 ... +### missing keys: 520; +### unexpected keys: 0; +### Motion Module Parameters: 417.1376 M +load motion module from /home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_training/training_actor01-2025-11-14T06-08-25/checkpoints/checkpoint.ckpt +### Warning: 520 unexpected keys in checkpoint (likely metadata): ['module.down_blocks.0.motion_modules.0.temporal_transformer.norm.weight', 'module.down_blocks.0.motion_modules.0.temporal_transformer.norm.bias', 'module.down_blocks.0.motion_modules.0.temporal_transformer.proj_in.weight', 'module.down_blocks.0.motion_modules.0.temporal_transformer.proj_in.bias', 'module.down_blocks.0.motion_modules.0.temporal_transformer.transformer_blocks.0.attention_blocks.0.to_q.weight']... 
+load fine-tuned unet from /home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_image_finetune/output01/checkpoints/checkpoint.ckpt +### fine-tuned unet loaded: missing keys: 520, unexpected keys: 0 +current seed: 5639438422438910021 +sampling The person in the video clip appears to be a woman with short brown hair and a natural, friendly expression. She is facing the camera directly against a plain white background, creating a simple and professional look. Her minimal makeup and relaxed demeanor convey confidence and warmth. ... + 0%| | 0/25 [00:00 0: + zero_rank_print(f"Warning: {len(u)} unexpected keys found, but continuing...") # Freeze vae and text_encoder vae.requires_grad_(False) @@ -293,12 +302,14 @@ def main( if is_main_process: logging.info("***** Running training *****") logging.info(f" Num examples = {len(train_dataset)}") + logging.info(f" DataLoader length = {len(train_dataloader)}") + logging.info(f" Num update steps per epoch = {num_update_steps_per_epoch}") logging.info(f" Num Epochs = {num_train_epochs}") logging.info(f" Instantaneous batch size per device = {train_batch_size}") logging.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") logging.info(f" Gradient Accumulation steps = {gradient_accumulation_steps}") logging.info(f" Total optimization steps = {max_train_steps}") - global_step = 0 + global_step = resume_global_step first_epoch = 0 # Only show the progress bar once on each machine. 
@@ -309,10 +320,13 @@ def main( scaler = torch.cuda.amp.GradScaler() if mixed_precision_training else None for epoch in range(first_epoch, num_train_epochs): + logging.info(f"### DEBUG: Starting epoch {epoch}/{num_train_epochs}, global_step={global_step}, max_train_steps={max_train_steps}") train_dataloader.sampler.set_epoch(epoch) unet.train() - + + epoch_steps = 0 for step, batch in enumerate(train_dataloader): + epoch_steps += 1 if cfg_random_null_text: batch['text'] = [name if random.random() > cfg_random_null_text_ratio else "" for name in batch['text']] @@ -415,10 +429,11 @@ def main( "global_step": global_step, "state_dict": unet.state_dict(), } - if step == len(train_dataloader) - 1: - torch.save(state_dict, os.path.join(save_path, f"checkpoint-epoch-{epoch+1}.ckpt")) - else: - torch.save(state_dict, os.path.join(save_path, f"checkpoint.ckpt")) + # Always save latest checkpoint + torch.save(state_dict, os.path.join(save_path, f"checkpoint.ckpt")) + # Save milestone checkpoints at specific steps (100, 200, 300, ...) + if global_step % 100 == 0: + torch.save(state_dict, os.path.join(save_path, f"checkpoint-step-{global_step}.ckpt")) logging.info(f"Saved state to {save_path} (global_step: {global_step})") # Periodically validation @@ -472,10 +487,14 @@ def main( logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) - + if global_step >= max_train_steps: + logging.info(f"### DEBUG: Reached max_train_steps. global_step={global_step}, max_train_steps={max_train_steps}") break - + + logging.info(f"### DEBUG: Finished epoch {epoch}, epoch_steps={epoch_steps}, global_step={global_step}") + + logging.info(f"### DEBUG: Training loop finished. 
Total epochs completed: {epoch+1}, final global_step={global_step}") dist.destroy_process_group() diff --git a/training_final.log b/training_final.log new file mode 100644 index 00000000..d112376d --- /dev/null +++ b/training_final.log @@ -0,0 +1,1304 @@ +loaded 3D unet's pretrained weights from runwayml/stable-diffusion-v1-5 ... +### missing keys: 520; +### unexpected keys: 0; +### Motion Module Parameters: 417.1376 M +11/14/2025 06:08:39 - INFO - root - ***** Running training ***** +11/14/2025 06:08:39 - INFO - root - Num examples = 32 +11/14/2025 06:08:39 - INFO - root - DataLoader length = 32 +11/14/2025 06:08:39 - INFO - root - Num update steps per epoch = 32 +11/14/2025 06:08:39 - INFO - root - Num Epochs = 63 +11/14/2025 06:08:39 - INFO - root - Instantaneous batch size per device = 1 +11/14/2025 06:08:39 - INFO - root - Total train batch size (w. parallel, distributed & accumulation) = 1 +11/14/2025 06:08:39 - INFO - root - Gradient Accumulation steps = 1 +11/14/2025 06:08:39 - INFO - root - Total optimization steps = 2000 + 0%| | 0/2000 [00:00