guoyww · takuya-13068 · Nov 14, 2025 · Nov 14, 2025
diff --git a/.gitignore b/.gitignore
@@ -2,17 +2,17 @@ wandb/
 *debug*
 debugs/
 outputs/
-samples/
 __pycache__/
 ossutil_output/
 .ossutil_checkpoint/
 
+# Log files: keep training logs, ignore debug logs (NOTE: the broader *debug* rule above already matches these)
+*debug*.log
+
 scripts/*
 !scripts/animate.py
 
 *.ipynb
-*.safetensors
-*.ckpt
 
 models/*
 !models/StableDiffusion/

diff --git a/animatediff/utils/util.py b/animatediff/utils/util.py
@@ -117,7 +117,8 @@ def load_weights(
         unet_state_dict.pop("animatediff_config", "")
 
     missing, unexpected = animation_pipeline.unet.load_state_dict(unet_state_dict, strict=False)
-    assert len(unexpected) == 0
+    if len(unexpected) > 0:
+        print(f"### Warning: {len(unexpected)} unexpected keys in checkpoint (likely metadata): {unexpected[:5]}...")
     del unet_state_dict
 
     # base model

diff --git a/configs/prompts/actor01_motion_finetuned.yaml b/configs/prompts/actor01_motion_finetuned.yaml
@@ -0,0 +1,30 @@
+# Actor01 with Motion Module Finetune - Simple Negative Prompt Version
+- dreambooth_path: ""
+  lora_model_path: ""
+  # Image Finetune checkpoint (UNet weights)
+  finetuned_unet_path: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_image_finetune/output01/checkpoints/checkpoint.ckpt"
+
+  inference_config: "configs/inference/inference-v3.yaml"
+  # Motion Module Finetune checkpoint (final version)
+  motion_module: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_training/training_actor01-2025-11-14T06-08-25/checkpoints/checkpoint.ckpt"
+
+  seed: [-1]  # -1 = random
+  steps: 25
+  guidance_scale: 8.0
+
+  # Compare using the same prompts
+  prompt:
+    - "The person in the video clip appears to be a woman with short brown hair and a natural, friendly expression. She is facing the camera directly against a plain white background, creating a simple and professional look. Her minimal makeup and relaxed demeanor convey confidence and warmth."
+    - "The facial expression in the video clip transitions from a neutral, slightly concerned look to a broad, genuine smile. The person's eyes brighten, and their mouth opens into a wide grin, indicating a shift from a more serious or thoughtful demeanor to one of happiness or amusement."
+    - "The facial expression in the video clip transitions from a wide, open-mouthed smile to a more neutral or slightly serious expression. The person appears to be speaking or reacting to something, with their mouth moving and eyes focused, suggesting engagement or communication."
+    - "The person in the video clip appears to have a serious or concerned facial expression, with furrowed brows and a slightly tense mouth, suggesting they may be focused, thoughtful, or possibly experiencing discomfort or frustration."
+    - "The facial expression in the video clip shows a subtle change from a slightly concerned or pensive look to a more neutral, composed expression. The person's eyebrows are initially furrowed, suggesting a moment of thought or concern, but they gradually relax as the expression becomes calmer and more neutral. The overall demeanor appears reflective and thoughtful."
+
+  # Simple negative prompt (same as the original Inference_imagefinetune)
+  n_prompt:
+    - "bad quality, blurry, low resolution, cartoon, anime"
+
+  # Video settings
+  W: 512
+  H: 512
+  L: 16  # number of frames
diff --git a/configs/prompts/actor01_motion_finetuned_detailed_nprompt.yaml b/configs/prompts/actor01_motion_finetuned_detailed_nprompt.yaml
@@ -0,0 +1,33 @@
+# Actor01 with Motion Module Finetune - Full Pipeline
+- dreambooth_path: ""
+  lora_model_path: ""
+  # Image Finetune checkpoint (UNet weights)
+  finetuned_unet_path: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_image_finetune/output01/checkpoints/checkpoint.ckpt"
+
+  inference_config: "configs/inference/inference-v3.yaml"
+  # Motion Module Finetune checkpoint (final version)
+  motion_module: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_training/training_actor01-2025-11-14T06-08-25/checkpoints/checkpoint.ckpt"
+
+  seed: [-1]  # -1 = random
+  steps: 25
+  guidance_scale: 8.0
+
+  # Compare against the previous run using the same prompts
+  prompt:
+    - "The person in the video clip appears to be a woman with short brown hair and a natural, friendly expression. She is facing the camera directly against a plain white background, creating a simple and professional look. Her minimal makeup and relaxed demeanor convey confidence and warmth."
+    - "The facial expression in the video clip transitions from a neutral, slightly concerned look to a broad, genuine smile. The person's eyes brighten, and their mouth opens into a wide grin, indicating a shift from a more serious or thoughtful demeanor to one of happiness or amusement."
+    - "The facial expression in the video clip transitions from a wide, open-mouthed smile to a more neutral or slightly serious expression. The person appears to be speaking or reacting to something, with their mouth moving and eyes focused, suggesting engagement or communication."
+    - "The person in the video clip appears to have a serious or concerned facial expression, with furrowed brows and a slightly tense mouth, suggesting they may be focused, thoughtful, or possibly experiencing discomfort or frustration."
+    - "The facial expression in the video clip shows a subtle change from a slightly concerned or pensive look to a more neutral, composed expression. The person's eyebrows are initially furrowed, suggesting a moment of thought or concern, but they gradually relax as the expression becomes calmer and more neutral. The overall demeanor appears reflective and thoughtful."
+
+  n_prompt:
+    - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo"
+    - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo"
+    - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo"
+    - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo"
+    - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo"
+
+  # Video settings
+  W: 512
+  H: 512
+  L: 16  # number of frames
diff --git a/configs/prompts/my_custom.yaml b/configs/prompts/my_custom.yaml
@@ -0,0 +1,36 @@
+# My custom animation config - Using Fine-tuned Actor01 model
+- dreambooth_path: ""
+  lora_model_path: ""
+  # finetuned_unet_path: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_image_finetune/01/checkpoints/checkpoint.ckpt"
+  finetuned_unet_path: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_image_finetune/output01/checkpoints/checkpoint.ckpt"
+
+
+  inference_config: "configs/inference/inference-v3.yaml"
+  # motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" #pretrain version
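+  motion_module: "models/Motion_Module/v3_sd15_mm.ckpt" # NOTE(review): same path as the pretrained module above; point this at the fine-tuned checkpoint to use the finetune version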
+
+
+  seed: [-1]  # -1 = random
+  steps: 25
+  guidance_scale: 8.0
+
+  # Write your own prompts here
+  # ★ Change the text below to whatever you like ★
+  prompt:
+    - "The person in the video clip appears to be a woman with short brown hair and a natural, friendly expression. She is facing the camera directly against a plain white background, creating a simple and professional look. Her minimal makeup and relaxed demeanor convey confidence and warmth."
+    - "The facial expression in the video clip transitions from a neutral, slightly concerned look to a broad, genuine smile. The person's eyes brighten, and their mouth opens into a wide grin, indicating a shift from a more serious or thoughtful demeanor to one of happiness or amusement."
+    - "The facial expression in the video clip transitions from a wide, open-mouthed smile to a more neutral or slightly serious expression. The person appears to be speaking or reacting to something, with their mouth moving and eyes focused, suggesting engagement or communication."
+    - "The person in the video clip appears to have a serious or concerned facial expression, with furrowed brows and a slightly tense mouth, suggesting they may be focused, thoughtful, or possibly experiencing discomfort or frustration."
+    - "The facial expression in the video clip shows a subtle change from a slightly concerned or pensive look to a more neutral, composed expression. The person's eyebrows are initially furrowed, suggesting a moment of thought or concern, but they gradually relax as the expression becomes calmer and more neutral. The overall demeanor appears reflective and thoughtful."
+
+  n_prompt:
+    - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo"
+    - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo"
+    - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo"
+    - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo"
+    - "bad quality, blurry, low resolution, cartoon, anime, distorted face, deformed, disfigured, extra limbs, extra fingers, poorly drawn hands, poorly drawn face, mutation, ugly, bad anatomy, bad proportions, cropped, worst quality, jpeg artifacts, watermark, signature, text, logo"
+
+  # Video settings
+  W: 512
+  H: 512
+  L: 16  # number of frames
diff --git a/configs/training/v1/image_finetune_actor01.yaml b/configs/training/v1/image_finetune_actor01.yaml
@@ -0,0 +1,51 @@
+# Actor01 Image Fine-tuning Configuration
+# Based on image_finetune.yaml but customized for Actor01 dataset
+
+image_finetune: true
+
+output_dir: "outputs/actor01_image_finetune"
+pretrained_model_path: "runwayml/stable-diffusion-v1-5"
+
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+  beta_start:          0.00085
+  beta_end:            0.012
+  beta_schedule:       "scaled_linear"
+  steps_offset:        1
+  clip_sample:         false
+
+train_data:
+  csv_path:     "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/animatediff_dataset_actor01/annotations.csv"
+  video_folder: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/animatediff_dataset_actor01/videos/"
+  sample_size:  512
+
+validation_data:
+  prompts:
+    - "A person showing neutral expression with natural face"
+    - "A person displaying calm emotion with gentle movements"
+    - "A person expressing happiness with smiling face"
+    - "A person showing sadness with downward expressions"
+  num_inference_steps: 25
+  guidance_scale: 8.
+
+trainable_modules:
+  - "."
+
+unet_checkpoint_path: ""
+
+learning_rate:    2.e-5
+train_batch_size: 2
+
+max_train_epoch:      -1
+max_train_steps:      3000
+checkpointing_epochs: -1
+checkpointing_steps:  500
+
+validation_steps:       500
+validation_steps_tuple: [10, 50, 100, 200]
+
+global_seed: 42
+mixed_precision_training: true
+enable_xformers_memory_efficient_attention: True
+
+is_debug: False
diff --git a/configs/training/v1/training_actor01.yaml b/configs/training/v1/training_actor01.yaml
@@ -0,0 +1,71 @@
+# Actor01 Motion Module Training Configuration
+# Based on training.yaml but customized for Actor01 dataset
+
+image_finetune: false
+
+output_dir: "outputs/actor01_training"
+pretrained_model_path: "runwayml/stable-diffusion-v1-5"
+
+unet_additional_kwargs:
+  use_motion_module              : true
+  motion_module_resolutions      : [ 1,2,4,8 ]
+  unet_use_cross_frame_attention : false
+  unet_use_temporal_attention    : false
+
+  motion_module_type: Vanilla
+  motion_module_kwargs:
+    num_attention_heads                : 8
+    num_transformer_block              : 1
+    attention_block_types              : [ "Temporal_Self", "Temporal_Self" ]
+    temporal_position_encoding         : true
+    temporal_position_encoding_max_len : 24
+    temporal_attention_dim_div         : 1
+    zero_initialize                    : true
+
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+  beta_start:          0.00085
+  beta_end:            0.012
+  beta_schedule:       "linear"
+  steps_offset:        1
+  clip_sample:         false
+
+train_data:
+  csv_path:        "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/animatediff_dataset_actor01/annotations.csv"
+  video_folder:    "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/animatediff_dataset_actor01/videos/"
+  sample_size:     512
+  sample_stride:   4
+  sample_n_frames: 8
+
+validation_data:
+  prompts:
+    - "A person showing neutral expression with natural face"
+    - "A person displaying calm emotion with gentle movements"
+    - "A person expressing happiness with smiling face"
+    - "A person showing sadness with downward expressions"
+  num_inference_steps: 25
+  guidance_scale: 8.
+
+trainable_modules:
+  - "motion_modules."
+
+unet_checkpoint_path: "/home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_image_finetune/output01/checkpoints/checkpoint.ckpt"
+
+learning_rate:    1.e-4
+train_batch_size: 1
+num_workers:      0
+
+max_train_epoch:      -1
+max_train_steps:      2000
+checkpointing_epochs: -1
+checkpointing_steps:  500
+
+validation_steps:       200
+validation_steps_tuple: [10, 50]
+
+global_seed: 42
+mixed_precision_training: true
+enable_xformers_memory_efficient_attention: True
+gradient_checkpointing: true
+
+is_debug: False
diff --git a/inference.log b/inference.log
@@ -0,0 +1,11 @@
+loaded 3D unet's pretrained weights from runwayml/stable-diffusion-v1-5 ...
+### missing keys: 520; 
+### unexpected keys: 0;
+### Motion Module Parameters: 417.1376 M
+load motion module from /home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_training/training_actor01-2025-11-14T06-08-25/checkpoints/checkpoint.ckpt
+### Warning: 520 unexpected keys in checkpoint (likely metadata): ['module.down_blocks.0.motion_modules.0.temporal_transformer.norm.weight', 'module.down_blocks.0.motion_modules.0.temporal_transformer.norm.bias', 'module.down_blocks.0.motion_modules.0.temporal_transformer.proj_in.weight', 'module.down_blocks.0.motion_modules.0.temporal_transformer.proj_in.bias', 'module.down_blocks.0.motion_modules.0.temporal_transformer.transformer_blocks.0.attention_blocks.0.to_q.weight']...
+load fine-tuned unet from /home/takahashit/FastStorage/brain2024/style_transfer/VAE_encoder/AnimateDiff/outputs/actor01_image_finetune/output01/checkpoints/checkpoint.ckpt
+### fine-tuned unet loaded: missing keys: 520, unexpected keys: 0
+current seed: 5639438422438910021
+sampling The person in the video clip appears to be a woman with short brown hair and a natural, friendly expression. She is facing the camera directly against a plain white background, creating a simple and professional look. Her minimal makeup and relaxed demeanor convey confidence and warmth. ...
+  0%|          | 0/25 [00:00<?, ?it/s]  4%|▍         | 1/25 [00:01<00:32,  1.37s/it]  8%|▊         | 2/25 [00:02<00:29,  1.28s/it] 12%|█▏        | 3/25 [00:03<00:27,  1.24s/it] 16%|█▌        | 4/25 [00:04<00:25,  1.23s/it] 20%|██        | 5/25 [00:06<00:24,  1.22s/it]

diff --git a/models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors b/models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors
@@ -0,0 +1 @@
+../../../../../../../../../home/takahashit/.cache/huggingface/hub/models--guoyww--animatediff_t2i_backups/blobs/15012c538f503ce2ebfc2c8547b268c75ccdaff7a281db55399940ff1d70e21d
diff --git a/models/MotionLoRA/v2_lora_PanLeft.ckpt b/models/MotionLoRA/v2_lora_PanLeft.ckpt
@@ -0,0 +1 @@
+../../../../../../../../../home/takahashit/.cache/huggingface/hub/models--guoyww--animatediff/blobs/ed79025f8bea018c8925f43b6304a27e462335b6ec5e6f8a222c2726153844b3
diff --git a/models/MotionLoRA/v2_lora_PanRight.ckpt b/models/MotionLoRA/v2_lora_PanRight.ckpt
@@ -0,0 +1 @@
+../../../../../../../../../home/takahashit/.cache/huggingface/hub/models--guoyww--animatediff/blobs/f4eb9154623c628c76dbd83109f125617c985490fec36ddca5464eb61ac7f6d5
diff --git a/models/MotionLoRA/v2_lora_RollingAnticlockwise.ckpt b/models/MotionLoRA/v2_lora_RollingAnticlockwise.ckpt
@@ -0,0 +1 @@
+../../../../../../../../../home/takahashit/.cache/huggingface/hub/models--guoyww--animatediff/blobs/7ae6cbc81044895243bba9a64df9666db763a52acfd8e496c490af84e812748a
diff --git a/models/MotionLoRA/v2_lora_RollingClockwise.ckpt b/models/MotionLoRA/v2_lora_RollingClockwise.ckpt
@@ -0,0 +1 @@
+../../../../../../../../../home/takahashit/.cache/huggingface/hub/models--guoyww--animatediff/blobs/361b1af8500d7fd09c2f884fac5dc0397a4323bae8fab5233443de5383d13630
diff --git a/models/MotionLoRA/v2_lora_TiltDown.ckpt b/models/MotionLoRA/v2_lora_TiltDown.ckpt
@@ -0,0 +1 @@
+../../../../../../../../../home/takahashit/.cache/huggingface/hub/models--guoyww--animatediff/blobs/09e4d5448aba4ea51b3bcd4b5d2b058ed4b47bb72d94d8c05a3ccce3368db6d9
diff --git a/models/MotionLoRA/v2_lora_TiltUp.ckpt b/models/MotionLoRA/v2_lora_TiltUp.ckpt
@@ -0,0 +1 @@
+../../../../../../../../../home/takahashit/.cache/huggingface/hub/models--guoyww--animatediff/blobs/c0ee2f181fc69d7fe26e013ad5cfea11f25cb9f5e8fded3c9942b61803cd6c3d
diff --git a/models/MotionLoRA/v2_lora_ZoomIn.ckpt b/models/MotionLoRA/v2_lora_ZoomIn.ckpt
@@ -0,0 +1 @@
+../../../../../../../../../home/takahashit/.cache/huggingface/hub/models--guoyww--animatediff/blobs/70ce8b9057b173b9249c48aca5d66c8aa1d8aaa040fda394e50e37f3e278195e
diff --git a/models/MotionLoRA/v2_lora_ZoomOut.ckpt b/models/MotionLoRA/v2_lora_ZoomOut.ckpt
@@ -0,0 +1 @@
+../../../../../../../../../home/takahashit/.cache/huggingface/hub/models--guoyww--animatediff/blobs/4172fb2d36410ef638ae0e29d604b66c11ee44b94db9c7cc5ee34d7f865c55d9
diff --git a/models/Motion_Module/mm_sd_v15_v2.ckpt b/models/Motion_Module/mm_sd_v15_v2.ckpt
@@ -0,0 +1 @@
+../../../../../../../../../home/takahashit/.cache/huggingface/hub/models--guoyww--animatediff/blobs/69ed0f5fef82b110aca51bcab73b21104242bc65d6ab4b8b2a2a94d31cad1bf0
diff --git a/models/Motion_Module/v3_sd15_adapter.ckpt b/models/Motion_Module/v3_sd15_adapter.ckpt
@@ -0,0 +1 @@
+../../../../../../../../../home/takahashit/.cache/huggingface/hub/models--guoyww--animatediff/blobs/fd2d8e26480f6ab013c1e6af86fdf1dedbb1ed5baf850ccd5f365f39d6c3472c
diff --git a/models/Motion_Module/v3_sd15_mm.ckpt b/models/Motion_Module/v3_sd15_mm.ckpt
@@ -0,0 +1 @@
+../../../../../../../../../home/takahashit/.cache/huggingface/hub/models--guoyww--animatediff/blobs/2412711886f61091846f53204aabc38aa6e09356d62a9808abe4daa802168343
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		../../../../../../../../../home/takahashit/.cache/huggingface/hub/models--guoyww--animatediff_t2i_backups/blobs/15012c538f503ce2ebfc2c8547b268c75ccdaff7a281db55399940ff1d70e21d