diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..017d29a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,37 @@
+# 不纳入版本控制：模型与测评结果（仅保留核心代码）
+models/
+results/
+
+# 日志与临时
+logs/
+*.log
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+*.egg-info/
+*.egg
+.eggs/
+dist/
+build/
+
+# 虚拟环境 / conda
+.venv/
+venv/
+env/
+
+# IDE / 编辑
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Jupyter
+.ipynb_checkpoints/
+
+# 系统
+.DS_Store
+Thumbs.db
diff --git a/evaluation/robotwin/eval_polict_client_openpi.py b/evaluation/robotwin/eval_polict_client_openpi.py
index 7ed5265..01cda25 100644
--- a/evaluation/robotwin/eval_polict_client_openpi.py
+++ b/evaluation/robotwin/eval_polict_client_openpi.py
@@ -1,12 +1,13 @@
 import sys
 import os
 import subprocess
+import time
 import matplotlib.pyplot as plt
 from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
 import cv2
 from pathlib import Path
 
-robowin_root = Path("/path/to/your/robowin")
+robowin_root = Path(os.environ.get("ROBOTWIN_ROOT", "/path/to/your/robowin"))
 if str(robowin_root) not in sys.path:
     sys.path.insert(0, str(robowin_root))
 
@@ -205,36 +206,42 @@ def save_comparison_video(real_obs_list, imagined_video, action_history, save_pa
     
     print(f"Saving video: Real {n_real} frames, Imagined {n_imagined} frames...")
 
-    final_frames = []
+    # 首帧确定统一尺寸，保证整段视频每帧一致，避免 imageio "All images should have same size"
+    obs0 = real_obs_list[0]
+    base_h = obs0["observation.images.cam_high"].shape[0]
+
+    def resize_h(img, h):
+        if img.shape[0] != h:
+            w = int(img.shape[1] * h / img.shape[0])
+            img = cv2.resize(img, (w, h))
+        img = np.ascontiguousarray(img)
+        if img.dtype != np.uint8:
+            img = (img * 255).astype(np.uint8)
+        return img
+
+    # Real 行为左-中-右：High | Left wrist | Right wrist，与 obs_cam_keys 顺序一致
+    part_high_0 = resize_h(obs0["observation.images.cam_high"], base_h)
+    part_left_0 = resize_h(obs0["observation.images.cam_left_wrist"], base_h)
+    part_right_0 = resize_h(obs0["observation.images.cam_right_wrist"], base_h)
+    w_high, w_left, w_right = part_high_0.shape[1], part_left_0.shape[1], part_right_0.shape[1]
+    row_real0 = np.hstack([part_high_0, part_left_0, part_right_0])
+    row_real0 = add_title_bar(row_real0, "Real Observation (High / Left / Right)")
+    target_width = row_real0.shape[1]
+    real_row_h = row_real0.shape[0]
+    imagined_row_h = 256
 
+    final_frames = []
     for i in range(n_frames):
         obs = real_obs_list[i]
-        cam_high = obs["observation.images.cam_high"]
-        cam_left = obs["observation.images.cam_left_wrist"]
-        cam_right = obs["observation.images.cam_right_wrist"]
-
-        base_h = cam_high.shape[0]
-        
-        def resize_h(img, h):
-            if img.shape[0] != h:
-                w = int(img.shape[1] * h / img.shape[0])
-                img = cv2.resize(img, (w, h))
-            img = np.ascontiguousarray(img)
-            if img.dtype != np.uint8:
-                img = (img * 255).astype(np.uint8)
-            return img
-
         row_real = np.hstack([
-            resize_h(cam_high, base_h), 
-            resize_h(cam_left, base_h), 
-            resize_h(cam_right, base_h)
+            resize_h(obs["observation.images.cam_high"], base_h),
+            resize_h(obs["observation.images.cam_left_wrist"], base_h),
+            resize_h(obs["observation.images.cam_right_wrist"], base_h),
         ])
-        
         row_real = np.ascontiguousarray(row_real)
-
         row_real = add_title_bar(row_real, "Real Observation (High / Left / Right)")
-
-        target_width = row_real.shape[1]
+        if row_real.shape[1] != target_width or row_real.shape[0] != real_row_h:
+            row_real = cv2.resize(row_real, (target_width, real_row_h))
 
         if imagined_video is not None and i < n_imagined:
             img_frame = imagined_video[i]
@@ -242,20 +249,33 @@ def resize_h(img, h):
                 img_frame = (img_frame * 255).astype(np.uint8)
             elif img_frame.dtype != np.uint8:
                 img_frame = img_frame.astype(np.uint8)
-
-            h = int(img_frame.shape[0] * target_width / img_frame.shape[1])
-            row_imagined = cv2.resize(img_frame, (target_width, h))
+            # 与 real 一致：左-中-右 = High | Left wrist | Right wrist（模型输出顺序与 obs_cam_keys 一致）
+            H_im, W_im = img_frame.shape[0], img_frame.shape[1]
+            if W_im >= 3:
+                third = W_im // 3
+                im_high = cv2.resize(img_frame[:, 0:third], (w_high, imagined_row_h))
+                im_left = cv2.resize(img_frame[:, third : 2 * third], (w_left, imagined_row_h))
+                im_right = cv2.resize(img_frame[:, 2 * third :], (w_right, imagined_row_h))
+                row_imagined = np.hstack([im_high, im_left, im_right])
+            else:
+                row_imagined = cv2.resize(img_frame, (target_width, imagined_row_h))
         else:
-            row_imagined = np.zeros((300, target_width, 3), dtype=np.uint8)
-            cv2.putText(row_imagined, "Coming soon", (target_width//2 - 100, 150), 
+            row_imagined = np.zeros((imagined_row_h, target_width, 3), dtype=np.uint8)
+            cv2.putText(row_imagined, "Coming soon", (target_width//2 - 100, imagined_row_h//2 - 20),
                         cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 100, 100), 2)
 
         row_imagined = np.ascontiguousarray(row_imagined)
-        row_imagined = add_title_bar(row_imagined, "Imagined Video Stream")
+        row_imagined = add_title_bar(row_imagined, "Imagined Video (High / Left / Right)")
         full_frame = np.vstack([row_real, row_imagined])
         full_frame = np.ascontiguousarray(full_frame)
         final_frames.append(full_frame)
 
+    # 统一为第一帧尺寸，防止 add_title_bar 等导致细微差异
+    ref_h, ref_w = final_frames[0].shape[0], final_frames[0].shape[1]
+    for idx in range(len(final_frames)):
+        if final_frames[idx].shape[0] != ref_h or final_frames[idx].shape[1] != ref_w:
+            final_frames[idx] = cv2.resize(final_frames[idx], (ref_w, ref_h))
+
     imageio.mimsave(save_path, final_frames, fps=fps)
     print(f"Combined video saved to: {save_path}")
 
@@ -305,6 +325,7 @@ def main(usr_args):
     policy_name = usr_args["policy_name"]
     video_guidance_scale = usr_args["video_guidance_scale"]
     action_guidance_scale = usr_args["action_guidance_scale"]
+    save_visualization = bool(usr_args.get("save_visualization", True))
     instruction_type = 'seen'
     save_dir = None
     video_save_dir = None
@@ -397,7 +418,7 @@ def get_embodiment_file(embodiment_type):
     test_num = usr_args["test_num"]
 
     
-    model = WebsocketClientPolicy(port=usr_args['port'])
+    model = WebsocketClientPolicy(host="127.0.0.1", port=usr_args['port'])
 
     st_seed, suc_num = eval_policy(task_name,
                                    TASK_ENV,
@@ -407,7 +428,7 @@ def get_embodiment_file(embodiment_type):
                                    test_num=test_num,
                                    video_size=video_size,
                                    instruction_type=instruction_type,
-                                   save_visualization=True,
+                                   save_visualization=save_visualization,
                                    video_guidance_scale=video_guidance_scale,
                                    action_guidance_scale=action_guidance_scale)
     suc_nums.append(suc_num)
@@ -462,7 +483,7 @@ def eval_policy(task_name,
     now_id = 0
     succ_seed = 0
     suc_test_seed_list = []
-
+    all_trajectory_timings = []  # 每条 trajectory 的耗时汇总，用于最后样本级统计
 
     now_seed = st_seed
     clear_cache_freq = args["clear_cache_freq"]
@@ -545,6 +566,7 @@ def eval_policy(task_name,
         full_obs_list = []
         gen_video_list = []
         full_action_history = []
+        trajectory_timings = []  # 当前 trajectory 每次 infer 的 timing
 
         initial_obs = TASK_ENV.get_obs() 
         inint_eef_pose = initial_obs['endpose']['left_endpose'] + \
@@ -561,16 +583,21 @@ def eval_policy(task_name,
                 first_obs = format_obs(observation, prompt)
 
             ret = model.infer(dict(obs=first_obs, prompt=prompt, save_visualization=save_visualization, video_guidance_scale=video_guidance_scale, action_guidance_scale=action_guidance_scale)) #(TASK_ENV, model, observation)
+            if 'timing' in ret:
+                trajectory_timings.append(ret['timing'])
             action = ret['action']
             if 'video' in ret:
                 imagined_video = ret['video']
                 gen_video_list.append(imagined_video)
+                print(f"  [eval] received predicted video chunk, shape {getattr(imagined_video, 'shape', '?')}")
             key_frame_list = []
 
             assert action.shape[2] % 4 == 0
             action_per_frame = action.shape[2] // 4
 
             start_idx = 1 if first else 0
+            t0_env_step = time.perf_counter()
+            env_step_count = 0
             for i in range(start_idx, action.shape[1]):
                 for j in range(action.shape[2]):
                     raw_action_step = action[:, i, j].flatten() 
@@ -597,20 +624,58 @@ def eval_policy(task_name,
                     else:
                         raise NotImplementedError
                     TASK_ENV.take_action(ee_action, action_type='ee')
+                    env_step_count += 1
                    
                     if (j+1) % action_per_frame == 0:
                         obs = format_obs(TASK_ENV.get_obs(), prompt)
                         full_obs_list.append(obs)
                         key_frame_list.append(obs)
+            trajectory_timings.append(
+                dict(env_step_update=(time.perf_counter() - t0_env_step), env_step_count=env_step_count)
+            )
                     
             first = False
 
-            model.infer(dict(obs = key_frame_list, compute_kv_cache=True, imagine=False, save_visualization=save_visualization, state=action))
-  
+            ret_kv = model.infer(dict(obs = key_frame_list, compute_kv_cache=True, imagine=False, save_visualization=save_visualization, state=action))
+            if 'timing' in ret_kv:
+                trajectory_timings.append(ret_kv['timing'])
+
             if TASK_ENV.eval_success:
                 succ = True
                 break
-      
+
+        # 当前 trajectory 耗时汇总与占比（以 trajectory 为单位输出）
+        if trajectory_timings:
+            keys = ['encode_obs', 'video_denoise', 'action_denoise', 'kv_cache', 'env_step_update', 'other']
+            summed = {k: sum(t.get(k, 0.0) for t in trajectory_timings) for k in keys}
+            total = sum(summed.values()) or 1e-9
+            pct = {k: 100.0 * summed[k] / total for k in keys}
+            total_env_steps = int(sum(t.get('env_step_count', 0) for t in trajectory_timings))
+            avg_env_step = summed['env_step_update'] / max(total_env_steps, 1)
+            print(f"\033[90m[Trajectory {TASK_ENV.test_num + 1}] 耗时(秒): encode_obs={summed['encode_obs']:.2f}, video_denoise={summed['video_denoise']:.2f}, action_denoise={summed['action_denoise']:.2f}, kv_cache={summed['kv_cache']:.2f}, env_step_update={summed['env_step_update']:.2f}, other={summed['other']:.2f} | 占比(%): encode_obs={pct['encode_obs']:.1f}%, video_denoise={pct['video_denoise']:.1f}%, action_denoise={pct['action_denoise']:.1f}%, kv_cache={pct['kv_cache']:.1f}%, env_step_update={pct['env_step_update']:.1f}%, other={pct['other']:.1f}% | env_step均值={avg_env_step*1000:.1f}ms ({total_env_steps} steps)\033[0m")
+            detail_keys = [
+                'encode_obs_cpu_preprocess',
+                'encode_obs_to_vae_device',
+                'encode_obs_vae_encode',
+                'encode_obs_latent_postprocess',
+            ]
+            detail_summed = {k: sum(t.get(k, 0.0) for t in trajectory_timings) for k in detail_keys}
+            encode_total = summed['encode_obs'] or 1e-9
+            detail_pct = {k: 100.0 * detail_summed[k] / encode_total for k in detail_keys}
+            print(
+                "\033[90m"
+                f"  └─ encode_obs细分(秒): cpu_preprocess={detail_summed['encode_obs_cpu_preprocess']:.2f}, "
+                f"to_vae_device={detail_summed['encode_obs_to_vae_device']:.2f}, "
+                f"vae_encode={detail_summed['encode_obs_vae_encode']:.2f}, "
+                f"latent_postprocess={detail_summed['encode_obs_latent_postprocess']:.2f} "
+                f"| 占encode_obs(%): cpu_preprocess={detail_pct['encode_obs_cpu_preprocess']:.1f}%, "
+                f"to_vae_device={detail_pct['encode_obs_to_vae_device']:.1f}%, "
+                f"vae_encode={detail_pct['encode_obs_vae_encode']:.1f}%, "
+                f"latent_postprocess={detail_pct['encode_obs_latent_postprocess']:.1f}%"
+                "\033[0m"
+            )
+            summed['env_step_count'] = total_env_steps
+            all_trajectory_timings.append(summed)
 
         vis_dir = Path(args['save_root']) / f'stseed-{st_seed}' / 'visualization' / task_name
         vis_dir.mkdir(parents=True, exist_ok=True)
@@ -618,7 +683,7 @@ def eval_policy(task_name,
         out_img_file = vis_dir / video_name
         save_comparison_video(
             real_obs_list=full_obs_list,
-            imagined_video=None, #gen_video_list,
+            imagined_video=gen_video_list if gen_video_list else None,
             action_history=full_action_history,
             save_path=str(out_img_file),
             fps=15 # Suggest adjusting fps based on simulation step
@@ -655,6 +720,25 @@ def eval_policy(task_name,
         )
         now_seed += 1
 
+    # 以样本为单位输出时间占比统计
+    if all_trajectory_timings:
+        keys = ['encode_obs', 'video_denoise', 'action_denoise', 'kv_cache', 'env_step_update', 'other']
+        total_summed = {k: sum(t.get(k, 0.0) for t in all_trajectory_timings) for k in keys}
+        total_sec = sum(total_summed.values()) or 1e-9
+        total_pct = {k: 100.0 * total_summed[k] / total_sec for k in keys}
+        n_samples = len(all_trajectory_timings)
+        total_env_steps = int(sum(t.get('env_step_count', 0) for t in all_trajectory_timings))
+        avg_env_step = total_summed['env_step_update'] / max(total_env_steps, 1)
+        print("\n\033[97m======== 样本级时间占比统计 ({} 条 trajectory) ========\033[0m".format(n_samples))
+        print("\033[97m总耗时(秒): encode_obs={:.2f}, video_denoise={:.2f}, action_denoise={:.2f}, kv_cache={:.2f}, env_step_update={:.2f}, other={:.2f}\033[0m".format(
+            total_summed['encode_obs'], total_summed['video_denoise'], total_summed['action_denoise'],
+            total_summed['kv_cache'], total_summed['env_step_update'], total_summed['other']))
+        print("\033[97m占比(%): encode_obs={:.1f}%, video_denoise={:.1f}%, action_denoise={:.1f}%, kv_cache={:.1f}%, env_step_update={:.1f}%, other={:.1f}%\033[0m".format(
+            total_pct['encode_obs'], total_pct['video_denoise'], total_pct['action_denoise'],
+            total_pct['kv_cache'], total_pct['env_step_update'], total_pct['other']))
+        print("\033[97menv_step 平均耗时: {:.1f} ms/step ({} steps)\033[0m".format(avg_env_step * 1000.0, total_env_steps))
+        print("\033[97m========================================================\033[0m\n")
+
     return now_seed, TASK_ENV.suc
 
 
@@ -667,6 +751,8 @@ def parse_args_and_config():
     parser.add_argument("--video_guidance_scale", type=float, default=5.0)
     parser.add_argument("--action_guidance_scale", type=float, default=5.0)
     parser.add_argument("--test_num", type=int, default=100)
+    parser.add_argument("--save_visualization", type=lambda x: str(x).lower() not in ('0', 'false', 'no', 'off'), default=True,
+                        help='是否渲染并保存预测视频（VAE 解码+对比视频），关闭可显著加速。传 0 或 false 关闭。')
     args = parser.parse_args()
 
     with open(args.config, "r", encoding="utf-8") as f:
diff --git a/example/robotwin/README.txt b/example/robotwin/README.txt
new file mode 100644
index 0000000..868b1f5
--- /dev/null
+++ b/example/robotwin/README.txt
@@ -0,0 +1,11 @@
+# 本目录用于 Image-to-Video-Action (i2va) 推理的「首帧图像」输入。
+# 使用 robotwin_i2av 配置时，需要以下 3 个 PNG 文件（与 obs_cam_keys 对应）：
+#
+#   observation.images.cam_high.png
+#   observation.images.cam_left_wrist.png
+#   observation.images.cam_right_wrist.png
+#
+# 图像尺寸会被代码自动 resize（如 256x320），用任意尺寸的 RGB 图即可。
+#
+# 生成占位图（便于先跑通流程）：
+#   python create_dummy_images.py
diff --git a/example/robotwin/create_dummy_images.py b/example/robotwin/create_dummy_images.py
new file mode 100644
index 0000000..a50fc35
--- /dev/null
+++ b/example/robotwin/create_dummy_images.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+"""生成 robotwin i2va 所需的占位首帧图像，便于先跑通推理流程。"""
+import os
+
+try:
+    from PIL import Image
+    import numpy as np
+except ImportError:
+    print("请先安装: pip install Pillow numpy")
+    raise
+
+# 与 va_robotwin_cfg.obs_cam_keys 一致
+OBS_CAM_KEYS = [
+    "observation.images.cam_high",
+    "observation.images.cam_left_wrist",
+    "observation.images.cam_right_wrist",
+]
+# robotwin 主视角尺寸
+HEIGHT, WIDTH = 256, 320
+
+def main():  # Write one placeholder PNG per entry of OBS_CAM_KEYS into this script's own directory.
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    os.makedirs(script_dir, exist_ok=True)  # Normally a no-op: this is the directory the script itself lives in.
+    for i, key in enumerate(OBS_CAM_KEYS):
+        # Solid-colour image (not a gradient): each channel gets one constant that depends on the camera index i; non-zero values avoid an all-zero input.
+        arr = np.zeros((HEIGHT, WIDTH, 3), dtype=np.uint8)
+        arr[:, :, 0] = 30 + i * 60
+        arr[:, :, 1] = 60 + i * 40
+        arr[:, :, 2] = 90 + i * 30
+        path = os.path.join(script_dir, f"{key}.png")  # File name is the camera key plus ".png".
+        Image.fromarray(arr).save(path)
+        print(f"Written: {path}")
+    print("Done. 可用 script/run_i2va_single_gpu.sh 跑 i2va 推理。")
+
+if __name__ == "__main__":
+    main()
diff --git a/example/robotwin/observation.images.cam_high.png b/example/robotwin/observation.images.cam_high.png
index 7546ec3..b493486 100644
Binary files a/example/robotwin/observation.images.cam_high.png and b/example/robotwin/observation.images.cam_high.png differ
diff --git a/example/robotwin/observation.images.cam_left_wrist.png b/example/robotwin/observation.images.cam_left_wrist.png
index 68323f3..1e19e14 100644
Binary files a/example/robotwin/observation.images.cam_left_wrist.png and b/example/robotwin/observation.images.cam_left_wrist.png differ
diff --git a/example/robotwin/observation.images.cam_right_wrist.png b/example/robotwin/observation.images.cam_right_wrist.png
index 1f7008b..edbda85 100644
Binary files a/example/robotwin/observation.images.cam_right_wrist.png and b/example/robotwin/observation.images.cam_right_wrist.png differ
diff --git a/markdown/EVAL_ROBOTWIN.md b/markdown/EVAL_ROBOTWIN.md
new file mode 100644
index 0000000..8aa7653
--- /dev/null
+++ b/markdown/EVAL_ROBOTWIN.md
@@ -0,0 +1,92 @@
+# LingBot-VA 在 RoboTwin-2.0 上 Eval 说明
+
+## 前置条件
+
+- **RoboTwin-2.0** 已下载，且 **assets** 已就绪（下文示例路径为 `/mnt/users/wangyuxuan-20250915/EAI/RoboTwin-2.0`，请替换为你本机的实际路径）
+- **LingBot-VA** 模型目录：`lingbot-va-base`（或通过 `LINGBOT_VA_MODEL_PATH` 指定）
+- 推理时 `lingbot-va-base/transformer/config.json` 中 `attn_mode` 为 `"torch"` 或 `"flashattn"`（你当前已是 `flashattn`，无需改）
+
+## 环境依赖（重要）
+
+Eval 时 **client 会启动 RoboTwin 仿真**，当前 Python 环境必须能 `import sapien`，否则会报错：
+
+```text
+ModuleNotFoundError: No module named 'sapien'
+```
+
+请在本机 **用于跑 eval 的 conda 环境**（如 `lingbot-va`）中安装 RoboTwin 仿真依赖，例如：
+
+```bash
+# 0. setuptools 需 <82，否则 sapien 的 pkg_resources 会报错
+pip install 'setuptools<82'
+
+# 1. 系统（若未装）
+sudo apt install libvulkan1 mesa-vulkan-drivers vulkan-tools
+
+# 2. 在 lingbot-va 环境中安装（不覆盖已有 torch）
+pip install sapien==3.0.0   # 或 3.0.0b1（若你从源码/其它源安装）
+pip install open3d scipy mplib gymnasium trimesh imageio pydantic zarr h5py
+pip install -r /mnt/users/wangyuxuan-20250915/EAI/RoboTwin-2.0/script/requirements.txt
+```
+
+**Curobo（运动规划）**：RoboTwin 的 env 依赖 [NVlabs/curobo](https://github.com/NVlabs/curobo)，需在 RoboTwin 仓库下按官方步骤安装：
+
+```bash
+cd /mnt/users/wangyuxuan-20250915/EAI/RoboTwin-2.0/envs
+git clone https://github.com/NVlabs/curobo.git
+cd curobo && pip install -e . --no-build-isolation && cd ../..
+```
+
+若与当前 PyTorch/CUDA 版本不兼容，可参考 [RoboTwin 安装文档](https://robotwin-platform.github.io/doc/usage/robotwin-install.html) 使用与 LingBot-VA 兼容的 torch 版本后再装 curobo。  
+并按需执行 RoboTwin 的 `script/_install.sh`（pytorch3d、mplib/sapien 补丁等）。
+
+## 一键 Eval（同机 Server + Client）
+
+在 **lingbot-va 仓库根目录** 执行：
+
+```bash
+export ROBOTWIN_ROOT=/mnt/users/wangyuxuan-20250915/EAI/RoboTwin-2.0
+export LINGBOT_VA_MODEL_PATH=/mnt/users/wangyuxuan-20250915/EAI/lingbot-va/lingbot-va-base  # 可选，默认即仓库下 lingbot-va-base
+
+# 默认：任务 adjust_bottle，test_num=100，结果到 ./results
+bash script/run_eval_robotwin.sh
+
+# 指定结果目录、任务、测试次数（快速试跑建议 test_num=2）
+bash script/run_eval_robotwin.sh ./results adjust_bottle 2
+```
+
+脚本会先启动 LingBot-VA server（WebSocket），再启动 RoboTwin eval client，结果在 `save_root`（默认 `./results`）下。
+
+## 仅跑 Client（Server 已另起）
+
+若已在其他终端启动 LingBot-VA server（例如 `bash evaluation/robotwin/launch_server.sh`），可只跑 client：
+
+```bash
+export ROBOTWIN_ROOT=/mnt/users/wangyuxuan-20250915/EAI/RoboTwin-2.0
+bash script/run_eval_robotwin_client_only.sh ./results adjust_bottle 2
+```
+
+默认连接端口 `29056`；如需使用其他端口，可通过 `export PORT=<端口号>` 覆盖（需与 server 监听的端口一致）。
+
+## 可选：快速试跑脚本
+
+```bash
+bash script/run_eval_robotwin_quick.sh
+```
+
+等价于：`run_eval_robotwin.sh ./results adjust_bottle 2`，用于快速验证流程。
+
+## 结果与指标
+
+- 视频与可视化：`save_root/stseed-<seed>/visualization/<task_name>/`
+- 成功率等：`save_root/stseed-<seed>/metrics/<task_name>/res.json`（`succ_num`、`total_num`、`succ_rate`）
+
+## 常见问题
+
+| 现象 | 处理 |
+|------|------|
+| `ModuleNotFoundError: No module named 'sapien'` | 在当前环境中安装 sapien 及 RoboTwin 依赖（见上方「环境依赖」） |
+| `attn_mode` 相关报错 | 确认 `transformer/config.json` 中为 `"torch"` 或 `"flashattn"` |
+| 未找到 `ROBOTWIN_ROOT/assets` | 先下载 RoboTwin assets，或设置 `ROBOTWIN_ROOT` 到正确路径 |
+| 未找到 `curobo` / `CuroboPlanner` | 在 RoboTwin 下安装 curobo：`cd $ROBOTWIN_ROOT/envs && git clone https://github.com/NVlabs/curobo.git && cd curobo && pip install -e . --no-build-isolation` |
+| Server 启动失败 | 检查 GPU、CUDA、以及模型路径 `LINGBOT_VA_MODEL_PATH` |
diff --git a/markdown/INFERENCE.md b/markdown/INFERENCE.md
new file mode 100644
index 0000000..32761e4
--- /dev/null
+++ b/markdown/INFERENCE.md
@@ -0,0 +1,117 @@
+# LingBot-VA 推理调试指南
+
+按以下步骤可在单机上把 **Image-to-Video-Action (i2va)** 推理跑通。
+
+## 1. 环境
+
+- Python 3.10、PyTorch 2.9、CUDA 12.6（与 README 一致）
+- 安装依赖后，**推理** 时 `transformer` 的 `attn_mode` 必须为 `"torch"` 或 `"flashattn"`，不能为 `"flex"`
+
+## 2. 下载模型
+
+从 [HuggingFace](https://huggingface.co/robbyant/lingbot-va-base) 或 [ModelScope](https://modelscope.cn/models/Robbyant/lingbot-va-base) 下载 **lingbot-va-base**，得到本地目录，例如：
+
+```text
+/path/to/lingbot-va-base/
+├── vae/
+├── tokenizer/
+├── text_encoder/
+└── transformer/
+```
+
+## 3. 设置推理用 attn_mode
+
+编辑 **`<模型目录>/transformer/config.json`**，将 `"attn_mode"` 改为 `"torch"` 或 `"flashattn"`：
+
+```json
+"attn_mode": "torch"
+```
+
+（训练时为 `"flex"`，推理必须改掉，否则会报错。）
+
+## 4. 准备首帧图像（i2va）
+
+使用 **robotwin_i2av** 配置时，需要 3 张首帧 PNG，放在 `example/robotwin/` 下，文件名为：
+
+- `observation.images.cam_high.png`
+- `observation.images.cam_left_wrist.png`
+- `observation.images.cam_right_wrist.png`
+
+**快速生成占位图**（仅用于跑通流程）：
+
+```bash
+cd /path/to/lingbot-va
+python example/robotwin/create_dummy_images.py
+```
+
+会在 `example/robotwin/` 下以上述 3 个文件名生成占位图（高 256 × 宽 320 像素）。
+
+## 5. 单 GPU 跑 i2va
+
+在仓库根目录下执行：
+
+```bash
+export LINGBOT_VA_MODEL_PATH=/path/to/lingbot-va-base
+bash script/run_i2va_single_gpu.sh
+```
+
+未设置 `LINGBOT_VA_MODEL_PATH` 或路径不对时，脚本会报错并提示。
+
+- 结果会写到 `save_root`（默认 `./train_out`，可修改）下的 `real/<prompt>_<时间>/`。
+- 其中会保存 `latents_*.pt`、`actions_*.pt`，以及用首帧 + 预测 latent 解码得到的 `demo.mp4`（在 generate 流程里）。
+
+## 6. 可选：换配置 / 减少步数
+
+- 使用 **demo** 配置（2 视角、不同 action 维度）时，可改用 `demo_i2av`，并设置 `example/demo/` 下对应名称的 PNG（见 `va_demo_cfg.obs_cam_keys`）。
+- 在对应 config 里可调：
+  - `num_inference_steps` / `action_num_inference_steps`：减小可加快推理（质量会下降）；
+  - `num_chunks_to_infer`：i2va 生成的总 chunk 数；
+  - `frame_chunk_size`：每个 chunk 的帧数。
+
+## 7. 常见错误
+
+| 现象 | 处理 |
+|------|------|
+| `attn_mode` 相关报错 | 确认 `transformer/config.json` 里为 `"torch"` 或 `"flashattn"` |
+| 找不到 `observation.images.*.png` | 在 `example/robotwin/` 下运行 `create_dummy_images.py` 或自行放置同名 PNG |
+| CUDA OOM | 使用 README 中的 offload（VAE、text_encoder 放到 CPU），或减小 `frame_chunk_size` / 推理步数 |
+| 找不到 `wan_va` 模块 | 在 **仓库根目录**（即直接包含 `wan_va/` 目录的那一级）执行 `bash script/run_i2va_single_gpu.sh` |
+
+## 8. Server 模式（与仿真器联调）
+
+若要与 RoboTwin 等仿真器联调，使用 **server** 模式：
+
+- 启动推理服务：`bash evaluation/robotwin/launch_server.sh`（需先设好 `wan22_pretrained_model_name_or_path` 或 `LINGBOT_VA_MODEL_PATH`）
+- 再在另一终端启动 client / 仿真器，见 README 的 RoboTwin 部署说明。
+
+上述步骤可保证 i2va 推理从环境、模型、首帧到单 GPU 脚本整条链路打通；若某一步报错，把报错信息与对应步骤号贴出来即可继续排查。
+
+---
+
+## 9. 下载 Post-Training 数据集（用于微调）
+
+**该数据集仅在 HuggingFace 提供，ModelScope 无此数据集。**
+
+- 数据集：`robbyant/robotwin-clean-and-aug-lerobot`（LeRobot 格式，用于 post-training 微调）
+
+**一键下载到仓库下默认目录：**
+
+```bash
+# 需先安装: pip install huggingface_hub
+python script/download_dataset.py
+```
+
+**下载到指定目录：**
+
+```bash
+python script/download_dataset.py /path/to/save
+```
+
+下载完成后，训练时设置环境变量即可：
+
+```bash
+export LINGBOT_VA_DATASET_PATH=/path/to/robotwin-clean-and-aug-lerobot
+NGPU=8 bash script/run_va_posttrain.sh
+```
+
+配置里已支持从 `LINGBOT_VA_DATASET_PATH` 读取数据集路径（见 `wan_va/configs/va_robotwin_train_cfg.py`）。
diff --git a/INSTALL.md b/markdown/INSTALL.md
similarity index 100%
rename from INSTALL.md
rename to markdown/INSTALL.md
diff --git a/markdown/ideas/ideas_0305.md b/markdown/ideas/ideas_0305.md
new file mode 100644
index 0000000..0db7b3c
--- /dev/null
+++ b/markdown/ideas/ideas_0305.md
@@ -0,0 +1,279 @@
+# WAM（World Action Model）推理加速：创新点清单（training-free 为主）
+
+下面的点尽量围绕 **WAM 的独特性**（动作会改变世界、需要闭环控制、world/action 联合生成且可复用缓存），避免照搬纯视频生成或常见 VLA（BC/RT/纯 policy）套路。每条都给出：机制、落点、加速来源、风险与验证方式。
+
+---
+
+## 背景：当前推理耗时主要来自哪里
+
+以当前实现（`wan_va/wan_va_server.py`）为例，推理开销通常由以下部分主导：
+
+- **NFE（transformer 前向次数）**：video 分支 `num_inference_steps` + action 分支 `action_num_inference_steps`。
+- **Token 数 / 注意力有效边数**：world latent token 远多于 action token；action 阶段还会 attend 到 world 的 KV cache。
+- **Cache 维护成本**：KV cache 增长、slot 管理、以及（若设计不当）无约束的可见范围带来的注意力开销。
+- **观测编码与数据搬运**：多视角图像 resize/stack + VAE encode；offload 时还有 CPU↔GPU 传输。
+
+因此，推理加速的“杠杆”主要是：**减少 NFE、减少有效 token/边、复用/压缩 cache、减少不必要的 world 去噪、减少数据搬运**。
+
+---
+
+## 设计原则（WAM 专属）
+
+- **动作优先**：若最终指标是控制成功率，世界生成只需提供“对动作有用”的信息（不是高保真视频）。
+- **干预因果**：动作是干预变量（intervention），world 必须对 action 敏感；world 的哪些 token 对 action 重要应可识别并被优先计算。
+- **闭环与低延迟**：每个控制周期必须准时输出动作；允许“粗动作 + 快速修正”的策略。
+- **缓存可迁移**：同一 episode 内、甚至相似场景间，许多计算（KV、prompt embedding、静态背景 token）可复用。
+
+---
+
+## 创新点 1：Action-first 世界“懒惰化”（Lazy World Denoising, LWD）
+
+- **WAM 特性**：world 只是 action 的上下文，不一定需要每个周期都高质量去噪。
+- **算法**：
+  - 用一个廉价的 **world-need 指标** 决定是否跑 world 去噪：例如观测变化幅度、接触/抓取状态变化、动作不确定性（action 方差/熵）、或“动作对 world token 的注意力集中度”。
+  - 若指标低：本周期 **跳过/极少** world steps（例如 `video_exec_step=0~2`），直接用已有 world cache 生成 action。
+  - 若指标高：再启用正常 world 去噪。
+- **落点**：`wan_va_server.py::_infer()` 中 video loop 的 step budget 动态化（per-chunk/per-cycle）。
+- **加速来源**：直接砍掉大量 world NFE。
+- **风险/验证**：
+  - 风险：遇到“需要精确世界状态”的时刻误判。
+  - 验证：按任务阶段分桶（抓取前/抓取中/放置前）统计跳过 world 的成功率变化；并用“误判率”分析指标设计。
+
+---
+
+## 创新点 2：多速率联合生成（Multi-rate WAM Sampling）
+
+- **WAM 特性**：动作需要高频更新，世界状态（尤其背景/静态部分）可以低频更新。
+- **算法**：
+  - world 每 \(K\) 个控制周期才更新一次（或每个 chunk 只在首周期更新）。
+  - action 每个周期都更新；action 的 attention 只看“最近一次更新的 world KV + 最新观测条件 token（很短）”。
+  - 可加一个轻量的 **观测增量 token**：只编码/写入最新帧的少量 summary token，而不是全量世界 latent。
+- **落点**：server 模式下 `compute_kv_cache` 与 `infer` 的调用节奏；以及 cache 可见范围控制。
+- **加速来源**：world 侧 NFE 与 token 更新频次下降；action 侧每步 K/V 更短。
+- **风险/验证**：
+  - 风险：world 低频导致“状态滞后”。
+  - 验证：对比不同 \(K\) 下的成功率/动作抖动/碰撞率，并量化延迟降低。
+
+---
+
+## 创新点 3：WAM 专属“因果 cache 视野裁剪”（Causal Cache Cropping）
+
+- **WAM 特性**：动作只需要与当前动作因果相关的 world 部分（例如手、被操作物体、容器口等）。
+- **算法**：
+  - 用 action→world 的注意力或梯度（近似）在推理时在线得到 **重要 world token 子集**。
+  - 将 KV cache 按重要性裁剪：action 阶段只 attend 到 Top-\(M\) 的 world KV（或 ROI KV）。
+  - 重要性可以是：
+    - 最近几步 action token 对 world token 的平均 attention；
+    - 或 “动作预测对 world token 的敏感度” 的低成本近似（例如 last-layer attention 聚合）。
+- **落点**：`modules/model.py` 的 cache 读出（`valid` selection）处引入“重要 token mask”；或在写入时就打标分类。
+- **加速来源**：减少 action 阶段的 K/V 长度，降低注意力计算。
+- **风险/验证**：
+  - 风险：裁剪错误导致关键几何信息缺失。
+  - 验证：记录被裁剪 token 的类别/位置；对失败案例可视化 attention 以调参。
+
+---
+
+## 创新点 4：KV cache 在线压缩成“世界摘要 token”（Online World Summarization）
+
+- **WAM 特性**：对控制而言，世界只需少量摘要（物体相对位姿、可达性、接触）即可指导动作。
+- **算法（training-free）**：
+  - 周期性将较旧的 world KV 压缩成少数 \(K\) 个 **summary KV**（例如用 attention pooling / mean pooling / PCA 低秩投影）。
+  - 将被压缩的原 KV slots 释放，保持 cache 上限。
+  - action 阶段优先看 summary + 最近窗口内的高分辨 KV。
+- **落点**：`modules/model.py` cache 管理（slot 释放策略）+ 一个压缩函数（可放在 `utils/`）。
+- **加速来源**：控制 cache 长度，避免长 episode 变慢；action 注意力更短。
+- **风险/验证**：
+  - 风险：摘要丢失细粒度操作信息。
+  - 验证：按“接触/精细操作阶段”关闭压缩或提高 \(K\)；做阶段自适应。
+
+---
+
+## 创新点 5：用“速度场历史”做多步外推（AB2/AB3 for Flow, training-free）
+
+- **WAM 特性**：你们的 `FlowMatchScheduler.step()` 是显式 Euler（速度场积分），天然可做多步法。
+- **算法**：
+  - 保存前一步（或前两步）的速度 \(v_{t-1}, v_{t-2}\)。
+  - 用 Adams–Bashforth（2/3 阶）更新：
+    - AB2：\(\Delta x \approx \Delta\sigma \cdot (3/2\,v_t - 1/2\,v_{t-1})\)
+    - AB3：\(\Delta x \approx \Delta\sigma \cdot (23/12\,v_t - 16/12\,v_{t-1} + 5/12\,v_{t-2})\)
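+  - 在更少 steps 下维持质量（或相同 steps 下更好质量）。注意：上述 AB2/AB3 系数假设相邻步的 \(\Delta\sigma\) 相等；若 σ 调度非均匀，需改用变步长 Adams–Bashforth 系数；且最初 1～2 步没有足够的速度历史，需先用 Euler（AB3 再经 AB2）启动。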
+- **落点**：`wan_va/utils/scheduler.py` 新增 `step_ab2/ab3`；`wan_va_server.py` 循环里维护速度历史。
+- **加速来源**：允许把 steps 降得更激进而不崩（减少 NFE）。
+- **风险/验证**：
+  - 风险：外推不稳定，尤其在 guidance 强或噪声大区域。
+  - 验证：对比 Euler/AB2 在 10/15/20 steps 下的成功率与动作平滑度。
+
+---
+
+## 创新点 6：WAM 版 speculative control：快动作提案 + 单步世界验真 + 局部修正
+
+- **WAM 特性**：控制可以容忍“先给一个可执行动作”，再快速修正（闭环）。
+- **算法**：
+  1) 用极少 steps（甚至仅 action 分支）产生 action 提案 \(a_t\)。
+  2) 用 1 次（或极少）world 更新预测 \(\hat{o}_{t+1}\) 的关键摘要（不需要整段视频）。
+  3) 若验真失败（违反约束/目标偏差大），再追加少量 steps 对 action 做 refinement（或回退到完整采样）。
+- **落点**：server 模式最自然；需要一个廉价的“验真器”（可用现有 world head 的低成本统计，或基于任务约束的几何判据）。
+- **加速来源**：大多数时刻只走快路径；仅少数困难时刻走慢路径。
+- **风险/验证**：
+  - 风险：验真器设计不良导致误放行。
+  - 验证：统计快路径命中率、回退率、以及回退带来的尾延迟。
+
+---
+
+## 创新点 7：动作维度的“冻结/早停”（Action Token Freezing）
+
+- **WAM 特性**：很多动作维度在大多数阶段接近常量（例如部分关节、或某些夹爪维度）。
+- **算法（training-free）**：
+  - 在 action 去噪过程中，监测每个 action 维度（或 token）的变化量 \(|\Delta a|\)。
+  - 若连续 \(m\) 步 \(|\Delta a| < \epsilon\)，则将该维度标记为 frozen：后续 steps 不再更新（或仅做一次低频更新）。
+  - 对 frozen 维度可以在 transformer 输出后做 mask，或在输入噪声中置零相应更新。
+- **落点**：`wan_va_server.py` action loop + `actions_mask` 扩展；或在 `postprocess_action` 前做冻结逻辑。
+- **加速来源**：减少有效 action token 更新与注意力计算（尤其在更细粒度 token 化时效果更明显）。
+- **风险/验证**：冻结阈值/窗口需要按任务阶段自适应。
+
+---
+
+## 创新点 8：观测编码复用（Obs/VAE Encode Reuse with Change Detection）
+
+- **WAM 特性**：相邻控制周期多视角图像变化可能很小；重复 VAE encode 浪费。
+- **算法**：
+  - 对输入图像做低成本 hash/差分（例如 downsample 后 L2、或 SSIM 近似）。
+  - 若变化小于阈值：复用上次的 encoded latent（或仅对变化最大的相机视角重编码）。
+  - 对 wrist camera 可按运动幅度自适应刷新频率。
+- **落点**：`wan_va_server.py::_encode_obs()` 外围加缓存与变化检测。
+- **加速来源**：减少 VAE encode 与 CPU↔GPU 传输（offload 时收益更大）。
+- **风险/验证**：快速运动/遮挡变化需要强制刷新。
+
+---
+
+## 创新点 9：WAM 目标导向的 step 自适应（Goal-Progress Adaptive Step Budget）
+
+- **WAM 特性**：控制目标通常有可计算的进度指标（距离、门角度、抓取状态、容器对齐）。
+- **算法（training-free）**：
+  - 在每次输出 action 前，用一个廉价的 proxy 估计“本周期动作对目标推进是否足够”（例如从 action 大小、方向一致性、或 world 摘要预测中估计）。
+  - 若推进明显：减少后续采样 steps；若推进停滞/反向：增加 steps 或触发回退策略。
+- **落点**：server loop 中的 step controller；与上面的 speculative control 可组合。
+- **加速来源**：多数“简单阶段”少算，困难阶段多算。
+- **风险/验证**：需要设计任务无关的通用 proxy（或按任务族提供不同 proxy）。
+
+---
+
+## 创新点 10：控制专用的“低保真 world”通道（World-for-Control Channel）
+
+- **WAM 特性**：控制不需要高保真像素细节，而需要几何/接触/可达性。
+- **算法**：
+  - 不改变训练（或极少改动），在推理时从现有 latent 中提取一个极低维的控制特征（例如每帧若干统计：均值/方差、或固定池化得到的 K 个 token）。
+  - action 阶段只 attend 到这些控制 token + 最近窗口内的少量高分辨 token（必要时再补全）。
+- **落点**：在 world cache 写入后追加一段 pooled token；action 阶段只读 pooled token（类似“在线瓶颈”）。
+- **加速来源**：大幅缩短 action 侧的 K/V。
+- **风险/验证**：对精细插入/对齐任务，可能需要阶段性打开高分辨 KV。
+
+---
+
+## 创新点 11：候选动作并行 + 单次世界评估（Batch Candidates, Shared World KV）
+
+- **WAM 特性**：world KV 可共享；动作候选可以并行评估，比串行多次采样更划算。
+- **算法**：
+  - 以 batch 方式并行生成 \(K\) 个 action 候选（少步数/不同噪声 seed）。
+  - 复用同一份 world KV（以及 prompt embedding），用廉价 world 评估器/约束检查选最优。
+  - 只对选中的候选做后续 refinement（可选）。
+- **落点**：`wan_va_server.py` 将 action 采样 batch 化（维持 world cache 相同），以及选择器实现。
+- **加速来源**：用并行吞吐换更少回退与更少长采样；在 GPU 上常更划算。
+- **风险/验证**：batch 增大显存；需要找到“选优 proxy”。
+
+---
+
+## 创新点 12：训练-推理对齐的“交错式联合步”但保持 cache（Interleaved Joint-Step with Cache）
+
+- **WAM 特性**：训练里 world/action 联合序列 + 因果 mask；推理里分阶段。对齐可减少步数敏感性。
+- **算法（尽量 training-free）**：
+  - 每个大步只做 **一次** transformer 调用，但在 cache 中交错写入：
+    1) 读 world cache，更新一小步 world（写入 pred KV）
+    2) 立刻在同一步用更新后的 KV 更新 action（或用同一 forward 的共享 trunk，输出两个 head）
+  - 目标是：在不显著增加 token 的前提下，把“2 次 forward/步”变成“1 次 forward/步”（或减少总步数）。
+- **落点**：需要对 `modules/model.py` 的前向做轻量封装（共享 block 计算、双 head 输出），或复用 `forward_train` 的拼接思路但用 cache 控制稀疏可见范围。
+- **加速来源**：减少总前向次数（NFE）。
+- **风险/验证**：实现复杂；需要确保 mask/缓存可见性不会引入未来泄露。
+
+---
+
+## 创新点 13：KV cache 冷热分层 + 冷层量化（Hot/Cold KV Tiering + Quantized Cold Cache）
+
+- **WAM 特性**：episode 变长时 cache 增长拖慢 action 注意力；但真正有用的通常是最近窗口 + 少量长期记忆。
+- **算法（training-free）**：
+  - 将 KV 分为 **Hot**（最近 \(W\) 帧 bf16/fp16）与 **Cold**（更旧部分 int8/fp8 或 cpu-pinned）。
+  - action 阶段默认只 attend Hot + 少量 Cold summary；事件触发时临时扩大可见 Cold。
+- **落点**：`modules/model.py` cache 存取与 `valid` selection（读时拼接/解量化）。
+- **加速来源**：缩短有效 K/V、降显存、避免长 episode 变慢（offload 场景更明显）。
+- **风险/验证**：量化误差；用分阶段策略（精细操作阶段禁用或提高精度）。
+
+---
+
+## 创新点 14：Cache 的 Delta Write（无更新就不写，避免无效 cache 维护）
+
+- **WAM 特性**：很多周期 world 不更新或只更新 ROI，但 naive 仍写整段 KV，导致 cache 膨胀与带宽浪费。
+- **算法**：当本周期 world 不更新（或只更新 ROI），则 **禁止写入 pred cache**（或只写 ROI token 的 KV）。
+- **落点**：`wan_va_server.py` 的 `update_cache` 策略 + `modules/model.py` cache 写入逻辑。
+- **加速来源**：减少 cache 写带宽、控制 cache 长度增长、降低后续注意力开销。
+- **风险/验证**：需保证因果一致性；记录“跳写比例 vs 失败案例”。
+
+---
+
+## 创新点 15：World 的 ROI Token Update（按“动的/因果相关的”token 更新 world）
+
+- **WAM 特性**：背景静态，关键是“手+物体+接触区域”；全量 world token 去噪浪费。
+- **算法（training-free）**：
+  - 用廉价变化检测（多视角 downsample diff / 光流近似）得到 ROI。
+  - 仅对 ROI token 做 world 更新，其余 token 直接复用上一周期 world latent/KV。
+- **落点**：ROI 从 `_encode_obs` 外围产出；world 数据打包/patch 逻辑支持子集 token（block 粒度先做）。
+- **加速来源**：world token 数显著下降 → 注意力边数下降 → NFE 成本下降。
+- **风险/验证**：ROI 映射误差；先用粗 block 降错杀风险。
+
+---
+
+## 创新点 16：Action Diffusion Warm-start（跨周期动作 latent 热启动）
+
+- **WAM 特性**：相邻控制周期动作连续；每次从纯噪声采样浪费。
+- **算法**：用上周期 action latent 末态做本周期初值（加小噪声/从中间 sigma 开始），并结合早停/冻结降低 steps。
+- **落点**：`wan_va_server.py` action loop 维护 `prev_action_latent`，改变初始化与 timesteps 起点。
+- **加速来源**：减少 action NFE（action_steps 高时收益更明显）。
+- **风险/验证**：突变场景会卡住；用事件触发强制重启（random init）。
+
+---
+
+## 创新点 17：Scheduler Early-Exit（用收敛判据提前结束扩散）
+
+- **WAM 特性**：很多周期去噪很快收敛，后续 steps 增益小。
+- **算法（training-free）**：监测 \(\|v_t-v_{t-1}\|\) / \(\|\Delta x\|\) 等，连续低于阈值则提前结束该分支（world 或 action）。
+- **落点**：`wan_va_server.py` 的 video/action loop 或 `FlowMatchScheduler.step()` 外围。
+- **加速来源**：直接减少 NFE。
+- **风险/验证**：阈值敏感；做离线 sweep 找 Pareto。
+
+---
+
+## 创新点 18：CPU↔GPU 搬运与算子重叠（Encode/Transfer Overlap with CUDA Streams）
+
+- **WAM 特性**：offload + 多视角 preprocess 常让 CPU↔GPU 搬运成为瓶颈，拉高尾延迟。
+- **算法**：pinned memory + 异步 H2D；VAE encode 独立 CUDA stream；与 transformer forward 重叠；CPU preprocess 线程化预取下一周期。
+- **落点**：`wan_va_server.py::_encode_obs*()` 与 `_compute_kv_cache()` 的 pipeline 拆分与重叠。
+- **加速来源**：减少 pipeline 泡沫，显著改善 P90/P99。
+- **风险/验证**：工程复杂；用 profiler 验证 overlap 生效。
+
+---
+
+## 建议的落地顺序（按“最可能立刻提速/风险最低”排序）
+
+1. **LWD（跳过/少跑 world）** + **自适应 step budget**（最直接砍 NFE）
+2. **AB2/AB3 多步法**（不改模型，仅改 scheduler/循环）
+3. **Causal Cache Cropping / Summary token**（减少 action 阶段 attention）
+4. **Obs/VAE encode 复用**（工程上收益稳定）
+5. **Speculative control（快路径 + 验真 + 回退）**（闭环友好，尾延迟需评估）
+
+---
+
+## 评估指标（建议同时记录）
+
+- **吞吐/时延**：ms/step、ms/chunk、P50/P90/P99 时延、GPU 利用率、显存峰值。
+- **控制质量**：成功率、碰撞率、动作抖动（\(\|\Delta a\|\)）、阶段性失败分布（抓取/放置/开合）。
+- **计算分解**：world NFE vs action NFE、cache 长度随时间变化、VAE encode/CPU↔GPU 传输占比。
+
diff --git a/markdown/ideas/ideas_action_relevant_video_tokens_0305.md b/markdown/ideas/ideas_action_relevant_video_tokens_0305.md
new file mode 100644
index 0000000..d45b5e5
--- /dev/null
+++ b/markdown/ideas/ideas_action_relevant_video_tokens_0305.md
@@ -0,0 +1,157 @@
+# 识别“对下一个动作强相关”的 video/world tokens（training-free 创新点）
+
+目标：在 WAM（world/action 联合扩散）推理中，在线识别当前生成的 **video/world tokens** 里哪些对 **下一步动作** 最关键，并 **优先保留/更新** 这些 tokens（其余 token 可被裁剪为不可见、压缩为摘要、或低频刷新）。
+
+> 关键建议：优先做 **KV 可见性裁剪**（token 仍存在，但后续注意力只看关键 token），工程风险远低于“真正删 token 让前向变短”。后者可作为第二阶段。
+
+---
+
+## 0) 定义：什么叫“对下一个动作强相关”
+
+给定 world token \(w_i\)、下一步动作输出 \(a\)（或 action 去噪过程中的噪声预测 \(\epsilon_a\) / 速度场 \(v_a\)），我们希望估计某种 “重要性”：
+
+- **Sensitivity/Influence**：\( \partial a / \partial w_i \) 大
+- **Information**：移除/扰动 \(w_i\) 会明显改变动作分布（均值/方差）
+- **Causal utility**：\(w_i\) 能解释 action 的决策（而不是仅相关）
+
+training-free 的核心：用 **模型本身的 attention/输出变化** 做 proxy，而不是再训练一个 importance head。
+
+---
+
+## 1) Action→World 注意力归因（最可行、WAM 专属）
+
+### 1.1 单层/多层 attention 聚合
+
+**思路**：在 action 分支 forward 时，拿到 action tokens 对 world tokens 的 attention 权重，把它当作 “action 正在读取的信息”。
+
+- **importance**：
+  \[
+  \mathrm{imp}(w_i) = \sum_{a \in \mathcal{A}} \mathrm{Attn}(a \rightarrow w_i)
+  \]
+  可对最后 1–2 层、或全部层加权求和（越靠后越贴近输出）。
+
+### 1.2 跨 step 的 EMA 稳定化（减少抖动）
+
+- 维护 \( \hat{\mathrm{imp}}_t = \alpha \hat{\mathrm{imp}}_{t-1} + (1-\alpha)\mathrm{imp}_t \)
+- 用 \(\hat{\mathrm{imp}}_t\) 做 Top‑K/Top‑Blocks 选择。
+
+### 1.3 强制保留项（防止“注意力偏见”漏掉关键几何）
+
+即便 attention 给低分，也强制保留：
+
+- **最近帧** tokens（短期因果）
+- **变化显著 ROI** tokens（见第 3 节）
+- **wrist/手附近** 的 token 块（可用 state/相机先验映射）
+
+**优势**：不改训练、信号天然存在；最贴合 “world 为 action 服务” 的 WAM 叙事。  
+**风险**：attention ≠ 因果；需要用扰动验证（第 2 节）做 sanity check。
+
+---
+
+## 2) 反事实扰动评分（更因果，但更耗）
+
+### 2.1 Token dropout / mask 的 action 变化量
+
+**思路**：对候选 token 集合做小扰动，看 action 输出改变多少。
+
+- 扰动方式：
+  - **drop**：把 token 的 K/V 置零或替换为 mean token
+  - **noise**：加小噪声扰动 token 表示
+- 评分：
+  - \(\Delta_a = \| a - a^{(\text{drop } i)} \|\)
+  - 或动作分布差异（若你有多样采样）：KL/方差变化
+
+### 2.2 “二阶段”快速筛选（降低开销）
+
+先用 attention 得到 Top‑M 候选，再对这 M 个做反事实验证，最终保留 Top‑K（K≪M）。
+
+**优势**：更接近因果影响，解释性强。  
+**代价**：需要额外 forward（可只在低频或关键阶段运行）。
+
+---
+
+## 3) Change/Flow 驱动的 token 重要性（与多视角观测强耦合）
+
+### 3.1 观测变化 → latent block 变化 → token 重要性
+
+**思路**：动作决策往往依赖 “正在变化的部分”（手、物体、接触）。用低成本图像变化检测做 ROI，再映射到 token 网格。
+
+- 变化信号：
+  - 多视角 downsample diff（mean abs diff）
+  - 近似光流（低分辨率即可）
+  - 目标/手/物体 mask 变化（若你有分割器）
+- 映射方式：
+  - 先按 **粗 block**（例如 latent 8×8 patch）做 ROI，降低错杀风险
+  - 对多相机拼接的 latent：每个相机对应固定的宽度区间，ROI 映射到对应 slice
+
+### 3.2 与 attention 融合（最稳的组合）
+
+最终分数：
+
+\[
+\mathrm{score}(w_i)=\lambda\,\hat{\mathrm{imp}}_{\text{attn}}(w_i) + (1-\lambda)\,\hat{\mathrm{imp}}_{\text{change}}(w_i)
+\]
+
+**优势**：training-free、成本低、能覆盖“注意力没看但其实关键在动”的情况。  
+**风险**：变化不等于因果（例如光照变化）；靠强制保留/阈值鲁棒化。
+
+---
+
+## 4) Action-uncertainty 驱动的重要性（不确定性越大越需要更多 token）
+
+**思路**：当 action 分支对世界不确定时，通常需要更多上下文 token。反过来：能显著降低 action 不确定性的 token 更重要。
+
+- 做法：
+  - 对同一 obs，采样多次 action（不同 seed/少量步）得到方差 \(\mathrm{Var}(a)\)
+  - 逐步引入 token 子集（从 Top‑K 开始）观察方差下降速度
+  - 能最快降低方差的 tokens 视为关键
+
+**优势**：直接服务“下一步动作稳定/确定”。  
+**代价**：需要少量多样采样（可低频做）。
+
+---
+
+## 5) 梯度/一阶敏感度近似（更直接，但需要可导获取）
+
+**思路**：用一阶近似度量 token 对动作的影响（类似 saliency）。不更新参数，但允许取梯度。
+
+- 定义标量目标：例如 action 预测的 L2 范数、或某些维度的 logit/均值
+- 计算：
+  - \(\mathrm{imp}(w_i)=\|\nabla_{w_i} \mathcal{L}\|\) 或 \(\|w_i\odot \nabla_{w_i}\mathcal{L}\|\)
+- 实用技巧：
+  - 只对最后 1–2 层 token 取梯度（降开销）
+  - 只在关键阶段触发（接触/遮挡）
+
+**优势**：比 attention 更接近“影响”。  
+**风险**：实现侵入+开销；且梯度噪声大，需要平滑/分块。
+
+---
+
+## 6) 如何“保留这些 tokens”：三种落地形态（从易到难）
+
+### 6.1 KV 可见性裁剪（最推荐）
+
+- world token 仍写 cache，但 action 阶段只读取 Top‑K 的 world K/V（其余视为被 prune）
+- 好处：不改 `mesh_id` / patch 打包；不破坏张量形状；容易 A/B。
+
+### 6.2 Cache 生命周期管理（Hot/Cold + Summary）
+
+- 保留关键 tokens 为 Hot，低重要 tokens 变 Cold（压缩/量化/低频更新）
+- 适合长 episode，避免 token 长度增长导致越跑越慢。
+
+### 6.3 真正缩短序列（token merging / ToMe）
+
+- 对低重要 tokens 做 merge，减少 forward token 数
+- 工程侵入最大，建议在 6.1/6.2 验证收益后再做。
+
+---
+
+## 7) 实验与验证（必须做的 sanity checks）
+
+- **Token recall**：关键物体/手附近 tokens 是否在 Top‑K 覆盖率高（可视化到像素/patch）。
+- **Ablation**：
+  - 只用 attention / 只用 change / 融合
+  - K 从小到大扫（Pareto：时延 vs 成功率）
+- **反事实验证**：随机丢 token vs 丢低分 token vs 丢高分 token，比较 action 变化量与任务成功率。
+- **长序列曲线**：episode 越长，是否还能保持时延不增长（cache hygiene 成功与否）。
+
diff --git a/markdown/ideas/ideas_dynamic_step_budget_0305.md b/markdown/ideas/ideas_dynamic_step_budget_0305.md
new file mode 100644
index 0000000..e0cd5fd
--- /dev/null
+++ b/markdown/ideas/ideas_dynamic_step_budget_0305.md
@@ -0,0 +1,103 @@
+# 动态 Step Budget：Lazy World Denoising / Multi-rate 实现思路
+
+目标：**每控制周期动态决定 world/action 的步数**，在不动模型结构的前提下砍 world NFE（通常是最大头）。对应主清单 `ideas_0305.md` 创新点 1/2/9/11。
+
+---
+
+## 1. 核心直觉
+
+- **World 分支**：token 多、attention 重，是推理大头；但很多周期里“世界没怎么变”或“action 不需要新世界信息”。
+- **做法**：大多数周期 world **少算/不算**（用旧 world cache/旧 world latent）；少数关键周期 world **多算**。
+- **实现**：每个 control cycle 动态决定 `world_steps`、`action_steps`（以及是否强制刷新观测/缓存）。
+
+---
+
+## 2. Lazy World Denoising（LWD）
+
+**含义**：world 按需算——当“世界没怎么变 / action 不需要新世界”时，world 去噪步数从 full 降到 0~2。
+
+**代码落点（最小改动）**  
+`wan_va_server.py::_infer()` 当前逻辑：
+
+- `self.scheduler.set_timesteps(video_inference_step)`
+- `timesteps = timesteps[:video_step]`（当 `video_step != -1`）
+
+把 **`video_step` 从固定 config 改为每周期计算的 `world_steps`**：
+
+- **world_steps = 0**：跳过 video loop。需注意：若 `frame_st_id == 0`，仍需把 `latents[:, :, 0:1]` 设为 `init_latent`（第一帧条件注入），否则偏离有条件生成。
+- **world_steps = 1~3**：跑极少步，维持 KV/latent 的“新鲜度”。
+- **world_steps = full**：关键周期跑满。
+
+**world_need 指标（training-free、低成本）**  
+结合已有观测变化检测（如 `_obs_change_score` / obs ref）：
+
+- **obs_change**：多视角 downsample 的 mean-abs-diff。
+- **state_delta**：`np.max(np.abs(state - prev_state))`。
+- **action_delta**：上一周期输出动作幅度/jerk，如 `||a_t - a_{t-1}||`。
+- **外部强事件**：碰撞、抓取成功、目标切换等 → 强制 `world_steps = full`。
+
+**简单分段规则示例**：
+
+```python
+if force_refresh or obs_change > T_hi or state_delta > S_hi:
+    world_steps = FULL
+elif obs_change < T_lo and state_delta < S_lo and action_delta < A_lo:
+    world_steps = 0
+else:
+    world_steps = 2  # 小步刷新
+```
+
+---
+
+## 3. Multi-rate（多速率 / 快慢双环）
+
+**含义**：world 低频更新、action 高频更新；不必每个控制周期都更新 world。
+
+**实现要点**：
+
+- 维护计数器 `world_update_countdown`（或等价逻辑）。
+- 每次 `infer()`：
+  - 若 countdown == 0 或事件触发：跑一次 world 更新（少步或 full），然后 countdown = K。
+  - 否则：本周期 `world_steps = 0`，只跑 action。
+- action 分支继续使用**上一轮** world 的 cache/latent；需保证在“更新周期”里 world cache 确实被更新（如 `_compute_kv_cache()` 或 video loop 最后一次 forward 的 `update_cache=1`）。
+
+**落点**：server 内 `infer()` 的调用节奏 + `_infer()` 的 step 覆盖逻辑。
+
+---
+
+## 4. Goal-Progress Adaptive Budget（创新点 9 的加速版）
+
+**含义**：用任务进度 proxy 调预算——越接近目标/越简单 → 步数越少；停滞/反向/恢复 → 步数增加。
+
+**实现**：step controller 多一个输入 `progress_proxy`（末端到目标距离、抓取状态、门角度等），并入 LWD 的分段规则，例如：
+
+- 进度明显推进 → 减少 world/action steps。
+- 进度停滞或反向 → 增加 steps 或触发回退策略。
+
+---
+
+## 5. Batch Candidates（创新点 11，用并行换更少大步数）
+
+**含义**：用**小步数**并行生成 K 个 action 候选，用廉价选择器（规则/约束/碰撞几何）选一个，再对选中者做少量 refinement（可选）。目标是降低“单路径长 steps 兜底”的概率，从而降低**平均** NFE 与尾延迟。
+
+**实现要点**：
+
+- action latent 的 batch 维扩成 K，**共享同一份 world cache**（不复制 world token）。
+- 跑 `action_steps_small`，选优后再单独跑 `action_steps_refine`（可选）。
+
+---
+
+## 6. 最小可落地改法（建议起步）
+
+1. 在 `VA_Server` 增加 **step controller** 状态：`prev_state`、`prev_action`、`prev_obs_change`（或等价）。
+2. `_infer()` 支持 **override**：`world_steps_override`、`action_steps_override`，用它们截断 `timesteps` 长度。
+3. **world_steps = 0 的边界**：确保 `frame_st_id == 0` 时 `latent_cond` 仍注入（否则有条件生成会偏）。
+4. 先只用 **obs_change + state_delta** 做 ablation：`world_steps ∈ {0, 2, FULL}`，扫阈值看延迟/成功率 Pareto。
+
+---
+
+## 7. 评估指标建议
+
+- 延迟：每周期 P50/P90/P99；world NFE 分布。
+- 控制质量：成功率、碰撞率、动作抖动；按阶段（抓取前/中/放置）分桶。
+- 统计：world_steps=0 / 2 / full 的占比；漏判（关键周期被判为 0）与误判（非关键周期被判为 full）的比例。
diff --git a/markdown/ideas/ideas_training_free_wam_strong_0305.md b/markdown/ideas/ideas_training_free_wam_strong_0305.md
new file mode 100644
index 0000000..de37e71
--- /dev/null
+++ b/markdown/ideas/ideas_training_free_wam_strong_0305.md
@@ -0,0 +1,109 @@
+# WAM（World Action Model）推理加速：补充创新点（training-free）
+
+这份文件专门补充 **training-free 的 accelerate WAM** 点子（不写 MPC/规划/奖励那类“提高成功率为主”的内容）。主清单在 `ideas/ideas_0305.md`，这里给你一些 **更工程化但依然“WAM 特有”** 的加速补充，重点围绕：
+
+- **KV cache 的冷热分层/压缩/量化**
+- **world token 的 ROI 更新（只算“动的/因果相关”的部分）**
+- **跨周期 warm-start 与 early-exit**
+- **CPU↔GPU 搬运与算子并行重叠**
+
+---
+
+## 创新点 A：KV Cache 冷热分层 + 冷层量化（Hot/Cold KV Tiering + Quantized Cold Cache）
+
+- **WAM 特性**：闭环 episode 里 cache 会增长；action 侧会反复 attend 历史 world KV，但真正“有效”的往往是最近窗口 + 少量长期记忆。
+- **算法（training-free）**：
+  - 维护 **Hot KV**（最近 \(W\) 帧，高精度 bf16/fp16）与 **Cold KV**（更旧部分，量化存储 int8/fp8 或 cpu-pinned）。
+  - action 阶段默认只看 Hot + 少量 Cold summary（或按需解量化一小段）。
+  - 当触发事件（接触/遮挡/目标切换）才临时扩大可见 Cold 范围。
+- **落点**：`modules/model.py` 的 cache 存取/valid selection（写入时打 hot/cold tag；读取时拼接/解量化）。
+- **加速来源**：缩短 action 注意力的有效 K/V；减少显存占用与 cache 管理成本；offload 场景收益更大。
+- **风险/验证**：量化误差影响精细操作；用分阶段策略（精细阶段禁用 cold 量化或增大 Hot 窗口）。
+
+---
+
+## 创新点 B：Cache 的 “Delta Write” （无更新即不写，避免无效 cache 维护）
+
+- **WAM 特性**：很多周期里 world 其实未更新（或只更新很小一块），但 naive 实现仍会写入整段 KV，带来 cache 膨胀与带宽浪费。
+- **算法**：
+  - 当触发器判定本周期 world 不需要更新（或只更新 ROI），则 **禁止写入 pred cache**（或只写 ROI token 的 KV）。
+  - action 分支使用上次的 KV（或上次 + ROI 增量）。
+- **落点**：`wan_va_server.py` 调用 transformer 时的 `update_cache` 策略 + `modules/model.py` 中 cache 写入逻辑。
+- **加速来源**：减少 cache 写带宽、减少 cache 长度增长、减少后续注意力计算。
+- **风险/验证**：需要严格保证因果一致性；用日志记录每次“跳写”的比例与失败案例。
+
+---
+
+## 创新点 C：World 的 ROI Token Update（基于“运动/变化”更新 world token）
+
+- **WAM 特性**：控制里大量背景是静态的；真正重要的是 “手+物体+接触区域”。
+- **算法（training-free）**：
+  - 用廉价的变化检测（多视角 downsample diff / 光流近似 / mask 变化）得到 ROI。
+  - 将 ROI 映射到 latent patch/token，只有 ROI token 进入 world 去噪更新；其余 token 直接复用上一周期 world latent（以及 KV）。
+  - ROI token 的写入以 block 为粒度（利于 cache 管理）。
+- **落点**：`wan_va_server.py::_encode_obs_with_cache()` 产出 ROI mask；world 循环里按 mask 构建子序列（需要在数据打包/patch 逻辑处支持子集）。
+- **加速来源**：world token 数显著减少 → 注意力边数下降 → NFE 总成本下降。
+- **风险/验证**：ROI 映射误差；先用粗 block（如 8×8 patch）降低错杀风险。
+
+---
+
+## 创新点 D：Action Diffusion Warm-start（跨周期动作 latent 热启动）
+
+- **WAM 特性**：相邻控制周期最优动作通常连续；action 去噪从纯噪声开始浪费。
+- **算法**：
+  - 用上周期 action latent 的末态 \(a_{t}^{*}\) 作为本周期初值，加小噪声后只做少量去噪（或直接从中间 sigma 开始）。
+  - 结合你的 “动作维度冻结/早停”（`ideas_0305.md` 的创新点 7）进一步减少步数。
+- **落点**：`wan_va_server.py` action loop：维护 `prev_action_latent`，改变初始化与 timesteps 起点。
+- **加速来源**：减少 action NFE（尤其 action_steps 很高时）。
+- **风险/验证**：遇到突变场景会陷入局部；用 “变化检测/事件触发” 强制重启（random init）。
+
+---
+
+## 创新点 E：Scheduler Early-Exit（用收敛判据提前结束扩散）
+
+- **WAM 特性**：很多周期里去噪很快收敛，后续 steps 的增益小。
+- **算法（training-free）**：
+  - **监测信号（不额外 forward）**：在每次 `scheduler.step()` 你本来就有 `model_pred_t`（如 \(v_t\)/\(\epsilon_t\)）以及更新后的 latent \(x_{t+1}\)。用它们构造“相对变化量”：
+    - \(\Delta_v(t)=\dfrac{\|v_t-v_{t-1}\|_2}{\|v_t\|_2+\varepsilon}\)（或把 \(v\) 换成 \(\epsilon\)/\(\hat{x}_0\)）
+    - \(\Delta_x(t)=\dfrac{\|x_{t+1}-x_t\|_2}{\|x_t\|_2+\varepsilon}\)
+    - world 分支可只在 ROI/block 上算 \(\Delta_x\)（与“ROI token update”配合更稳）；action 分支在 action latent 维度上算即可。
+  - **抗抖/滞回**：对 \(\Delta_v,\Delta_x\) 做一个轻量 EMA（或滑窗均值）得到 \(\widehat{\Delta}(t)\)，并设置“连续命中”计数器 `hit`：
+    - 若 \(\widehat{\Delta}(t) < \tau(t)\)，则 `hit += 1`，否则 `hit = 0`。
+    - 当 `hit >= m` 且已完成最少步数 `t >= t_min`，触发 early-exit。
+  - **阈值怎么设（关键是归一化 + 随噪声阶段调度）**：
+    - 固定阈值起步：\(\tau(t)=\tau_0\)，例如 \(\tau_0\in[10^{-3},10^{-2}]\)（依赖归一化后量纲）。
+    - 更稳的做法：随噪声强度收紧/放宽，比如 \(\tau(t)=c\cdot \sigma_t\) 或 \(\tau(t)=c\cdot \sigma_t^2\)（早期 \(\sigma\) 大允许更大变化，后期更严格）。
+    - 分阶段：接触/精细阶段使用更小阈值、更大的 \(m\)（更保守），粗阶段相反。
+  - **防误退 guardrails**：
+    - 设 `t_min`（至少跑完前 \(p\%\) steps 才允许早停），避免一开始就误判“变化小”。
+    - 若触发“事件/突变”（例如观测 diff/接触检测/目标切换），本周期禁用 early-exit 或直接重启（参考 D 的“变化触发强制重启”）。
+    - 可加一个“最终质量兜底”检查：例如 early-exit 前再看一次 \(\Delta_x\) 是否也低于阈值（双条件）以降低误判。
+- **落点**：`wan_va_server.py` 的 video/action loop；或 `FlowMatchScheduler.step()` 外围做。
+- **加速来源**：直接减少 NFE。
+- **风险/验证**：阈值敏感；用离线 sweep 找 Pareto。
+
+---
+
+## 创新点 F：CPU↔GPU 搬运与算子重叠（Encode/Preprocess Overlap with CUDA Streams）
+
+- **WAM 特性**：offload + 多视角 preprocess 常让 CPU↔GPU 变成瓶颈，尤其 server 长跑时会出现尾延迟。
+- **算法**：
+  - 使用 pinned memory + 异步 H2D；VAE encode 放到独立 CUDA stream；与 transformer forward 重叠。
+  - 图像 resize/stack 在 CPU 侧也可并行（线程池）并提前准备下一周期数据。
+- **落点**：`wan_va_server.py::_encode_obs_with_cache()` 与 `_compute_kv_cache()`：把 preprocess/transfer/encode 拆分成可重叠阶段。
+- **加速来源**：减少 pipeline 泡沫、降低 P90/P99 时延。
+- **风险/验证**：实现复杂；用 Nsight/torch profiler 验证 overlap 是否生效。
+
+---
+
+## 创新点 G：自适应 Offload 策略（基于命中率/负载的 VAE 常驻调度）
+
+- **WAM 特性**：offload 带来搬运开销，但在 encode reuse 命中率高时 VAE 常驻 GPU 可能更划算。
+- **算法**：
+  - 在线统计：encode reuse 命中率、VAE encode 的占比、GPU 余量。
+  - 动态决定：VAE 是否常驻 GPU（或只常驻高频视角的编码路径）。
+- **落点**：`VA_Server.__init__` 与 `_encode_obs_with_cache()`：根据统计切换 `self.vae`/`self.streaming_vae` 的 device。
+- **加速来源**：避免“搬运主导”的坏工况，尤其在高帧率/多视角时。
+- **风险/验证**：频繁切换 device 反而更慢；需加入 hysteresis（滞回）。
+
+
diff --git a/markdown/ideas/ideas_value_aware_compute_0306.md b/markdown/ideas/ideas_value_aware_compute_0306.md
new file mode 100644
index 0000000..57e6063
--- /dev/null
+++ b/markdown/ideas/ideas_value_aware_compute_0306.md
@@ -0,0 +1,144 @@
+# WAM 推理加速：Value-Aware Compute 创新点（0306）
+
+单纯把 `num_inference_steps` 调小（如 10→3）难以作为创新点——审稿人会认为 diffusion/flow matching 减步数、蒸馏、一致性采样已有大量工作。
+
+真正能站住脚的角度：在 WAM 里，**world 和 action 不是两个独立生成任务**，而是一个闭环控制系统里**两种不同时间尺度、不同价值密度**的计算。创新应写成 **「如何把有限的 denoising budget 在 world/video 与 action 之间按控制价值动态分配」**，而不是「把 diffusion steps 调小」。
+
+当前实现（`wan_va_server.py`）已给出切入点：先 video loop 再 action loop，两个 budget 分开配置；可在此基础上做**预算分配策略、world/action 耦合、停止准则、cache 复用**。
+
+---
+
+## 1. Action-Conditioned Dynamic Budget
+
+**核心**：不是固定 `video=3, action=10`，而是每个 control cycle 动态决定  
+- 这一步要不要更新 world  
+- world 需要几步  
+- action 需要几步  
+
+预算由 **action relevance** 决定，而非仅看图像变化。
+
+**信号示例**：
+- 当前 action 的不确定性 / 多样性
+- action 对 world token 的注意力集中度
+- 预测动作与上一步动作的差异
+- 接触 / 抓取 / 放置等关键事件
+- world prediction 对 action 的边际增益
+
+**Novelty 表述**：  
+*"We allocate denoising budget according to the control value of world updates, rather than uniformly reducing diffusion steps."*
+
+与传统 diffusion 的区别：传统是样本质量导向；此处是**控制收益导向**；预算分配对象是**两个耦合分支**，不是单一生成器。
+
+---
+
+## 2. Multi-Rate World-Action Sampling
+
+**思想**：
+- action **高频**更新
+- world **低频**更新
+- 大部分周期复用旧 world cache
+- 仅在关键时刻刷新 video/world branch
+
+机器人控制里动作环通常比世界建模环更高频，设定自然。
+
+**Novelty 表述**：  
+*"Asynchronous denoising for embodied world-action models."*
+
+与一般 video diffusion 的区别：world 不是最终输出目标，而是 action 的上下文；可允许 world 低频、action 高频；本质是控制系统里的**多速率采样**。
+
+---
+
+## 3. Action-Guided ROI World Denoising
+
+**思想**：不要让 world branch 每次对整张 latent grid 同等强度更新，而是  
+- 重点更新与当前动作相关的区域（gripper 附近、被操作物体、容器口等）  
+- 背景区域复用 cache 或低频更新  
+
+**ROI 来源**：
+- wrist camera / end-effector pose
+- 上一步 action rollout 落点
+- action→world attention map
+- optical flow / obs difference
+
+**Novelty 表述**：  
+*"We reduce world denoising by exploiting the action-conditioned spatial sparsity of embodied interaction."*
+
+比通用 sparse diffusion 更合理：ROI 不是纯视觉，而是 **causal to action**。
+
+---
+
+## 4. Speculative Control with World Verification
+
+**流程**：
+1. 用极少 steps 快速生成 action proposal
+2. 用 1 小步 world update 预测该 action 的后果
+3. 若 world verification 通过 → 直接执行
+4. 若不通过 → 追加 refinement steps
+
+类比 LLM speculative decoding，但此处：proposal 是 action，verifier 是 world model，目标是 control success。
+
+**Novelty 表述**：  
+*"Fast action proposal, cheap world verification, selective refinement."*
+
+WAM-specific：利用模型同时具备 world 与 action 两个头这一结构。
+
+---
+
+## 5. 论文主线 Framing
+
+**主标题建议**（避免「fewer diffusion steps」）：  
+- **Value-Aware Compute Allocation for World-Action Models**  
+- 或 **Action-Centric Adaptive Sampling for World-Action Models**
+
+**主张**：
+- WAM 的 world/video 与 action 分支对控制的价值密度不同
+- 统一固定步数低效
+- 应根据动作相关性、阶段、置信度，把 compute budget 动态分配到最有用的 branch / token / timestep
+
+---
+
+## 6. 推荐组合方案
+
+落地时建议做成一个完整系统：
+
+- **动态 world/action 步数分配**
+- **world 低频、action 高频**
+- **action-guided ROI world update**
+- **world verification 触发 fallback refinement**
+
+**快路径**：不更新或少更新 world，小步数出 action，用局部 ROI 或 summary world 做验证。  
+**慢路径**：关键周期 full world refresh，增加 action refinement，重建可靠 world context。
+
+---
+
+## 7. 避免被说「只是工程 trick」
+
+需证明：
+
+1. **不是所有 step reduction 都一样**：相同 NFE 下，你的分配策略优于固定减步。
+2. **world/action 联合分配优于只压 video 或只压 action**：做 branch-level ablation。
+3. **改善来自 WAM 特有结构**：例如  
+   - action uncertainty 能预测 world refresh 必要性  
+   - ROI world tokens 与 action success 强相关  
+   - speculative verification 显著降低尾延迟而不掉成功率  
+
+**评测建议**：success rate、average NFE、P50/P90 latency、failure mode breakdown、不同任务阶段的 budget 分布。
+
+---
+
+## 8. 关于 env_step 占比
+
+若 `env_step_update` 占端到端时间很高（如 80%+），则：
+
+- 减少 inference steps 对**模型推理时间**有效
+- 对**端到端 wall-clock** 改善会被环境仿真掩盖
+
+论文中建议拆开报告：**model-only inference latency** 与 **end-to-end control latency**。
+
+---
+
+## 9. 一句话总结
+
+最有新意的方向不是「把 diffusion steps 调小」，而是：  
+**让 world 和 action 在闭环控制中按价值、按阶段、按空间区域、按验证结果动态消耗计算。**  
+这才是从「生成模型加速」走向「world-action model 专属推理机制」的差异点。
diff --git a/markdown/ideas/ideas_video_token_prune_0305.md b/markdown/ideas/ideas_video_token_prune_0305.md
new file mode 100644
index 0000000..29536a0
--- /dev/null
+++ b/markdown/ideas/ideas_video_token_prune_0305.md
@@ -0,0 +1,127 @@
+# Video / World Token Prune 创新点（training-free）
+
+目标：对 **生成的 video/world token 长度** 做裁剪或压缩，降低 attention 计算与 cache 成本，且不改变训练。最稳的做法是先做在 **KV/注意力侧**（等价于 token prune），再考虑真正缩短前向序列。
+
+---
+
+## 设计原则
+
+- **优先做“谁 attend 谁”的裁剪**：world token 仍存在，但在后续 attention 里只让一部分 token 参与（缩短有效 K/V），不改输入 token 形状与 mesh_id，工程风险小。
+- **再考虑真正缩短序列**：merge/evict 等需动 cache 写入与打包逻辑，适合在 KV prune 验证后再做。
+
+---
+
+## 创新点 1：Action-conditioned KV Token Prune（最推荐、可行性最高）
+
+**目标**：action 分支 attend 的 world KV 从“全量视频 token”变为“Top-M 因果相关 token”。
+
+**重要性打分（training-free）**：
+
+- 用 **action token → world token 的 attention 权重**（最后 1~2 层即可）：
+  - \(\mathrm{imp}(w_i) = \sum_{a \in \text{action tokens}} \mathrm{Attn}(a \to w_i)\)
+- 可在多 step/多层做 EMA 平滑，减少抖动。
+
+**Prune 规则**：
+
+- 保留 **Top-M** world tokens（或按 block 保留 Top-B blocks）。
+- **强制保留**：wrist/手附近 ROI、最近帧 token、变化显著区域（见创新点 3）。
+
+**落点**：
+
+- 在 transformer 的 attention **读 cache 时**做 K/V 的 gather（只给注意力一个缩短的 K/V），不改 latent / mesh_id 生成。
+
+**加速来源**：显著降低 **action 阶段** attention 的 K/V 长度；长 episode 时延与 cache 增长更可控。
+
+**创新性**：WAM 专属“因果裁剪”——world 为 action 服务，只保留对动作预测重要的 token。
+
+**风险/验证**：M 过小可能丢关键几何；可先做 M 的 sweep，并记录被裁掉的 token 空间/时间分布。
+
+---
+
+## 创新点 2：Temporal Token Eviction + Summary Tokens
+
+**目标**：把“很久以前的 video tokens”从全量保留改为少量“世界摘要 token”，使有效 token 长度稳定在上限。
+
+**做法**：
+
+- **Hot**：最近 \(W\) 帧 world tokens 全保留。
+- **Cold**：更早的帧压缩成 \(K\) 个 summary tokens（mean pooling / attention pooling / PCA 低秩等，均 training-free）。
+- action 分支默认只 attend：**summary + 最近 \(W\) 帧**。
+
+**落点**：cache 的写入/淘汰策略；summary 作为额外 KV 写入，不改变主前向的 token 形状定义。
+
+**加速来源**：控制 token 长度不随 episode 增长爆炸；降低长序列时延与显存。
+
+**创新性**：把“世界记忆”做成可控的长期摘要，而非无限增长的 KV。
+
+**风险/验证**：摘要丢失细粒度操作信息；可在接触/精细阶段关闭压缩或增大 \(K\)。
+
+---
+
+## 创新点 3：Change-driven Spatial ROI Prune
+
+**目标**：只保留“动的/相关的”空间区域对应 token，其余视为静态背景。
+
+**ROI 信号（training-free）**：
+
+- 多视角下采样 diff、近似光流、分割 mask 变化。
+- 末端速度大时扩大 ROI；静止时缩小 ROI。
+
+**工程要点**（当前 world latent 为多相机在宽度维拼接）：
+
+- 每个相机 ROI 映射到其对应的 **latent 水平区间**。
+- 先按**粗 block**（如 8×8 latent patch）做，鲁棒且易实现。
+
+**落点**：
+
+- 建议先做“ROI 决定 KV 可见性”（即 KV prune），不立刻做“减少前向 token 数”。
+
+**加速来源**：action attention 与长序列 cache 均受益；ROI 也可作为动态 step budget 的触发器。
+
+**创新性**：利用 WAM 多视角结构，把视觉变化直接映射到 token 级计算裁剪。
+
+---
+
+## 创新点 4：ToMe / Token Merging on World Tokens（真缩短序列，研究向）
+
+**目标**：在 world 分支去噪过程中，将相似/低重要性 token **合并**，真实减少序列长度。
+
+**做法（training-free）**：
+
+- 每隔若干步，对 world tokens 做相似度（cosine / L2），将最相似的若干对 **merge**（如加权平均）。
+- 保留映射表，必要时可“反投影”回原网格（通常不完全可逆）。
+
+**难点**：
+
+- 需同步处理 **mesh_id**、token 打包、cache slot 对应关系。
+- 对扩散去噪稳定性有影响（尤其 early steps）。
+
+**建议**：作为论文亮点合适；工程上建议在 **KV prune（创新点 1/2/3）** 验证后再上。
+
+---
+
+## 落地顺序建议
+
+| 顺序 | 方案 | 说明 |
+|------|------|------|
+| 1 | Action-conditioned KV prune（创新点 1） | 改动集中、不破坏形状，对“video token 长度带来的注意力成本”最直接 |
+| 2 | Temporal eviction + summary（创新点 2） | 控制长 episode 的 token 上限，与 1 可组合 |
+| 3 | Change-driven ROI（创新点 3） | 先用于 KV prune / 触发器，再考虑真减 token |
+| 4 | ToMe / token merging（创新点 4） | 真剪序列长度，研究型，放在最后 |
+
+---
+
+## 落点与依赖小结
+
+- **创新点 1**：transformer 内 attention 读 cache 时的 K/V 索引/gather；需暴露 1~2 层 action→world attention 权重（或等价重要性）。
+- **创新点 2**：cache 管理（写入、淘汰、summary 生成与拼接）。
+- **创新点 3**：`_encode_obs` 外围的 ROI 映射 + 同上 cache/attention 的可见性。
+- **创新点 4**：world 数据打包、patch、mesh_id 与 cache slot 一致性。
+
+---
+
+## 评估指标建议
+
+- 有效 world token 数 / K-V 长度分布；attention 计算量（FLOPs 或等价）。
+- 控制质量：成功率、碰撞率、阶段失败分布；与“不 prune”的 A/B。
+- 延迟与显存：P50/P90 时延；cache 峰值；长 episode 的时延增长曲线。
diff --git a/markdown/ideas/vla_roi_tokens_methods.md b/markdown/ideas/vla_roi_tokens_methods.md
new file mode 100644
index 0000000..8d1bb6c
--- /dev/null
+++ b/markdown/ideas/vla_roi_tokens_methods.md
@@ -0,0 +1,135 @@
+# VLA 中 ROI Tokens 的计算方法与创新点备忘
+
+这里的 **ROI tokens** 指：从视觉 token 序列（patch tokens / video tokens / latent tokens / blocks）中挑出一小部分“更值得算/更该被模型关注”的子集，用于 **token pruning / 动态计算 / cache 选择 / ROI 更新** 等目的。目标通常是：在几乎不掉成功率的前提下，减少注意力边数、减少 NFE、或减少跨模态对齐成本。
+
+---
+
+## 1) ROI token 的常见定义（你到底想“保留”什么）
+
+- **Action-relevant ROI**：与下一步动作决策最相关（手/物体/接触点/目标区域）。
+- **Dynamics ROI**：随时间变化显著（运动、形变、遮挡变化），背景静态区域不重要。
+- **Uncertainty / Hard ROI**：模型最不确定/最难预测的区域（需要更多计算）。
+- **Task-conditioned ROI**：由语言指令/任务状态决定（“把红杯子拿起来” → 红杯区域优先）。
+
+实践里往往是 **多信号融合**，并用时间一致性减少抖动。
+
+---
+
+## 2) Training-free（不训练新模块）ROI 计算
+
+### 2.1 运动/变化驱动（最稳的第一选择）
+
+- **帧差 / 低分辨率 diff**：在像素或 VAE latent 上做 `|I_t - I_{t-1}|` 或 `|z_t - z_{t-1}|`，再映射回 patch/token。
+  - **优点**：便宜、实现快、对“静态背景+局部运动”的场景很有效。
+  - **缺点**：相机抖动/光照变化会误报；动作相关但静止的目标会漏掉。
+- **光流/特征流（近似）**：用轻量 flow 或用 backbone 特征做相关性匹配，得到运动 mask。
+- **时序一致性 + 滞回**：对 ROI mask 做 dilate/erode，或对 token 重要性做 EMA，并设置最小保持时长，减少“闪烁式 ROI”。
+
+### 2.2 模型内部注意力信号（不额外 forward）
+
+适用于 Transformer/VLM/VLA：把“模型已经算出来的东西”拿来当重要性。
+
+- **Cross-attention mass**（语言/动作 query → 视觉 token 的注意力权重）：
+  - 计算每个视觉 token 被关注的总量 \(s_i=\sum_{q\in Q} \mathrm{Attn}(q\rightarrow i)\)，Top-K 作为 ROI。
+  - 变体：只统计与动词/名词相关的语言 token；或只统计 action head 的 query。
+- **Attention rollout / last-layer attention**：
+  - 使用最后几层、或 rollout 得到更“语义化”的重要性。
+- **KV cache usage proxy**：
+  - 统计某些 token 在注意力里被访问的频率/平均权重，作为长期重要性。
+
+注意：纯注意力权重可能被“分布形状/温度”影响，建议做归一化和跨 step 平滑（EMA）。
+
+### 2.3 预测误差/残差驱动（diffusion / world model 特别常见）
+
+- **去噪残差/更新幅度**：
+  - 例如对 latent patch 的 \(|\Delta x|\) 或 \(|\Delta \epsilon|\) 做聚合，大的区域当 ROI（“哪里还没收敛就多算哪里”）。
+- **重建误差/一致性误差**：
+  - 用轻量 decoder 或特征一致性评估，误差大 → ROI。
+- **不确定性 proxy**：
+  - 多次 dropout / 两个头的分歧 / 方差估计：分歧大 → ROI（计算更贵，但更稳）。
+
+### 2.4 几何/先验驱动（机器人常用）
+
+- **手-眼先验**：
+  - 已知末端执行器投影/深度 → 以 gripper 投影点为中心取 ROI（环形/高斯窗）。
+- **目标框/检测/分割**（即使需要训练外部小模型，对主模型而言仍算 training-free）：
+  - 用现成 detector/segmenter 产生 mask → token 选择。
+- **接触/力觉事件触发 ROI 扩张**：
+  - 事件发生时扩大 ROI（避免错杀关键细节）。
+
+---
+
+## 3) 需要少量训练/可学习的 ROI 计算（质量更高，但要数据/训练）
+
+### 3.1 可学习的 Token Selector / Router（动态 Top-K）
+
+- **Gating network**：输入视觉 token（可加语言/动作条件）输出每个 token 的 keep score。
+  - 训练信号：行为克隆 loss、成功率、或 distillation（保持与全算模型输出一致）。
+- **Block-level routing**：
+  - 先选 block 再选 token（两级稀疏），更适合工程落地与 cache 管理。
+- **Budget-aware / compute-aware 训练**：
+  - 把 FLOPs/NFE 当约束：在固定预算下最大化任务指标。
+
+### 3.2 Action-conditioned ROI（VLA 特有的“更贴任务”方式）
+
+- **预测“动作敏感区域”**：
+  - 让 selector 直接预测“哪些视觉 token 会影响下一步 action distribution”。
+  - 可用梯度近似监督：\(\|\partial \pi(a|s)/\partial x_i\|\) 大的 token 更重要（实践可用蒸馏近似，避免真梯度开销）。
+- **Counterfactual masking**：
+  - 训练时随机 mask 一部分 token，看 action 变化大不大；变化大 → 该 token 重要（可用于训练 selector）。
+
+### 3.3 任务/语言对齐式 ROI（指令驱动）
+
+- **Phrase grounding**：
+  - 把指令中的实体/属性对齐到视觉区域，ROI = 被 grounding 的区域 + 邻域。
+- **Query-former / region queries**：
+  - 用少量 learnable queries 从视觉 token 中抽取“可控数量”的 region features，本质是软 ROI 压缩。
+
+---
+
+## 4) 组合策略（工业界常见的“稳 + 快”）
+
+ROI 质量通常来自 **多信号融合 + 时序稳定化**：
+
+- **融合打分**（例）：
+  - \(s_i = w_m s^{motion}_i + w_a s^{attn}_i + w_u s^{uncert}_i + w_p s^{prior}_i\)
+  - 再做 Top-K / Top-blocks。
+- **跨 step EMA 稳定化**：
+  - \(\hat{s}_t = \alpha \hat{s}_{t-1} + (1-\alpha)s_t\)，用 \(\hat{s}_t\) 选择，减少 ROI 抖动。
+- **hysteresis（滞回）**：
+  - 进入 ROI 用高阈值，退出 ROI 用低阈值；或设置最小驻留步数。
+- **Multi-res ROI**：
+  - 先粗分辨率找 ROI block，再在 block 内细选 token；可显著降低 selector 成本。
+
+---
+
+## 5) 可写成“创新点”的方向（更像论文/专利的表述）
+
+- **创新点 1：Action-conditioned + Dynamics 双通路 ROI**
+  - 一路用动作条件（cross-attn / action head），一路用变化检测（\(|\Delta x|\)/motion），再做可学习融合或规则融合。
+  - 亮点：兼顾“动作相关但静止”和“变化但不相关”的两类误差。
+
+- **创新点 2：ROI 的时序一致性约束（减少 flicker 的稳定化机制）**
+  - EMA + 滞回 + 最小驻留步数 + 事件触发扩张（接触/遮挡/目标切换）。
+  - 亮点：把“选择稳定性”当成显式目标，提升闭环控制鲁棒性。
+
+- **创新点 3：Uncertainty-driven 计算再分配（把算力花在难点上）**
+  - 用模型分歧/残差预测来扩张 ROI；easy 区域早停或降精度。
+  - 亮点：与 diffusion/world model 的收敛特性天然匹配。
+
+- **创新点 4：Block-sparse ROI + Cache-aware 选择**
+  - ROI 不只决定 forward 计算，还决定 KV 写入/保留/量化策略（hot/cold 分层、delta write）。
+  - 亮点：把“token 选择”与“系统级瓶颈（带宽/显存）”统一优化。
+
+- **创新点 5：Counterfactual token importance 的轻量蒸馏**
+  - 用训练时的 token masking 估计因果重要性，蒸馏给一个便宜 selector（推理时近似反事实）。
+  - 亮点：比纯注意力权重更接近“对 action 的因果贡献”。
+
+---
+
+## 6) 验证与指标（避免只看速度不看闭环）
+
+- **速度**：token 数/注意力边数、wall-clock、显存峰值、NFE、P90/P99。
+- **质量**：成功率、轨迹偏差、接触稳定性、失败类型（漏 ROI / 误 ROI / 抖动）。
+- **稳定性**：ROI 集合变化率（churn）、mask 闪烁频率、平均驻留步数。
+
diff --git a/markdown/ideas/world-action-model-acceleration_0301.md b/markdown/ideas/world-action-model-acceleration_0301.md
new file mode 100644
index 0000000..2718909
--- /dev/null
+++ b/markdown/ideas/world-action-model-acceleration_0301.md
@@ -0,0 +1,43 @@
+# Acceleration Ideas for World-Action Models (WAM)
+
+In this project, inference cost is dominated by:
+- **Video token count** (often much larger than action tokens; e.g., Robotwin is ~30:1).
+- **Video diffusion steps** (many backbone calls per chunk).
+
+Below are algorithm-level acceleration directions, especially for optimizing **video latent** inference.
+
+## A) Fewer steps (distillation / consistency / rectified flow)
+- Distill the video branch from many steps (e.g., 25) to **1–4 steps** using teacher trajectories.
+- `FlowMatchScheduler.step()` is Euler-like integration, suitable for progressive distillation / consistency training.
+- Keep action head unchanged initially; aggressively compress **video steps** first.
+
+## B) Fewer video tokens (structural latent compression)
+1. **Low-res latent diffusion + latent super-resolution**
+   - Diffuse on smaller `(H', W')` latent grids (token count \(\propto\) area), then use a lightweight decoder/upsampler to recover full latent size.
+2. **Learned bottleneck tokens (per-frame K summary tokens)**
+   - Encode each frame latent grid into **K \(\ll\) H×W** tokens; diffuse only these tokens; decode back only if needed.
+3. **Camera/ROI-aware compression**
+   - Keep high-res tokens only for critical views/regions (e.g., wrist / end-effector neighborhood), downsample the rest.
+
+## C) Fewer tokens without changing output (dynamic token selection)
+- Update only a subset of video tokens per step; reuse previous values for the rest.
+- Token importance can be estimated by motion/changes, end-effector proximity, or early-step attention statistics.
+- Combine with `flex_attention` masks to realize sparse compute beyond fixed windows.
+
+## D) Fewer backbone calls (share computation between video and action)
+- Instead of running separate diffusion loops for video then action, predict actions from shared hidden states during the video loop.
+- Alternatively reduce action diffusion to 1–2 steps, or switch action to deterministic regression + uncertainty head.
+
+## E) More reuse across chunks (beyond KV cache)
+- Extend from KV cache to **state/token cache**: cache a compressed world-state representation across chunks and only update the increment.
+- Use a **keyframe strategy**: refresh video latent features at low frequency; run action at high frequency; periodic correction.
+
+## F) Fast-path inference (no explicit video generation)
+- If deployment only needs control success (not visualization), do not generate full video latents at inference time.
+- Train with video loss to maintain representation quality, but deploy a fast-path that outputs only compact features needed for action.
+
+## Recommended MVP routes
+- **Route 1 (most reliable):** video step distillation (25 → 4 → 2 → 1).
+- **Route 2 (token bottleneck):** latent token compression (low-res or K-summary) + lightweight decoding.
+- **Route 3 (research-y):** dynamic sparse attention + keyframe refresh policy.
+
diff --git a/pyproject.toml b/pyproject.toml
index 9236cbc..710dddc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,16 +12,15 @@ authors = [
 license = { file = "LICENSE.txt" }
 readme = "README.md"
 requires-python = ">=3.10,<4.0"
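+# torch/torchvision/flash_attn are intentionally not declared here, so that `pip install -e .` does not pull PyPI builds over an already-installed cu124 build.
+# Install PyTorch (cu124) separately first (see README or script/setup_cu124_mirror.md), then either `pip install -e . --no-deps` or run directly from the source tree.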
 dependencies = [
-    "torch>=2.9.0",
-    "torchvision>=0.24.0",
     "diffusers>=0.36.0",
     "transformers>=4.55.4",
     "tokenizers>=0.21.4",
     "tqdm",
     "imageio",
     "easydict",
-    "flash_attn",
     "numpy>=1.26.4,<2"
 ]
 
@@ -44,10 +43,10 @@ modelscope = "https://github.com/Robbyant"
 discord = "https://github.com/Robbyant"
 
 [tool.setuptools]
-packages = ["lingbot_va"]
+packages = ["wan_va"]
 
 [tool.setuptools.package-data]
-"lingbot_va" = ["**/*.py"]
+"wan_va" = ["**/*.py"]
 
 [tool.black]
 line-length = 88
diff --git a/run_inference.sh b/run_inference.sh
new file mode 100644
index 0000000..1c7c18c
--- /dev/null
+++ b/run_inference.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# LingBot-VA 一键推理（自动激活 conda 环境 + 设置模型路径）
+# 用法: bash run_inference.sh  或  bash script/
+set -e
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-4}"
+
+CONDA_BASE=""
+if [[ -n "$CONDA_EXE" ]]; then
+  CONDA_BASE="${CONDA_EXE%/bin/conda}"
+fi
+if [[ -z "$CONDA_BASE" ]]; then
+  CONDA_BASE="$(conda info --base 2>/dev/null)" || true
+fi
+if [[ -n "$CONDA_BASE" && -f "$CONDA_BASE/etc/profile.d/conda.sh" ]]; then
+  source "$CONDA_BASE/etc/profile.d/conda.sh"
+else
+  echo "未检测到 conda，请先安装 Miniconda/Anaconda 或在本机已激活 lingbot-va 的终端中直接执行:"
+  echo "  export LINGBOT_VA_MODEL_PATH=$REPO_ROOT/lingbot-va-base"
+  echo "  bash script/run_i2va_single_gpu.sh"
+  exit 1
+fi
+
+echo ">>> 激活环境: lingbot-va"
+conda activate lingbot-va
+
+echo ">>> 模型路径: $REPO_ROOT/lingbot-va-base"
+export LINGBOT_VA_MODEL_PATH="${LINGBOT_VA_MODEL_PATH:-$REPO_ROOT/lingbot-va-base}"
+
+cd "$REPO_ROOT"
+bash script/run_i2va_single_gpu.sh "$@"
diff --git a/script/download/download_bench_robotwin.sh b/script/download/download_bench_robotwin.sh
new file mode 100644
index 0000000..6ea1613
--- /dev/null
+++ b/script/download/download_bench_robotwin.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/bash
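+# Download the RoboTwin 2.0 evaluation environment and assets (used by the eval bench).
+# Run from the repository root, using the full script path: bash script/download/download_bench_robotwin.sh
+#
+# Note: evaluation does not run over dataset files; it runs the 50 tasks inside the RoboTwin simulator.
+#       Clone the RoboTwin repository and download its assets first, then start server + client as described in the README.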
+
+set -e
+
+# 测评 bench 根目录（RoboTwin 克隆位置）
+BENCH_ROOT="${BENCH_ROOT:-/mnt/users/wangyuxuan-20250915/EAI/RoboTwin}"
+ROBOTWIN_COMMIT="${ROBOTWIN_COMMIT:-2eeec322}"
+
+echo "BENCH_ROOT (RoboTwin 克隆目录): $BENCH_ROOT"
+echo "RoboTwin commit: $ROBOTWIN_COMMIT"
+echo ""
+
+# 1. Clone RoboTwin and check out the pinned commit
+if [ ! -d "$BENCH_ROOT" ]; then
+    echo ">>> 克隆 RoboTwin 到 $BENCH_ROOT"
+    git clone https://github.com/RoboTwin-Platform/RoboTwin.git "$BENCH_ROOT"
+    cd "$BENCH_ROOT"
+    git checkout "$ROBOTWIN_COMMIT"
+    cd - > /dev/null
+else
+    echo ">>> 目录已存在: $BENCH_ROOT，跳过 clone；如需重装请先删掉该目录"
+    cd "$BENCH_ROOT"
+    git fetch origin 2>/dev/null || true
+    git checkout "$ROBOTWIN_COMMIT" 2>/dev/null || echo "Warning: could not check out $ROBOTWIN_COMMIT in $BENCH_ROOT; continuing with the currently checked-out revision" >&2
+    cd - > /dev/null
+fi
+
+# 2. 下载 assets（测评必需）
+echo ""
+echo ">>> 下载 RoboTwin assets（测评场景与资源）"
+cd "$BENCH_ROOT"
+if [ -f "script/_download_assets.sh" ]; then
+    if ! bash script/_download_assets.sh; then
+        echo ""
+        echo "官方脚本下载失败（多为 HuggingFace 连接中断）。请用带重试与镜像的脚本："
+        echo "  cd $(dirname "$0")/.."
+        echo "  HF_ENDPOINT=https://hf-mirror.com python script/download_robotwin_assets.py $BENCH_ROOT"
+        exit 1
+    fi
+else
+    echo "未找到 script/_download_assets.sh，请先完成 RoboTwin 安装（见 README 步骤 2–4）。"
+    exit 1
+fi
+cd - > /dev/null
+
+echo ""
+echo "测评 bench 下载完成."
+echo "RoboTwin 路径: $BENCH_ROOT"
+echo ""
+echo "后续步骤（需在 RoboTwin 文档中完成安装后再做）："
+echo "  1) 安装依赖: sudo apt install libvulkan1 mesa-vulkan-drivers vulkan-tools"
+echo "  2) 按 README 修改 script/requirements.txt 和 script/_install.sh 后执行: bash script/_install.sh"
+echo "  3) 测评时设置 RoboTwin 路径并启动 client:"
+echo "     export ROBOTWIN_ROOT=$BENCH_ROOT"
+echo "     # 在 lingbot-va 仓库根目录执行: bash evaluation/robotwin/launch_client.sh <save_root> <task_name>"
+echo "  4) 先启动 LingBot-VA server，再在同上目录运行 launch_client.sh 进行测评"
diff --git a/script/download/download_dataset.py b/script/download/download_dataset.py
new file mode 100644
index 0000000..cc4732d
--- /dev/null
+++ b/script/download/download_dataset.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+"""
+从 HuggingFace 下载 LingBot-VA post-training 数据集 (robotwin-clean-and-aug-lerobot)。
+README 中该数据集仅在 HuggingFace 提供，ModelScope 无此数据集。
+
+用法:
+  python script/download/download_dataset.py
+  python script/download/download_dataset.py /path/to/save
+"""
+import os
+import sys
+
+def main():
+    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    default_dir = os.path.join(repo_root, "robotwin-clean-and-aug-lerobot")
+    local_dir = os.path.abspath(sys.argv[1]) if len(sys.argv) > 1 else default_dir
+
+    try:
+        from huggingface_hub import snapshot_download
+    except ImportError:
+        print("请先安装: pip install huggingface_hub")
+        sys.exit(1)
+
+    repo_id = "robbyant/robotwin-clean-and-aug-lerobot"
+    print(f"正在从 HuggingFace 下载数据集: {repo_id}")
+    print(f"保存到: {local_dir}")
+    os.makedirs(local_dir, exist_ok=True)
+
+    try:
+        path = snapshot_download(
+            repo_id=repo_id,
+            repo_type="dataset",
+            local_dir=local_dir,
+        )
+    except Exception as e:
+        print(f"下载失败: {e}")
+        sys.exit(1)
+
+    print(f"\n下载完成: {path}")
+    print("训练时设置数据集路径:")
+    print(f"  export LINGBOT_VA_DATASET_PATH={path}")
+
+if __name__ == "__main__":
+    main()
diff --git a/script/download/download_modelscope.py b/script/download/download_modelscope.py
new file mode 100644
index 0000000..2b03082
--- /dev/null
+++ b/script/download/download_modelscope.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+"""
+从 ModelScope 下载 LingBot-VA 模型 (lingbot-va-base)。
+用法:
+  python script/download/download_modelscope.py
+  python script/download/download_modelscope.py /path/to/save
+"""
+import os
+import sys
+
+def main():
+    # 默认保存到仓库根目录下的 lingbot-va-base
+    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    default_dir = os.path.join(repo_root, "lingbot-va-base")
+    local_dir = os.path.abspath(sys.argv[1]) if len(sys.argv) > 1 else default_dir
+
+    try:
+        from modelscope import snapshot_download
+    except ImportError:
+        print("请先安装 modelscope: pip install modelscope")
+        sys.exit(1)
+
+    model_id = "Robbyant/lingbot-va-base"
+    print(f"正在从 ModelScope 下载: {model_id}")
+    print(f"保存到: {local_dir}")
+    os.makedirs(local_dir, exist_ok=True)
+
+    try:
+        path = snapshot_download(model_id, local_dir=local_dir)
+    except Exception as e:
+        print(f"下载失败: {e}")
+        sys.exit(1)
+
+    print(f"\n下载完成: {path}")
+    print("推理前请设置并修改 transformer 的 attn_mode:")
+    print(f"  export LINGBOT_VA_MODEL_PATH={path}")
+    print("  # 编辑 {}/transformer/config.json 将 attn_mode 改为 \"torch\" 或 \"flashattn\"".format(path))
+
+if __name__ == "__main__":
+    main()
diff --git a/script/download/download_posttrain_robotwin.py b/script/download/download_posttrain_robotwin.py
new file mode 100644
index 0000000..4f38681
--- /dev/null
+++ b/script/download/download_posttrain_robotwin.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""
+从 HuggingFace 或 ModelScope 下载 LingBot-VA RoboTwin 后训权重 (lingbot-va-posttrain-robotwin)。
+用于 RoboTwin 2.0 评测时达到论文报告成功率，需先下载再设置 LINGBOT_VA_MODEL_PATH。
+
+用法:
+  python script/download/download_posttrain_robotwin.py
+  python script/download/download_posttrain_robotwin.py /path/to/save
+  HF_ENDPOINT=https://hf-mirror.com python script/download/download_posttrain_robotwin.py  # 国内镜像
+
+下载完成后评测前设置:
+  export LINGBOT_VA_MODEL_PATH=/path/to/lingbot-va-posttrain-robotwin
+  bash script/run_eval_robotwin.sh
+"""
+import os
+import sys
+
+REPO_ID_HF = "robbyant/lingbot-va-posttrain-robotwin"
+REPO_ID_MS = "Robbyant/lingbot-va-posttrain-robotwin"
+
+
+def main():
+    """Download the RoboTwin post-trained weights to ``sys.argv[1]`` or ``<repo root>/lingbot-va-posttrain-robotwin``; uses ModelScope when USE_MODELSCOPE is 1/true/yes, else HuggingFace."""
+    repo_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # file is in script/download/: three levels up
+    default_dir = os.path.join(repo_root, "lingbot-va-posttrain-robotwin")
+    local_dir = os.path.abspath(sys.argv[1]) if len(sys.argv) > 1 else default_dir
+
+    use_modelscope = os.environ.get("USE_MODELSCOPE", "").lower() in ("1", "true", "yes")
+
+    if use_modelscope:
+        try:
+            from modelscope import snapshot_download
+        except ImportError:
+            print("请先安装: pip install modelscope")
+            sys.exit(1)
+        print(f"正在从 ModelScope 下载: {REPO_ID_MS}")
+        print(f"保存到: {local_dir}")
+        os.makedirs(local_dir, exist_ok=True)
+        try:
+            path = snapshot_download(REPO_ID_MS, local_dir=local_dir)
+        except Exception as e:
+            print(f"下载失败: {e}")
+            sys.exit(1)
+    else:
+        try:
+            from huggingface_hub import snapshot_download
+        except ImportError:
+            print("请先安装: pip install huggingface_hub")
+            sys.exit(1)
+        print(f"正在从 HuggingFace 下载: {REPO_ID_HF}")
+        print(f"保存到: {local_dir}")
+        os.makedirs(local_dir, exist_ok=True)
+        try:
+            path = snapshot_download(
+                repo_id=REPO_ID_HF,
+                local_dir=local_dir,
+            )
+        except Exception as e:
+            print(f"下载失败: {e}")
+            print("  国内用户可试: HF_ENDPOINT=https://hf-mirror.com python script/download/download_posttrain_robotwin.py")
+            sys.exit(1)
+
+    print(f"\n下载完成: {path}")
+    print("RoboTwin 评测前设置:")
+    print(f"  export LINGBOT_VA_MODEL_PATH={path}")
+    print("  或在运行脚本前: LINGBOT_VA_MODEL_PATH=%s bash script/run_eval_robotwin.sh" % path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/script/download/download_robotwin2.sh b/script/download/download_robotwin2.sh
new file mode 100755
index 0000000..052fe77
--- /dev/null
+++ b/script/download/download_robotwin2.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+# RoboTwin 2.0 一键下载脚本
+# 功能：克隆 RoboTwin 仓库（若不存在）+ 下载并解压 **2.0 专用** assets，并写入资源路径配置。
+#
+# 与 1.0 区分：默认安装到 RoboTwin-2.0，避免和已有 RoboTwin/1.0 共用同一 assets 目录造成混用或覆盖。
+# 若需同时保留 1.0：1.0 用例如 EAI/RoboTwin 或 EAI/RoboTwin-1.0，2.0 用本脚本默认 EAI/RoboTwin-2.0。
+#
+# 用法（任选其一）：
+#   # 从 lingbot-va 仓库根目录执行（推荐）
+#   bash script/download/download_robotwin2.sh
+#   bash script/download/download_robotwin2.sh /path/to/RoboTwin-2.0
+#
+#   # 国内镜像（HuggingFace 不稳定时）
+#   HF_ENDPOINT=https://hf-mirror.com bash script/download/download_robotwin2.sh
+#   BENCH_ROOT=/path/to/RoboTwin-2.0 bash script/download/download_robotwin2.sh
+#
+# 环境变量：
+#   BENCH_ROOT      RoboTwin 2.0 安装目录，默认: .../EAI/RoboTwin-2.0（与 1.0 分目录）
+#   ROBOTWIN_COMMIT 使用的 git 提交，默认: 2eeec322（与官方测评一致）
+#   HF_ENDPOINT     可选，国内可设 https://hf-mirror.com 加速 assets 下载
+
+set -e
+
+# 脚本所在目录 -> lingbot-va 仓库根
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+BENCH_ROOT="${1:-${BENCH_ROOT:-/mnt/users/wangyuxuan-20250915/EAI/RoboTwin-2.0}}"
+ROBOTWIN_COMMIT="${ROBOTWIN_COMMIT:-2eeec322}"
+# 转为绝对路径（若目录已存在）
+if [ -d "$BENCH_ROOT" ]; then
+  BENCH_ROOT="$(cd "$BENCH_ROOT" && pwd)"
+elif [ -d "$(dirname "$BENCH_ROOT")" ]; then
+  true
+else
+  mkdir -p "$(dirname "$BENCH_ROOT")"
+fi
+
+echo "============= RoboTwin 2.0 下载 =============
+  lingbot-va 仓库: $REPO_ROOT
+  RoboTwin 目录:  $BENCH_ROOT
+  git commit:     $ROBOTWIN_COMMIT
+  HF_ENDPOINT:    ${HF_ENDPOINT:-（未设置）}
+================================================"
+
+# 1. 克隆 RoboTwin 仓库（若不存在）
+if [ ! -d "$BENCH_ROOT" ] || [ ! -f "$BENCH_ROOT/script/_download_assets.sh" ]; then
+  if [ ! -d "$BENCH_ROOT" ]; then
+    echo ">>> 克隆 RoboTwin 到: $BENCH_ROOT"
+    mkdir -p "$(dirname "$BENCH_ROOT")"
+    git clone https://github.com/RoboTwin-Platform/RoboTwin.git "$BENCH_ROOT"
+  fi
+  cd "$BENCH_ROOT"
+  git fetch origin 2>/dev/null || true
+  git checkout "$ROBOTWIN_COMMIT" 2>/dev/null || true
+  cd - > /dev/null
+  echo ">>> 仓库就绪: $BENCH_ROOT"
+else
+  echo ">>> 已存在 RoboTwin 目录，跳过 clone；如需指定 commit 可设 ROBOTWIN_COMMIT 后重新运行"
+  cd "$BENCH_ROOT"
+  git fetch origin 2>/dev/null || true
+  git checkout "$ROBOTWIN_COMMIT" 2>/dev/null || true
+  cd - > /dev/null
+fi
+# 确保之后使用绝对路径
+BENCH_ROOT="$(cd "$BENCH_ROOT" && pwd)"
+
+# 2. 下载并解压 assets（使用带重试与镜像的 Python 脚本）
+echo ""
+echo ">>> 下载 RoboTwin 2.0 assets（HuggingFace: TianxingChen/RoboTwin2.0）..."
+if [ ! -f "$REPO_ROOT/script/download/download_robotwin_assets.py" ]; then
+  echo "Error: 未找到 $REPO_ROOT/script/download/download_robotwin_assets.py，请从 lingbot-va 仓库根目录执行本脚本。"
+  exit 1
+fi
+
+cd "$REPO_ROOT"
+python script/download/download_robotwin_assets.py "$BENCH_ROOT"
+
+echo ""
+echo "============= 下载完成 =============
+  RoboTwin 2.0: $BENCH_ROOT
+  assets:       $BENCH_ROOT/assets（仅 2.0 资源，与 1.0 分目录不混用）
+
+后续步骤（测评 / 运行仿真）：
+  1) 安装系统与 Python 依赖（见 RoboTwin 文档）：
+     sudo apt install libvulkan1 mesa-vulkan-drivers vulkan-tools
+     cd $BENCH_ROOT && bash script/_install.sh
+  2) 测评时指定 2.0 路径并运行 eval：
+     export ROBOTWIN_ROOT=$BENCH_ROOT
+     bash script/run_eval_robotwin.sh
+=========================================="
diff --git a/script/download/download_robotwin_assets.py b/script/download/download_robotwin_assets.py
new file mode 100644
index 0000000..76794d8
--- /dev/null
+++ b/script/download/download_robotwin_assets.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+"""
+RoboTwin 2.0 测评 assets 下载（带重试 + 可选国内镜像）。
+HuggingFace 连接中断时可用此脚本重试；国内建议先设 HF_ENDPOINT 镜像。
+
+用法:
+  python script/download/download_robotwin_assets.py
+  python script/download/download_robotwin_assets.py /mnt/users/wangyuxuan-20250915/EAI/RoboTwin
+  HF_ENDPOINT=https://hf-mirror.com python script/download/download_robotwin_assets.py /path/to/RoboTwin
+"""
+import os
+import sys
+import time
+
+def main():
+    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    bench_root = os.path.abspath(sys.argv[1]) if len(sys.argv) > 1 else os.environ.get("BENCH_ROOT", os.path.join(repo_root, "..", "RoboTwin"))
+    bench_root = os.path.normpath(bench_root)
+    assets_dir = os.path.join(bench_root, "assets")
+    os.makedirs(assets_dir, exist_ok=True)
+
+    # 国内可设 HF_ENDPOINT=https://hf-mirror.com 加速/避免连接中断
+    hf_endpoint = os.environ.get("HF_ENDPOINT", "")
+    if hf_endpoint:
+        os.environ["HF_ENDPOINT"] = hf_endpoint
+        print(f"Using HF_ENDPOINT: {hf_endpoint}")
+
+    try:
+        from huggingface_hub import snapshot_download
+    except ImportError:
+        print("请先安装: pip install huggingface_hub")
+        sys.exit(1)
+
+    repo_id = "TianxingChen/RoboTwin2.0"
+    patterns = ["background_texture.zip", "embodiments.zip", "objects.zip"]
+    max_retries = 3
+
+    print(f"下载 RoboTwin2.0 assets 到: {assets_dir}")
+    for attempt in range(1, max_retries + 1):
+        try:
+            snapshot_download(
+                repo_id=repo_id,
+                allow_patterns=patterns,
+                local_dir=assets_dir,
+                repo_type="dataset",
+            )
+            print("下载完成.")
+            break
+        except Exception as e:
+            print(f"第 {attempt}/{max_retries} 次尝试失败: {e}")
+            if attempt < max_retries:
+                wait = 10 * attempt
+                print(f"{wait}s 后重试...")
+                time.sleep(wait)
+            else:
+                print("多次重试仍失败。建议：")
+                print("  1) 国内用户设置镜像后重试: HF_ENDPOINT=https://hf-mirror.com python script/download_robotwin_assets.py", bench_root)
+                print("  2) 或浏览器打开 https://huggingface.co/datasets/TianxingChen/RoboTwin2.0 手动下载上述 zip 放到", assets_dir, "后执行:")
+                print("     cd " + bench_root + " && bash script/_download_assets.sh  # 仅解压与配置")
+                sys.exit(1)
+
+    # 解压与配置（与 RoboTwin 原脚本一致）
+    import subprocess
+    orig_cwd = os.getcwd()
+    try:
+        os.chdir(assets_dir)
+        for name in ["background_texture.zip", "embodiments.zip", "objects.zip"]:
+            if os.path.isfile(name):
+                subprocess.check_call(["unzip", "-o", name], shell=False)
+                os.remove(name)
+        os.chdir(bench_root)
+        if os.path.isfile(os.path.join(bench_root, "script", "update_embodiment_config_path.py")):
+            subprocess.check_call([sys.executable, "script/update_embodiment_config_path.py"], cwd=bench_root)
+        print("解压与路径配置完成.")
+    finally:
+        os.chdir(orig_cwd)
+
+if __name__ == "__main__":
+    main()
diff --git a/script/run_eval_robotwin.sh b/script/run_eval_robotwin.sh
new file mode 100755
index 0000000..682b075
--- /dev/null
+++ b/script/run_eval_robotwin.sh
@@ -0,0 +1,145 @@
+#!/usr/bin/env bash
+# RoboTwin 2.0 测评脚本（LingBot-VA）
+# 使用前：已下载 RoboTwin 及 assets 到 ROBOTWIN_ROOT（默认 EAI/RoboTwin）
+# 重要：eval client 会启动 RoboTwin 仿真，当前 conda 环境需安装 RoboTwin 依赖（sapien、mplib 等），
+#       否则会报 ModuleNotFoundError: No module named 'sapien'。见下方「依赖」说明。
+#
+# 用法（在 lingbot-va 仓库根目录执行）：
+#   bash script/run_eval_robotwin.sh                    # 单任务 adjust_bottle，结果到 ./results
+#   bash script/run_eval_robotwin.sh ./my_results        # 指定结果目录
+#   bash script/run_eval_robotwin.sh ./out stack_bowls_three   # 指定任务名
+#   bash script/run_eval_robotwin.sh ./out adjust_bottle 10    # 结果目录 + 任务 + test_num
+# 测完全集（50 任务 × 100 条）：bash script/run_eval_robotwin_full.sh [save_root] [test_num]
+# 是否渲染/保存预测视频（VAE 解码+对比视频，较耗时）：SAVE_VISUALIZATION=1 bash script/run_eval_robotwin.sh ...  # 默认 0 关闭
+# 是否启用 offload（1=把 VAE 放 CPU 节省显存，0=VAE 常驻 GPU 加速 encode_obs）：ENABLE_OFFLOAD=0 bash script/run_eval_robotwin.sh ...
+
+set -e
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+# 最先激活 lingbot-va 环境，保证后续 python/sapien 检查及 server/client 均在该环境下运行
+CONDA_BASE=""
+if [[ -n "$CONDA_EXE" ]]; then
+  CONDA_BASE="${CONDA_EXE%/bin/conda}"
+fi
+if [[ -z "$CONDA_BASE" ]]; then
+  CONDA_BASE="$(conda info --base 2>/dev/null)" || true
+fi
+if [[ -z "$CONDA_BASE" ]]; then
+  for d in "$HOME/miniconda3" "$HOME/miniforge3" "$HOME/anaconda3" "/opt/conda"; do
+    if [[ -f "${d}/etc/profile.d/conda.sh" ]]; then
+      CONDA_BASE="$d"
+      break
+    fi
+  done
+fi
+if [[ -n "$CONDA_BASE" && -f "$CONDA_BASE/etc/profile.d/conda.sh" ]]; then
+  source "$CONDA_BASE/etc/profile.d/conda.sh"
+  conda activate lingbot-va
+  echo ">>> 已自动激活 conda 环境: lingbot-va"
+else
+  echo "Error: 未检测到 conda 或无法找到 lingbot-va 环境。"
+  echo "  请安装 conda 并创建环境: conda create -n lingbot-va python=3.x && conda activate lingbot-va"
+  exit 1
+fi
+
+ROBOTWIN_ROOT="${ROBOTWIN_ROOT:-/mnt/users/wangyuxuan-20250915/EAI/RoboTwin-2.0}"
+export ROBOTWIN_ROOT
+
+# Model path precedence: caller-supplied LINGBOT_VA_MODEL_PATH, then models/, then the post-trained checkpoint in the repo root, else the base model
+if [ -n "$LINGBOT_VA_MODEL_PATH" ]; then
+  :  # keep the caller's value
+elif [ -d "$REPO_ROOT/models/lingbot-va-posttrain-robotwin" ]; then
+  export LINGBOT_VA_MODEL_PATH="$REPO_ROOT/models/lingbot-va-posttrain-robotwin"
+elif [ -d "$REPO_ROOT/lingbot-va-posttrain-robotwin" ]; then
+  export LINGBOT_VA_MODEL_PATH="$REPO_ROOT/lingbot-va-posttrain-robotwin"
+else
+  export LINGBOT_VA_MODEL_PATH="$REPO_ROOT/lingbot-va-base"
+fi
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-5}"
+export LD_LIBRARY_PATH="/usr/lib64:/usr/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"  # no trailing ':' (an empty entry means the current directory)
+
+# 可调参数：server 使用的 GPU 进程数（默认 1，多卡时需 server 支持）
+NPROC_PER_NODE="${NPROC_PER_NODE:-1}"
+
+# 解析参数
+save_root="${1:-./results}"
+task_name="${2:-adjust_bottle}"
+test_num="${3:-10}"
+PORT="${PORT:-29056}"
+MASTER_PORT="${MASTER_PORT:-29501}"
+# 是否渲染并保存预测视频（0=关闭可加速，1=开启保存对比视频）
+save_visualization="${SAVE_VISUALIZATION:-0}"
+# 是否把 VAE/text encoder offload 到 CPU（默认 0，优先速度）
+enable_offload="${ENABLE_OFFLOAD:-0}"
+export ENABLE_OFFLOAD="$enable_offload"
+
+# 实际推理为 LingBot-VA（WebsocketClientPolicy），结果目录与日志用 lingbot；deploy 配置沿用 RoboTwin 的 ACT 配置
+policy_name=lingbot
+policy_config="${POLICY_CONFIG:-ACT}"
+task_config=demo_clean 
+train_config_name=0
+model_name=0
+seed=0
+
+echo "============= RoboTwin Eval (LingBot-VA) =============
+  REPO_ROOT          = $REPO_ROOT
+  ROBOTWIN_ROOT      = $ROBOTWIN_ROOT
+  LINGBOT_VA_MODEL_PATH = $LINGBOT_VA_MODEL_PATH
+  policy_name        = $policy_name (LingBot-VA)
+  policy_config      = $policy_config (deploy yml)
+  save_root          = $save_root
+  task_name          = $task_name
+  test_num           = $test_num
+  PORT               = $PORT
+  NPROC_PER_NODE     = $NPROC_PER_NODE
+  CUDA_VISIBLE_DEVICES = $CUDA_VISIBLE_DEVICES
+  SAVE_VISUALIZATION = $save_visualization (0=关 1=开预测视频)
+  ENABLE_OFFLOAD     = $enable_offload (0=VAE在GPU加速 1=offload到CPU省显存)
+========================================================"
+
+# 首次运行：更新 embodiment 配置中的资源路径（从 _tmp.yml 生成 .yml）
+if [ -d "$ROBOTWIN_ROOT/assets/embodiments" ] && [ -f "$ROBOTWIN_ROOT/script/update_embodiment_config_path.py" ]; then
+  if [ -n "$(find "$ROBOTWIN_ROOT/assets/embodiments" -name '*_tmp.yml' 2>/dev/null | head -1)" ]; then
+    echo ">>> 更新 RoboTwin embodiment 配置路径..."
+    (cd "$ROBOTWIN_ROOT" && python script/update_embodiment_config_path.py </dev/null) || true
+  fi
+fi
+
+cd "$REPO_ROOT"
+mkdir -p "$save_root"
+server_log="${save_root}/server.log"
+
+# 启动 LingBot-VA server（后台，输出写入 server.log，避免结束时的 SIGTERM 信息刷屏终端）
+echo ">>> 启动 LingBot-VA server (port=$PORT, nproc_per_node=$NPROC_PER_NODE)，日志: $server_log ..."
+PYTHONWARNINGS=ignore::UserWarning python -m torch.distributed.run \
+  --nproc_per_node "$NPROC_PER_NODE" \
+  --master_port "$MASTER_PORT" \
+  wan_va/wan_va_server.py \
+  --config-name robotwin \
+  --port "$PORT" \
+  --save_root "$save_root" \
+  >> "$server_log" 2>&1 &
+SERVER_PID=$!
+trap "kill $SERVER_PID 2>/dev/null || true" EXIT
+
+echo ">>> 启动 eval client (task=$task_name, test_num=$test_num) ..."
+PYTHONWARNINGS=ignore::UserWarning \
+XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 python -m evaluation.robotwin.eval_polict_client_openpi \
+  --config "$ROBOTWIN_ROOT/policy/$policy_config/deploy_policy.yml" \
+  --overrides \
+  --task_name "$task_name" \
+  --task_config "$task_config" \
+  --train_config_name "$train_config_name" \
+  --model_name "$model_name" \
+  --ckpt_setting "$model_name" \
+  --seed "$seed" \
+  --policy_name "$policy_name" \
+  --save_root "$save_root" \
+  --video_guidance_scale 5 \
+  --action_guidance_scale 1 \
+  --test_num "$test_num" \
+  --save_visualization "$save_visualization" \
+  --port "$PORT"
+
+echo ">>> Eval 结束，结果见: $save_root"
diff --git a/script/run_eval_robotwin_client_only.sh b/script/run_eval_robotwin_client_only.sh
new file mode 100755
index 0000000..58b2503
--- /dev/null
+++ b/script/run_eval_robotwin_client_only.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# 仅运行 RoboTwin eval client（需先在另一终端启动 LingBot-VA server）
+# 用法（在 lingbot-va 仓库根目录执行）：
+#   bash script/run_eval_robotwin_client_only.sh
+#   bash script/run_eval_robotwin_client_only.sh ./results adjust_bottle 20
+#   export PORT=29056; bash script/run_eval_robotwin_client_only.sh
+
+set -e
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+ROBOTWIN_ROOT="${ROBOTWIN_ROOT:-/mnt/users/wangyuxuan-20250915/EAI/RoboTwin-2.0}"
+export ROBOTWIN_ROOT
+
+save_root="${1:-./results}"
+task_name="${2:-adjust_bottle}"
+test_num="${3:-100}"
+PORT="${PORT:-29056}"
+
+policy_name=lingbot                     # same policy name as run_eval_robotwin.sh, so results/logs are labelled consistently
+policy_config="${POLICY_CONFIG:-ACT}"   # which RoboTwin policy directory supplies deploy_policy.yml
+task_config=demo_clean
+train_config_name=0
+model_name=0
+seed=0
+export LD_LIBRARY_PATH="/usr/lib64:/usr/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"  # no trailing ':' (an empty entry means the current directory)
+
+echo "Client only: ROBOTWIN_ROOT=$ROBOTWIN_ROOT save_root=$save_root task_name=$task_name test_num=$test_num port=$PORT"
+cd "$REPO_ROOT"
+mkdir -p "$save_root"
+
+PYTHONWARNINGS=ignore::UserWarning \
+XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 python -m evaluation.robotwin.eval_polict_client_openpi \
+  --config "$ROBOTWIN_ROOT/policy/$policy_config/deploy_policy.yml" \
+  --overrides \
+  --task_name "$task_name" \
+  --task_config "$task_config" \
+  --train_config_name "$train_config_name" \
+  --model_name "$model_name" \
+  --ckpt_setting "$model_name" \
+  --seed "$seed" \
+  --policy_name "$policy_name" \
+  --save_root "$save_root" \
+  --video_guidance_scale 5 \
+  --action_guidance_scale 1 \
+  --test_num "$test_num" \
+  --port "$PORT"
diff --git a/script/run_eval_robotwin_full.sh b/script/run_eval_robotwin_full.sh
new file mode 100755
index 0000000..dd666a0
--- /dev/null
+++ b/script/run_eval_robotwin_full.sh
@@ -0,0 +1,298 @@
+#!/usr/bin/env bash
+# RoboTwin 2.0 完全集测评（50 个任务 × test_num）
+# 支持单卡顺序跑或多卡并行：通过 NUM_GPUS 或第 3 个参数指定 GPU 数，>1 时每卡起一个 server，任务按卡分批并行。
+# 与 run_eval_robotwin.sh 同环境要求。
+#
+# 用法（在 lingbot-va 仓库根目录执行）：
+#   bash script/run_eval_robotwin_full.sh                    # 单卡顺序，结果 ./results，每任务 100 条
+#   bash script/run_eval_robotwin_full.sh ./my_results        # 指定结果目录
+#   bash script/run_eval_robotwin_full.sh ./my_results 100 4 # 结果目录 + test_num + 4 卡并行
+#   NUM_GPUS=8 bash script/run_eval_robotwin_full.sh ./out   # 8 卡并行（50 任务分 8 批）
+#   GPU_IDS=2,3,4,5 bash script/run_eval_robotwin_full.sh ./out 100 4  # 使用指定 GPU 4卡并行
+#   GPU_IDS=1 bash script/run_eval_robotwin_full.sh ./out    # 单卡运行，使用 GPU 1
+#
+# 可选环境变量：ROBOTWIN_ROOT, LINGBOT_VA_MODEL_PATH, NUM_GPUS, GPU_IDS, START_PORT, MASTER_PORT_BASE
+
+set -e
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+# 最先激活 lingbot-va 环境（与 run_eval_robotwin.sh 一致）
+CONDA_BASE=""
+if [[ -n "$CONDA_EXE" ]]; then
+  CONDA_BASE="${CONDA_EXE%/bin/conda}"
+fi
+if [[ -z "$CONDA_BASE" ]]; then
+  CONDA_BASE="$(conda info --base 2>/dev/null)" || true
+fi
+if [[ -z "$CONDA_BASE" ]]; then
+  for d in "$HOME/miniconda3" "$HOME/miniforge3" "$HOME/anaconda3" "/opt/conda"; do
+    if [[ -f "${d}/etc/profile.d/conda.sh" ]]; then
+      CONDA_BASE="$d"
+      break
+    fi
+  done
+fi
+if [[ -n "$CONDA_BASE" && -f "$CONDA_BASE/etc/profile.d/conda.sh" ]]; then
+  source "$CONDA_BASE/etc/profile.d/conda.sh"
+  conda activate lingbot-va
+  echo ">>> 已自动激活 conda 环境: lingbot-va"
+else
+  echo "Error: 未检测到 conda 或无法找到 lingbot-va 环境。"
+  echo "  请安装 conda 并创建环境: conda create -n lingbot-va python=3.x && conda activate lingbot-va"
+  exit 1
+fi
+
+ROBOTWIN_ROOT="${ROBOTWIN_ROOT:-/mnt/users/wangyuxuan-20250915/EAI/RoboTwin-2.0}"
+export ROBOTWIN_ROOT
+
+# Model path precedence: caller-supplied LINGBOT_VA_MODEL_PATH, then models/, then the post-trained checkpoint in the repo root, else the base model
+if [ -n "$LINGBOT_VA_MODEL_PATH" ]; then
+  :  # keep the caller's value
+elif [ -d "$REPO_ROOT/models/lingbot-va-posttrain-robotwin" ]; then
+  export LINGBOT_VA_MODEL_PATH="$REPO_ROOT/models/lingbot-va-posttrain-robotwin"
+elif [ -d "$REPO_ROOT/lingbot-va-posttrain-robotwin" ]; then
+  export LINGBOT_VA_MODEL_PATH="$REPO_ROOT/lingbot-va-posttrain-robotwin"
+else
+  export LINGBOT_VA_MODEL_PATH="$REPO_ROOT/lingbot-va-base"
+fi
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
+export LD_LIBRARY_PATH="/usr/lib64:/usr/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"  # no trailing ':' (an empty entry means the current directory)
+
+save_root="${1:-./results}"
+test_num="${2:-100}"
+GPU_IDS="${GPU_IDS:-2,3,4,5,6,7}"
+IFS=',' read -ra _gpu_id_list <<< "$GPU_IDS"  # split only to count the listed GPUs
+num_gpus=$(( ${3:-${NUM_GPUS:-${#_gpu_id_list[@]}}} ))  # arg 3, then NUM_GPUS, else one server per GPU listed in GPU_IDS
+START_PORT="${START_PORT:-29556}"
+MASTER_PORT_BASE="${MASTER_PORT_BASE:-29661}"
+PORT="${PORT:-29056}"
+MASTER_PORT="${MASTER_PORT:-29501}"
+
+# 实际推理为 LingBot-VA，结果目录与日志用 lingbot；deploy 配置沿用 RoboTwin 的 ACT
+policy_name=lingbot
+policy_config="${POLICY_CONFIG:-ACT}"
+task_config=demo_clean
+train_config_name=0
+model_name=0
+seed=0
+
+# 50 个任务（与 evaluation/robotwin/calc_stat.py TASK_CLASS 一致）
+# ROBOTWIN_ALL_TASKS=(
+#   adjust_bottle beat_block_hammer blocks_ranking_rgb blocks_ranking_size click_alarmclock
+#   click_bell dump_bin_bigbin grab_roller handover_block handover_mic hanging_mug
+#   lift_pot move_can_pot move_pillbottle_pad move_playingcard_away move_stapler_pad
+#   open_laptop open_microwave pick_diverse_bottles pick_dual_bottles place_a2b_left
+#   place_a2b_right place_bread_basket place_bread_skillet place_burger_fries place_can_basket
+#   place_cans_plasticbox place_container_plate place_dual_shoes place_empty_cup place_fan
+#   place_mouse_pad place_object_basket place_object_scale place_object_stand place_phone_stand
+#   place_shoe press_stapler put_bottles_dustbin put_object_cabinet rotate_qrcode scan_object
+#   shake_bottle shake_bottle_horizontally stack_blocks_three stack_blocks_two stack_bowls_three
+#   stack_bowls_two stamp_seal turn_switch
+# )
+
+ROBOTWIN_ALL_TASKS=(
+  adjust_bottle beat_block_hammer blocks_ranking_rgb blocks_ranking_size click_alarmclock
+  click_bell dump_bin_bigbin grab_roller handover_block handover_mic hanging_mug
+  lift_pot move_can_pot move_pillbottle_pad move_playingcard_away move_stapler_pad
+  open_laptop open_microwave pick_diverse_bottles pick_dual_bottles place_a2b_left
+  place_a2b_right place_bread_basket place_bread_skillet place_burger_fries place_can_basket
+  place_cans_plasticbox place_container_plate place_dual_shoes place_empty_cup place_fan
+  place_mouse_pad place_object_basket place_object_scale place_object_stand place_phone_stand
+  place_shoe press_stapler put_bottles_dustbin put_object_cabinet rotate_qrcode scan_object
+  shake_bottle shake_bottle_horizontally stack_blocks_three stack_blocks_two stack_bowls_three
+  stack_bowls_two stamp_seal turn_switch
+)
+
+echo "============= RoboTwin 完全集 Eval (LingBot-VA) =============
+  REPO_ROOT          = $REPO_ROOT
+  ROBOTWIN_ROOT      = $ROBOTWIN_ROOT
+  LINGBOT_VA_MODEL_PATH = $LINGBOT_VA_MODEL_PATH
+  policy_name        = $policy_name (LingBot-VA)
+  policy_config      = $policy_config (deploy yml)
+  save_root          = $save_root
+  test_num per task  = $test_num (每任务条数)
+  任务数              = ${#ROBOTWIN_ALL_TASKS[@]} (共 50 个任务)
+  num_gpus           = $num_gpus
+  GPU_IDS            = ${GPU_IDS:-自动按卡分配 (0,1,2...)}
+  START_PORT         = $START_PORT
+  PORT               = $PORT
+  CUDA_VISIBLE_DEVICES = $CUDA_VISIBLE_DEVICES
+================================================================"
+
+if [ ! -d "$ROBOTWIN_ROOT" ]; then
+  echo "Error: ROBOTWIN_ROOT 不存在: $ROBOTWIN_ROOT"
+  exit 1
+fi
+
+if [ ! -d "$ROBOTWIN_ROOT/assets" ]; then
+  echo "Error: 未找到 $ROBOTWIN_ROOT/assets，请先下载 RoboTwin assets"
+  echo "  例: cd $REPO_ROOT && python script/download/download_robotwin_assets.py $ROBOTWIN_ROOT"
+  exit 1
+fi
+
+# 检查 RoboTwin 仿真依赖（client 会 import envs -> sapien）（与 run_eval_robotwin.sh 一致）
+if ! python -c "import sapien" 2>/dev/null; then
+  echo "Error: 当前环境缺少 RoboTwin 仿真依赖 'sapien'，导致 client 报错 ModuleNotFoundError."
+  echo "  请在当前 conda 环境中安装 RoboTwin 依赖后再运行本脚本，例如："
+  echo "    pip install sapien==3.0.0b1"
+  echo "    pip install -r $ROBOTWIN_ROOT/script/requirements.txt"
+  echo "  并按 RoboTwin 文档完成 script/_install.sh（pytorch3d、mplib 补丁、curobo 等）。"
+  echo "  详见: $ROBOTWIN_ROOT/INSTALLATION.md 或 README"
+  exit 1
+fi
+
+# 首次运行：更新 embodiment 配置中的资源路径（从 _tmp.yml 生成 .yml）
+if [ -d "$ROBOTWIN_ROOT/assets/embodiments" ] && [ -f "$ROBOTWIN_ROOT/script/update_embodiment_config_path.py" ]; then
+  if [ -n "$(find "$ROBOTWIN_ROOT/assets/embodiments" -name '*_tmp.yml' 2>/dev/null | head -1)" ]; then
+    echo ">>> 更新 RoboTwin embodiment 配置路径..."
+    (cd "$ROBOTWIN_ROOT" && python script/update_embodiment_config_path.py </dev/null) || true
+  fi
+fi
+
+cd "$REPO_ROOT"
+mkdir -p "$save_root"
+
+if [ "$num_gpus" -eq 1 ]; then
+  # ---------- 单卡：一个 server，顺序跑 50 个任务 ----------
+  if [ -z "$GPU_IDS" ]; then
+    export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
+  else
+    # CUDA_VISIBLE_DEVICES is always defaulted earlier in this script, so a ':-' fallback would never apply; use the first GPU_IDS entry.
+    export CUDA_VISIBLE_DEVICES="${GPU_IDS%%,*}"
+  fi
+  echo ">>> 启动 LingBot-VA server (单卡 GPU=$CUDA_VISIBLE_DEVICES, port=$PORT) ..."
+  PYTHONWARNINGS=ignore::UserWarning python -m torch.distributed.run \
+    --nproc_per_node "$num_gpus" \
+    --master_port "$MASTER_PORT" \
+    wan_va/wan_va_server.py \
+    --config-name robotwin \
+    --port "$PORT" \
+    --save_root "$save_root" &
+  SERVER_PID=$!
+  trap "kill $SERVER_PID 2>/dev/null || true" EXIT
+
+  sleep 15
+  if ! kill -0 $SERVER_PID 2>/dev/null; then
+    echo "Error: Server 启动失败，请检查日志（如 attn_mode 是否为 torch/flashattn）"
+    exit 1
+  fi
+
+  total=${#ROBOTWIN_ALL_TASKS[@]}
+  for i in "${!ROBOTWIN_ALL_TASKS[@]}"; do
+    task_name="${ROBOTWIN_ALL_TASKS[$i]}"
+    echo ""
+    echo ">>> [$((i+1))/$total] 启动 eval client: task=$task_name, test_num=$test_num"
+    PYTHONWARNINGS=ignore::UserWarning \
+    XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 python -m evaluation.robotwin.eval_polict_client_openpi \
+      --config "$ROBOTWIN_ROOT/policy/$policy_config/deploy_policy.yml" \
+      --overrides \
+      --task_name "$task_name" \
+      --task_config "$task_config" \
+      --train_config_name "$train_config_name" \
+      --model_name "$model_name" \
+      --ckpt_setting "$model_name" \
+      --seed "$seed" \
+      --policy_name "$policy_name" \
+      --save_root "$save_root" \
+      --video_guidance_scale 5 \
+      --action_guidance_scale 1 \
+      --test_num "$test_num" \
+      --port "$PORT"
+  done
+else
+  # ---------- 多卡：每卡一个 server，任务分批并行 ----------
+  # 解析 GPU_IDS
+  if [ -z "$GPU_IDS" ]; then
+    GPU_ARRAY=()
+    for ((i=0; i<num_gpus; i++)); do
+      GPU_ARRAY+=($i)
+    done
+  else
+    IFS=',' read -ra GPU_ARRAY <<< "$GPU_IDS"
+  fi
+  
+  if [ ${#GPU_ARRAY[@]} -lt $num_gpus ]; then
+    echo "Error: GPU_IDS 中的GPU数量 (${#GPU_ARRAY[@]}) 小于指定的 num_gpus ($num_gpus)"
+    exit 1
+  fi
+  
+  SERVER_PIDS=()
+  trap 'for p in "${SERVER_PIDS[@]}"; do kill $p 2>/dev/null || true; done' EXIT
+
+  echo ">>> 启动 $num_gpus 个 LingBot-VA server (GPU=${GPU_ARRAY[*]}, ports $START_PORT..$((START_PORT + num_gpus - 1))) ..."
+  for ((g=0; g<num_gpus; g++)); do
+    gpu_id=${GPU_ARRAY[$g]}
+    port=$((START_PORT + g))
+    master_port=$((MASTER_PORT_BASE + g))
+    export CUDA_VISIBLE_DEVICES=$gpu_id
+    PYTHONWARNINGS=ignore::UserWarning python -m torch.distributed.run \
+      --nproc_per_node 1 \
+      --master_port "$master_port" \
+      wan_va/wan_va_server.py \
+      --config-name robotwin \
+      --port "$port" \
+      --save_root "$save_root" &
+    SERVER_PIDS+=($!)
+    sleep 2
+  done
+
+  sleep 15
+  for ((g=0; g<num_gpus; g++)); do
+    if ! kill -0 "${SERVER_PIDS[$g]}" 2>/dev/null; then
+      echo "Error: Server GPU $g 启动失败，请检查日志（如 attn_mode 是否为 torch/flashattn）"
+      exit 1
+    fi
+  done
+
+  total=${#ROBOTWIN_ALL_TASKS[@]}
+  batch_time=$(date +%Y%m%d_%H%M%S)
+  log_dir="${save_root}/logs"
+  mkdir -p "$log_dir"
+
+  for ((batch_start=0; batch_start<total; batch_start+=num_gpus)); do
+    batch_num=$((batch_start / num_gpus + 1))
+    batch_total=$(( (total + num_gpus - 1) / num_gpus ))
+    end_idx=$((batch_start + num_gpus))
+    [ "$end_idx" -gt "$total" ] && end_idx=$total
+    echo ""
+    echo ">>> 批次 $batch_num/$batch_total (任务 $((batch_start+1))..${end_idx}/$total)"
+    PIDS=()
+    for ((j=0; j<num_gpus; j++)); do
+      idx=$((batch_start + j))
+      [ "$idx" -ge "$total" ] && break
+      task_name="${ROBOTWIN_ALL_TASKS[$idx]}"
+      gpu_id=${GPU_ARRAY[$j]}
+      port=$((START_PORT + j))
+      log_file="${log_dir}/${task_name}_${batch_time}.log"
+      (
+        export CUDA_VISIBLE_DEVICES=$gpu_id
+        PYTHONWARNINGS=ignore::UserWarning \
+        XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 python -m evaluation.robotwin.eval_polict_client_openpi \
+          --config "$ROBOTWIN_ROOT/policy/$policy_config/deploy_policy.yml" \
+          --overrides \
+          --task_name "$task_name" \
+          --task_config "$task_config" \
+          --train_config_name "$train_config_name" \
+          --model_name "$model_name" \
+          --ckpt_setting "$model_name" \
+          --seed "$seed" \
+          --policy_name "$policy_name" \
+          --save_root "$save_root" \
+          --video_guidance_scale 5 \
+          --action_guidance_scale 1 \
+          --test_num "$test_num" \
+          --port "$port"
+      ) > "$log_file" 2>&1 &
+      PIDS+=($!)
+    done
+    for p in "${PIDS[@]}"; do wait "$p" || true; done
+  done
+fi
+
+echo ""
+echo ">>> 完全集 Eval 结束，结果见: $save_root"
+if [ "$num_gpus" -gt 1 ]; then
+  echo "    各任务日志: $save_root/logs/"
+fi
+echo "    汇总成功率: python -m evaluation.robotwin.calc_stat $save_root/stseed-0/visualization"
diff --git a/script/run_eval_robotwin_slurm.sh b/script/run_eval_robotwin_slurm.sh
new file mode 100755
index 0000000..e012138
--- /dev/null
+++ b/script/run_eval_robotwin_slurm.sh
@@ -0,0 +1,229 @@
+#!/usr/bin/env bash
+#SBATCH -p a100_global
+#SBATCH -N 1
+#SBATCH --gres=gpu:4
+# 多卡并行：改为 --gres=gpu:4 则起 4 个 server+4 个 client 并行跑 4 个任务（每卡一个）
+#SBATCH --cpus-per-task=12
+#SBATCH --ntasks=1
+#SBATCH --job-name=robotwin_va
+#SBATCH --time=48:00:00
+#SBATCH --output=logs/robotwin_%x-%j.out
+#SBATCH --error=logs/robotwin_%x-%j.err
+#
+# RoboTwin 2.0 测评脚本（LingBot-VA）- Slurm 版
+# 用法（在 lingbot-va 仓库根目录执行）：
+#   sbatch script/run_eval_robotwin_slurm.sh
+#   sbatch script/run_eval_robotwin_slurm.sh ./my_results
+#   sbatch script/run_eval_robotwin_slurm.sh ./out stack_bowls_three
+#   sbatch script/run_eval_robotwin_slurm.sh ./out adjust_bottle 10
+# 可选环境变量（提交前 export 或 sbatch 前设置）：
+#   SAVE_ROOT, TASK_NAME, TEST_NUM, PORT, NPROC_PER_NODE, CUDA_VISIBLE_DEVICES, SLURM_GPUS
+# 多卡并行：sbatch --gres=gpu:4 时起 4 个 server（每卡一个）+ 4 个 client 并行跑 4 个任务；
+#   可通过 TASK_LIST="t1 t2 t3 t4" 指定任务，否则用前 4 个 from 50 任务列表。
+
+set -e
+
+NGPU="${SLURM_GPUS_ON_NODE:-${SLURM_GPUS:-1}}"  # GPU count: SLURM_GPUS_ON_NODE, else SLURM_GPUS, else 1
+# sbatch executes a spooled copy of this script, so the script-relative path may not be the repo; fall back to the submit dir.
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 2>/dev/null && pwd)" || REPO_ROOT=""
+[ -f "$REPO_ROOT/wan_va/wan_va_server.py" ] || REPO_ROOT="${SLURM_SUBMIT_DIR:-$REPO_ROOT}"
+ROBOTWIN_ROOT="${ROBOTWIN_ROOT:-/mnt/users/wangyuxuan-20250915/EAI/RoboTwin-2.0}"
+export ROBOTWIN_ROOT
+# The 50-task list (multi-GPU mode takes the first N entries unless TASK_LIST is given)
+ROBOTWIN_ALL_TASKS=(
+  adjust_bottle beat_block_hammer blocks_ranking_rgb blocks_ranking_size click_alarmclock
+  click_bell dump_bin_bigbin grab_roller handover_block handover_mic hanging_mug
+  lift_pot move_can_pot move_pillbottle_pad move_playingcard_away move_stapler_pad
+  open_laptop open_microwave pick_diverse_bottles pick_dual_bottles place_a2b_left
+  place_a2b_right place_bread_basket place_bread_skillet place_burger_fries place_can_basket
+  place_cans_plasticbox place_container_plate place_dual_shoes place_empty_cup place_fan
+  place_mouse_pad place_object_basket place_object_scale place_object_stand place_phone_stand
+  place_shoe press_stapler put_bottles_dustbin put_object_cabinet rotate_qrcode scan_object
+  shake_bottle shake_bottle_horizontally stack_blocks_three stack_blocks_two stack_bowls_three
+  stack_bowls_two stamp_seal turn_switch
+)
+
+# Model path: a pre-set LINGBOT_VA_MODEL_PATH wins; otherwise prefer models/, then the repo root, then lingbot-va-base
+if [ -n "$LINGBOT_VA_MODEL_PATH" ]; then
+  :
+elif [ -d "$REPO_ROOT/models/lingbot-va-posttrain-robotwin" ]; then
+  export LINGBOT_VA_MODEL_PATH="$REPO_ROOT/models/lingbot-va-posttrain-robotwin"
+elif [ -d "$REPO_ROOT/lingbot-va-posttrain-robotwin" ]; then
+  export LINGBOT_VA_MODEL_PATH="$REPO_ROOT/lingbot-va-posttrain-robotwin"
+else
+  export LINGBOT_VA_MODEL_PATH="${LINGBOT_VA_MODEL_PATH:-$REPO_ROOT/lingbot-va-base}"
+fi
+# Use the first NGPU GPUs (only when CUDA_VISIBLE_DEVICES is unset; a single GPU means device 0)
+if [[ -z "$CUDA_VISIBLE_DEVICES" ]]; then
+  export CUDA_VISIBLE_DEVICES="$(seq -s, 0 $((NGPU-1)))"
+fi
+export LD_LIBRARY_PATH="/usr/lib64:/usr/lib:${LD_LIBRARY_PATH:-}"
+
+# Run parameters: environment variable first, then positional argument, then the default
+save_root="${SAVE_ROOT:-${1:-./results}}"
+task_name="${TASK_NAME:-${2:-adjust_bottle}}"
+test_num="${TEST_NUM:-${3:-100}}"
+PORT="${PORT:-29056}"
+MASTER_PORT="${MASTER_PORT:-29501}"
+
+policy_name=ACT
+task_config=demo_clean
+train_config_name=0
+model_name=0
+seed=0
+# One server (one process) per GPU in multi-GPU mode; single-GPU mode is also one process
+NPROC_PER_NODE=1
+
+echo "============= RoboTwin Eval (LingBot-VA) [Slurm] =============
+  JOB_ID             = ${SLURM_JOB_ID:-N/A}
+  NGPU               = $NGPU
+  REPO_ROOT          = $REPO_ROOT
+  ROBOTWIN_ROOT      = $ROBOTWIN_ROOT
+  LINGBOT_VA_MODEL_PATH = $LINGBOT_VA_MODEL_PATH
+  save_root          = $save_root
+  task_name          = $task_name
+  test_num           = $test_num
+  PORT               = $PORT
+  CUDA_VISIBLE_DEVICES = $CUDA_VISIBLE_DEVICES
+========================================================"
+# Fail fast when the RoboTwin checkout or its assets directory is missing.
+if [ ! -d "$ROBOTWIN_ROOT" ]; then
+  echo "Error: ROBOTWIN_ROOT 不存在: $ROBOTWIN_ROOT"
+  exit 1
+fi
+
+if [ ! -d "$ROBOTWIN_ROOT/assets" ]; then
+  echo "Error: 未找到 $ROBOTWIN_ROOT/assets，请先下载 RoboTwin assets"
+  exit 1
+fi
+# Activate the conda env BEFORE any python check so the checks run in the interpreter the eval will use.
+CONDA_BASE=""
+[[ -n "$CONDA_EXE" ]] && CONDA_BASE="${CONDA_EXE%/bin/conda}"
+[[ -z "$CONDA_BASE" ]] && CONDA_BASE="$(conda info --base 2>/dev/null)" || true
+if [[ -n "$CONDA_BASE" && -f "$CONDA_BASE/etc/profile.d/conda.sh" ]]; then
+  source "$CONDA_BASE/etc/profile.d/conda.sh"
+  conda activate lingbot-va
+else
+  echo "Error: 未检测到 conda 或 lingbot-va 环境"
+  exit 1
+fi
+# The RoboTwin simulation dependency 'sapien' must be importable in the activated env.
+if ! python -c "import sapien" 2>/dev/null; then
+  echo "Error: 当前环境缺少 RoboTwin 仿真依赖 'sapien'."
+  exit 1
+fi
+# If any *_tmp.yml embodiment template exists, run RoboTwin's path-update script (best effort: failures are ignored).
+if [ -d "$ROBOTWIN_ROOT/assets/embodiments" ] && [ -f "$ROBOTWIN_ROOT/script/update_embodiment_config_path.py" ]; then
+  if [ -n "$(find "$ROBOTWIN_ROOT/assets/embodiments" -name '*_tmp.yml' 2>/dev/null | head -1)" ]; then
+    (cd "$ROBOTWIN_ROOT" && python script/update_embodiment_config_path.py </dev/null) || true
+  fi
+fi
+# Work from the repo root; logs/ is the directory named in the #SBATCH --output/--error directives.
+cd "$REPO_ROOT"
+mkdir -p logs "$save_root"
+
+if [ "$NGPU" -eq 1 ]; then
+  # ---------- Single GPU: 1 server + 1 client ----------
+  echo ">>> 启动 LingBot-VA server (单卡 port=$PORT) ..."
+  PYTHONWARNINGS=ignore::UserWarning python -m torch.distributed.run \
+    --nproc_per_node 1 \
+    --master_port "$MASTER_PORT" \
+    wan_va/wan_va_server.py \
+    --config-name robotwin \
+    --port "$PORT" \
+    --save_root "$save_root" &
+  SERVER_PID=$!
+  trap "kill $SERVER_PID 2>/dev/null || true" EXIT
+  # Give the server 15 s to start, then make sure the process is still alive.
+  sleep 15
+  if ! kill -0 $SERVER_PID 2>/dev/null; then
+    echo "Error: Server 启动失败"
+    exit 1
+  fi
+
+  echo ">>> 启动 eval client (task=$task_name, test_num=$test_num) ..."
+  PYTHONWARNINGS=ignore::UserWarning \
+  XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 python -m evaluation.robotwin.eval_polict_client_openpi \
+    --config "$ROBOTWIN_ROOT/policy/$policy_name/deploy_policy.yml" \
+    --overrides \
+    --task_name "$task_name" \
+    --task_config "$task_config" \
+    --train_config_name "$train_config_name" \
+    --model_name "$model_name" \
+    --ckpt_setting "$model_name" \
+    --seed "$seed" \
+    --policy_name "$policy_name" \
+    --save_root "$save_root" \
+    --video_guidance_scale 5 \
+    --action_guidance_scale 1 \
+    --test_num "$test_num" \
+    --port "$PORT"
+else
+  # ---------- Multi GPU: 1 server + 1 client per GPU, running N tasks in parallel ----------
+  if [ -n "$TASK_LIST" ]; then
+    TASKS=($TASK_LIST)
+  else
+    TASKS=("${ROBOTWIN_ALL_TASKS[@]:0:$NGPU}")
+  fi
+  if [ ${#TASKS[@]} -lt "$NGPU" ]; then
+    echo "Error: 任务数 ${#TASKS[@]} 小于 NGPU=$NGPU，请设置 TASK_LIST=\"t1 t2 ...\""
+    exit 1
+  fi
+  # Slot g uses the g-th id listed in CUDA_VISIBLE_DEVICES (set by Slurm/the user, or defaulted earlier), not the literal index g.
+  IFS=',' read -ra GPU_ARRAY <<< "$CUDA_VISIBLE_DEVICES"
+  [ "${#GPU_ARRAY[@]}" -ge "$NGPU" ] || { echo "Error: CUDA_VISIBLE_DEVICES ($CUDA_VISIBLE_DEVICES) 中的 GPU 数量少于 NGPU=$NGPU"; exit 1; }
+  SERVER_PIDS=()
+  trap 'for p in "${SERVER_PIDS[@]}"; do kill $p 2>/dev/null || true; done' EXIT
+  echo ">>> 启动 $NGPU 个 LingBot-VA server (ports $PORT..$((PORT+NGPU-1))) ..."
+  for ((g=0; g<NGPU; g++)); do
+    port_g=$((PORT + g))
+    master_port_g=$((MASTER_PORT + g))
+    CUDA_VISIBLE_DEVICES="${GPU_ARRAY[$g]}" PYTHONWARNINGS=ignore::UserWarning python -m torch.distributed.run \
+      --nproc_per_node 1 \
+      --master_port "$master_port_g" \
+      wan_va/wan_va_server.py \
+      --config-name robotwin \
+      --port "$port_g" \
+      --save_root "$save_root" &
+    SERVER_PIDS+=($!)
+    sleep 2
+  done
+  # Give the servers 15 s to start, then make sure every server process is still alive.
+  sleep 15
+  for ((g=0; g<NGPU; g++)); do
+    if ! kill -0 "${SERVER_PIDS[$g]}" 2>/dev/null; then
+      echo "Error: Server GPU $g 启动失败"
+      exit 1
+    fi
+  done
+
+  echo ">>> 启动 $NGPU 个 eval client 并行 (tasks: ${TASKS[*]}) ..."
+  PIDS=()
+  for ((g=0; g<NGPU; g++)); do
+    task_g="${TASKS[$g]}"
+    port_g=$((PORT + g))
+    (
+      export CUDA_VISIBLE_DEVICES="${GPU_ARRAY[$g]}"
+      PYTHONWARNINGS=ignore::UserWarning \
+      XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 python -m evaluation.robotwin.eval_polict_client_openpi \
+        --config "$ROBOTWIN_ROOT/policy/$policy_name/deploy_policy.yml" \
+        --overrides \
+        --task_name "$task_g" \
+        --task_config "$task_config" \
+        --train_config_name "$train_config_name" \
+        --model_name "$model_name" \
+        --ckpt_setting "$model_name" \
+        --seed "$seed" \
+        --policy_name "$policy_name" \
+        --save_root "$save_root" \
+        --video_guidance_scale 5 \
+        --action_guidance_scale 1 \
+        --test_num "$test_num" \
+        --port "$port_g"
+    ) &
+    PIDS+=($!)
+  done
+  for p in "${PIDS[@]}"; do wait "$p" || true; done  # a failing client does not abort the others
+fi
+
+echo ">>> Eval 结束，结果见: $save_root"
diff --git a/script/run_i2va_single_gpu.sh b/script/run_i2va_single_gpu.sh
new file mode 100644
index 0000000..da90bd7
--- /dev/null
+++ b/script/run_i2va_single_gpu.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Run Image-to-Video-Action inference on a single GPU (for debugging / getting a local run working).
+# Set the model path and prepare the first-frame image beforehand; see INFERENCE.md.
+# Optional environment overrides: NGPU, CONFIG_NAME, LINGBOT_VA_MODEL_PATH, MASTER_PORT (default 29501).
+# Run from the repository root: bash script/run_i2va_single_gpu.sh
+
+set -e
+
+NGPU=${NGPU:-1}
+CONFIG_NAME=${CONFIG_NAME:-robotwin_i2av}
+# Model directory: must contain the vae / tokenizer / text_encoder / transformer sub-directories
+export LINGBOT_VA_MODEL_PATH=${LINGBOT_VA_MODEL_PATH:-"/path/to/pretrained/model"}
+
+if [ "$LINGBOT_VA_MODEL_PATH" = "/path/to/pretrained/model" ]; then
+    echo "Error: 请设置环境变量 LINGBOT_VA_MODEL_PATH 为已下载的 LingBot-VA 模型目录"
+    echo "  export LINGBOT_VA_MODEL_PATH=/path/to/lingbot-va-base"
+    echo "  并确保该目录下存在: vae/ tokenizer/ text_encoder/ transformer/"
+    exit 1
+fi
+
+if [ ! -d "$LINGBOT_VA_MODEL_PATH/transformer" ]; then
+    echo "Error: 未找到 $LINGBOT_VA_MODEL_PATH/transformer"
+    echo "  请从 HuggingFace/ModelScope 下载 lingbot-va-base 并解压到该路径"
+    exit 1
+fi
+
+# Before inference, set attn_mode in transformer/config.json to "torch" or "flashattn" (it must not be "flex")
+echo "Config: NGPU=$NGPU CONFIG_NAME=$CONFIG_NAME"
+echo "Model:  $LINGBOT_VA_MODEL_PATH"
+echo ""
+
+export TOKENIZERS_PARALLELISM=false
+cd "$(dirname "$0")/.."
+
+PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" python -m torch.distributed.run \
+    --nproc_per_node="$NGPU" \
+    --master_port="${MASTER_PORT:-29501}" \
+    -m wan_va.wan_va_server --config-name "$CONFIG_NAME" "$@"
diff --git a/script/setup/install_flash_attn_cu124.sh b/script/setup/install_flash_attn_cu124.sh
new file mode 100755
index 0000000..fd70624
--- /dev/null
+++ b/script/setup/install_flash_attn_cu124.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+# Fix flash_attn / PyTorch 2.6.0+cu124 ABI compatibility at the root: install a matching wheel, or build from source with the correct ABI.
+# Usage: bash script/setup/install_flash_attn_cu124.sh [conda_env_name]   (default env: lingbot-va)
+# Requires: torch==2.6.0+cu124 already installed (with torch._C._GLIBCXX_USE_CXX11_ABI == False)
+
+set -e
+
+CONDA_ENV="${1:-lingbot-va}"
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"  # this script lives in script/setup/, so the repo root is two levels up
+
+# Initialise conda for this shell
+CONDA_BASE="${CONDA_EXE%/bin/conda}"
+[[ -z "$CONDA_BASE" ]] && CONDA_BASE="$(conda info --base 2>/dev/null)" || true
+if [[ -n "$CONDA_BASE" && -f "$CONDA_BASE/etc/profile.d/conda.sh" ]]; then
+  source "$CONDA_BASE/etc/profile.d/conda.sh"
+fi
+
+echo "=== 检查当前 PyTorch 与 ABI ==="
+conda activate "$CONDA_ENV"
+python -c "
+import torch
+v = torch.__version__
+cuda = torch.version.cuda
+abi = torch._C._GLIBCXX_USE_CXX11_ABI
+print(f'PyTorch: {v}, CUDA: {cuda}, CXX11_ABI: {abi}')
+if not v.startswith('2.6') or cuda != '12.4':
+    print('Warning: 本脚本针对 torch 2.6.x+cu124 与 cxx11abiFALSE。当前环境可能不匹配。')
+if abi is not False:
+    print('Warning: 官方 PyTorch wheel 通常为 CXX11_ABI=False。若用 cxx11abiTRUE 的 flash_attn 会报 undefined symbol。')
+"
+
+echo ""
+echo ">>> 卸载已有 flash-attn（若存在）"
+pip uninstall -y flash-attn 2>/dev/null || true
+
+# Option 1: community prebuilt wheel (torch2.6 + cu124 + cp310; manylinux, same ABI as the official PyTorch build)
+# Source: https://github.com/mjun0812/flash-attention-prebuild-wheels/releases (v0.7.16)
+# 2.7.4.post1 was confirmed compatible with torch 2.6.0+cu124 in issue #1783; mjun0812's 2.7.4+cu124torch2.6 is the same combination
+WHEEL_URL_274="https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.16/flash_attn-2.7.4+cu124torch2.6-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl"
+
+echo ""
+echo ">>> 方案 1: 安装预编译 wheel (2.7.4+cu124torch2.6, cp310)"
+if pip install --no-cache-dir "$WHEEL_URL_274"; then
+  echo ""
+  echo ">>> 验证 flash_attn 导入"
+  if python -c "from flash_attn import flash_attn_func; print('flash_attn 导入成功')"; then
+    echo ""
+    echo "=== 安装成功。可将 transformer 的 attn_mode 设为 flashattn 以使用 Flash Attention。 ==="
+    exit 0
+  fi
+  echo ">>> 预编译 wheel 导入失败（多为 ABI 不匹配），尝试方案 2"
+  pip uninstall -y flash-attn 2>/dev/null || true
+fi
+
+# Option 2: build from source, forcing the old ABI that PyTorch uses (CXX11_ABI=0)
+# Official PyTorch wheels use _GLIBCXX_USE_CXX11_ABI=0, hence FALSE here
+echo ""
+echo ">>> 方案 2: 从源码编译 flash-attn (FLASH_ATTENTION_FORCE_CXX11_ABI=FALSE)"
+export FLASH_ATTENTION_FORCE_CXX11_ABI="FALSE"
+export FLASH_ATTENTION_FORCE_BUILD="TRUE"
+export MAX_JOBS="${MAX_JOBS:-4}"
+
+pip install --no-cache-dir --no-build-isolation "flash-attn>=2.6.3,<2.8"
+
+echo ""
+echo ">>> 验证 flash_attn 导入"
+if python -c "from flash_attn import flash_attn_func; print('flash_attn 导入成功')"; then
+  echo ""
+  echo "=== 安装成功。可将 transformer 的 attn_mode 设为 flashattn。 ==="
+  exit 0
+fi
+
+echo ""
+echo "=== 安装或导入仍失败。请检查: 1) CUDA/nvcc 可用 2) 与当前 PyTorch 版本完全一致。 ==="
+exit 1
diff --git a/script/setup/setup_cu124_mirror.md b/script/setup/setup_cu124_mirror.md
new file mode 100644
index 0000000..0daec25
--- /dev/null
+++ b/script/setup/setup_cu124_mirror.md
@@ -0,0 +1,47 @@
+# LingBot-VA 环境：国内镜像安装 PyTorch (CUDA 12.4)
+
+阿里云 cu124 是**目录列表页**，不是 pip 的 simple index，不能用 `--index-url`，要用 **`--find-links`**。
+
+## 1. 激活环境后安装（推荐）
+
+**阿里云镜像（--find-links）：**
+```bash
+conda activate lingbot-va
+pip install --upgrade pip
+pip install torch==2.6.0+cu124 torchvision==0.21.0+cu124 torchaudio==2.6.0+cu124 \
+  --find-links https://mirrors.aliyun.com/pytorch-wheels/cu124/
+```
+
+**不指定版本（装镜像站里最新）：**
+```bash
+conda activate lingbot-va
+pip install torch torchvision torchaudio \
+  --find-links https://mirrors.aliyun.com/pytorch-wheels/cu124/
+```
+
+**南京大学镜像（若支持 find-links 可试）：**
+```bash
+pip install torch torchvision torchaudio \
+  --find-links https://mirror.nju.edu.cn/pytorch-wheels/cu124/
+```
+
+## 2. 用脚本时走国内镜像
+
+```bash
+cd /path/to/lingbot-va  # 仓库根目录
+# 默认已用阿里云 --find-links；可改镜像：
+# export PYTORCH_MIRROR=https://mirror.nju.edu.cn/pytorch-wheels/cu124/
+bash script/setup/setup_env_cu124.sh
+```
+
+## 3. 若镜像没有 cu124
+
+部分镜像只同步到 cu121，可改用 cu121 的包（需本机 CUDA 兼容）；阿里云的 cu121 同样是目录列表页，仍须用 `--find-links`：
+```bash
+pip install torch torchvision torchaudio --find-links https://mirrors.aliyun.com/pytorch-wheels/cu121/
+```
+
+或从 PyTorch 官方页下载对应 `.whl` 后本地安装：
+```bash
+pip install /path/to/torch-xxx-cu124-*.whl
+```
diff --git a/script/setup/setup_env.sh b/script/setup/setup_env.sh
new file mode 100644
index 0000000..2790627
--- /dev/null
+++ b/script/setup/setup_env.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+# LingBot-VA environment setup script
+# Run from the lingbot-va repo root: bash script/setup/setup_env.sh
+# Optional: bash script/setup/setup_env.sh <conda_env_name>
+
+set -e
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"  # this script lives in script/setup/, so the repo root is two levels up
+cd "$REPO_ROOT"
+
+CONDA_ENV="${1:-lingbot-va}"
+PYTHON_VERSION="3.10.16"
+
+echo "=== LingBot-VA 环境配置 ==="
+echo "仓库根目录: $REPO_ROOT"
+echo "Conda 环境名: $CONDA_ENV"
+echo ""
+
+# 1. Create the conda environment (reused if one with this name already exists)
+if conda env list | grep -q "^${CONDA_ENV} "; then
+    echo ">>> 环境 $CONDA_ENV 已存在，将复用并更新依赖"
+else
+    echo ">>> 创建 conda 环境: $CONDA_ENV (Python $PYTHON_VERSION)"
+    conda create -n "$CONDA_ENV" python="$PYTHON_VERSION" -y
+fi
+
+# 2. Install packages (conda run makes sure each command executes inside the target env)
+echo ">>> 安装 PyTorch 2.9.0 (CUDA 12.6)"
+conda run -n "$CONDA_ENV" pip install --upgrade pip
+conda run -n "$CONDA_ENV" pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0 \
+    --index-url https://download.pytorch.org/whl/cu126
+
+echo ">>> 安装基础依赖 (README)"
+conda run -n "$CONDA_ENV" pip install \
+    websockets einops diffusers==0.36.0 transformers==4.55.2 accelerate msgpack \
+    opencv-python matplotlib ftfy easydict
+
+echo ">>> 安装 flash-attn (可能较慢)"
+conda run -n "$CONDA_ENV" pip install flash-attn --no-build-isolation
+
+echo ">>> 安装 requirements.txt 中的其余依赖"
+conda run -n "$CONDA_ENV" pip install -r "$REPO_ROOT/requirements.txt"
+
+echo ">>> 以可编辑方式安装当前项目"
+conda run -n "$CONDA_ENV" pip install -e .
+
+echo ""
+echo "=== 环境就绪 ==="
+echo "激活环境: conda activate $CONDA_ENV"
+echo ""
+echo "可选环境变量（按需设置）："
+echo "  # RoboTwin 测评时指定测评仓库路径"
+echo "  export ROBOTWIN_ROOT=/mnt/users/wangyuxuan-20250915/EAI/RoboTwin"
+echo ""
+echo "  # Post-training 训练时指定 LeRobot 数据集路径"
+echo "  export LINGBOT_VA_DATASET_PATH=$REPO_ROOT/robotwin-clean-and-aug-lerobot"
+echo "  # 若数据集尚未下载，可运行: python script/download/download_dataset.py"
+echo ""
+echo "  # 推理/测评时指定模型路径（若未用默认 lingbot-va-base）"
+echo "  export LINGBOT_VA_MODEL_PATH=/path/to/your/model"
+echo ""
diff --git a/script/setup/setup_env_cu124.sh b/script/setup/setup_env_cu124.sh
new file mode 100755
index 0000000..3e4560c
--- /dev/null
+++ b/script/setup/setup_env_cu124.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# LingBot-VA environment setup (CUDA 12.4 compatible; avoids the official Anaconda channels, so no ToS acceptance is needed)
+# Run from the lingbot-va repo root: bash script/setup/setup_env_cu124.sh
+# Optional: bash script/setup/setup_env_cu124.sh <conda_env_name>
+
+set -e
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"  # this script lives in script/setup/, so the repo root is two levels up
+cd "$REPO_ROOT"
+
+CONDA_ENV="${1:-lingbot-va}"
+PYTHON_VERSION="3.10"
+
+echo "=== LingBot-VA 环境配置 (CUDA 12.4) ==="
+echo "仓库根目录: $REPO_ROOT"
+echo "Conda 环境名: $CONDA_ENV"
+echo ""
+
+# 1. Create the env from conda-forge only, avoiding the official Anaconda channel ToS
+if conda env list | grep -q "^${CONDA_ENV} "; then
+    echo ">>> 环境 $CONDA_ENV 已存在，将复用并更新依赖"
+else
+    echo ">>> 创建 conda 环境: $CONDA_ENV (Python $PYTHON_VERSION, 仅 conda-forge)"
+    conda create -n "$CONDA_ENV" python="$PYTHON_VERSION" -y -c conda-forge --override-channels
+fi
+
+# 2. Install PyTorch (CUDA 12.4): the Aliyun mirror is a directory listing, so use --find-links rather than --index-url
+PYTORCH_MIRROR="${PYTORCH_MIRROR:-https://mirrors.aliyun.com/pytorch-wheels/cu124/}"
+echo ">>> 安装 PyTorch 2.6.0 (CUDA 12.4), 镜像: $PYTORCH_MIRROR"
+conda run -n "$CONDA_ENV" pip install --upgrade pip
+conda run -n "$CONDA_ENV" pip install torch==2.6.0+cu124 torchvision==0.21.0+cu124 torchaudio==2.6.0+cu124 \
+    --find-links "$PYTORCH_MIRROR"
+
+echo ">>> 安装基础依赖 (README)"
+conda run -n "$CONDA_ENV" pip install \
+    websockets einops diffusers==0.36.0 transformers==4.55.2 accelerate msgpack \
+    opencv-python matplotlib ftfy easydict
+
+echo ">>> 安装 flash-attn (可能较慢)"
+conda run -n "$CONDA_ENV" pip install flash-attn --no-build-isolation
+
+echo ">>> 安装其余依赖"
+conda run -n "$CONDA_ENV" pip install \
+    numpy==1.26.4 tqdm "imageio[ffmpeg]" safetensors Pillow \
+    lerobot==0.3.3 scipy wandb
+
+echo ">>> 以可编辑方式安装当前项目"
+conda run -n "$CONDA_ENV" pip install -e .
+
+echo ""
+echo "=== 环境就绪 ==="
+echo "激活: conda activate $CONDA_ENV"
+echo "可选: export LINGBOT_VA_MODEL_PATH=$REPO_ROOT/lingbot-va-base"
+echo ""
diff --git a/wan_va/configs/va_demo_cfg.py b/wan_va/configs/va_demo_cfg.py
index 6045eb7..6ad8e25 100644
--- a/wan_va/configs/va_demo_cfg.py
+++ b/wan_va/configs/va_demo_cfg.py
@@ -1,4 +1,5 @@
 # Copyright 2024-2025 The Robbyant Team Authors. All rights reserved.
+import os
 import torch
 from easydict import EasyDict
 
@@ -8,7 +9,9 @@
 va_demo_cfg.update(va_shared_cfg)
 va_shared_cfg.infer_mode = 'server'
 
-va_demo_cfg.wan22_pretrained_model_name_or_path = "/path/to/pretrained/model"
+va_demo_cfg.wan22_pretrained_model_name_or_path = os.environ.get(
+    "LINGBOT_VA_MODEL_PATH", "/path/to/pretrained/model"
+)
 
 va_demo_cfg.attn_window = 30
 va_demo_cfg.frame_chunk_size = 4
diff --git a/wan_va/configs/va_robotwin_cfg.py b/wan_va/configs/va_robotwin_cfg.py
index 5fa703f..4cb1569 100644
--- a/wan_va/configs/va_robotwin_cfg.py
+++ b/wan_va/configs/va_robotwin_cfg.py
@@ -1,4 +1,5 @@
 # Copyright 2024-2025 The Robbyant Team Authors. All rights reserved.
+import os
 from easydict import EasyDict
 
 from .shared_config import va_shared_cfg
@@ -6,10 +7,13 @@
 va_robotwin_cfg = EasyDict(__name__='Config: VA robotwin')
 va_robotwin_cfg.update(va_shared_cfg)
 
-va_robotwin_cfg.wan22_pretrained_model_name_or_path = "/path/to/pretrained/model"
+va_robotwin_cfg.wan22_pretrained_model_name_or_path = os.environ.get(
+    "LINGBOT_VA_MODEL_PATH", "/path/to/pretrained/model"
+)
 
 va_robotwin_cfg.attn_window = 72
-va_robotwin_cfg.frame_chunk_size = 2
+# 与论文一致: K=4 for deployment; 推理 3 steps video (to s=0.6), 10 steps action (to s=1.0)
+va_robotwin_cfg.frame_chunk_size = 4
 va_robotwin_cfg.env_type = 'robotwin_tshape'
 
 va_robotwin_cfg.height = 256
@@ -23,9 +27,10 @@
 va_robotwin_cfg.guidance_scale = 5
 va_robotwin_cfg.action_guidance_scale = 1
 
-va_robotwin_cfg.num_inference_steps = 25
+# 论文: Euler 3 steps video (to s=0.6), 10 steps action (to s=1.0); Video CFG 5.0, Action CFG 1.0
+va_robotwin_cfg.num_inference_steps = 3
 va_robotwin_cfg.video_exec_step = -1
-va_robotwin_cfg.action_num_inference_steps = 50
+va_robotwin_cfg.action_num_inference_steps = 10
 
 va_robotwin_cfg.snr_shift = 5.0
 va_robotwin_cfg.action_snr_shift = 1.0
diff --git a/wan_va/configs/va_robotwin_i2va.py b/wan_va/configs/va_robotwin_i2va.py
index 9e37872..3b94d73 100644
--- a/wan_va/configs/va_robotwin_i2va.py
+++ b/wan_va/configs/va_robotwin_i2va.py
@@ -8,4 +8,6 @@
 va_robotwin_i2va_cfg.input_img_path = 'example/robotwin'
 va_robotwin_i2va_cfg.num_chunks_to_infer = 10
 va_robotwin_i2va_cfg.prompt = 'Grab the medium-sized white mug, rotate it, place it on the table, and hook it onto the smooth dark gray rack.'
-va_robotwin_i2va_cfg.infer_mode = 'i2va'
\ No newline at end of file
+va_robotwin_i2va_cfg.infer_mode = 'i2va'
+# 导出视频时 2 倍上采样，观感更清晰（模型原生 256x320）
+va_robotwin_i2va_cfg.export_scale = 2
\ No newline at end of file
diff --git a/wan_va/configs/va_robotwin_train_cfg.py b/wan_va/configs/va_robotwin_train_cfg.py
index 190a1b0..66f63f3 100644
--- a/wan_va/configs/va_robotwin_train_cfg.py
+++ b/wan_va/configs/va_robotwin_train_cfg.py
@@ -8,7 +8,9 @@
 
 # va_robotwin_train_cfg.resume_from = '/robby/share/Robotics/lilin1/code/Wan_VA_Release/train_out/checkpoints/checkpoint_step_10'
 
-va_robotwin_train_cfg.dataset_path = '/path/to/your/dataset'
+va_robotwin_train_cfg.dataset_path = os.environ.get(
+    "LINGBOT_VA_DATASET_PATH", "/path/to/your/dataset"
+)
 va_robotwin_train_cfg.empty_emb_path = os.path.join(va_robotwin_train_cfg.dataset_path, 'empty_emb.pt')
 va_robotwin_train_cfg.enable_wandb = True
 va_robotwin_train_cfg.load_worker = 16
diff --git a/wan_va/distributed/util.py b/wan_va/distributed/util.py
index 8ad7c29..fb10775 100644
--- a/wan_va/distributed/util.py
+++ b/wan_va/distributed/util.py
@@ -12,10 +12,13 @@ def _configure_model(model, shard_fn, param_dtype, device, eval_mode=True):
     if dist.is_initialized():
         dist.barrier()
 
+    # Unify parameter dtypes before FSDP wrap (FSDP requires uniform dtype).
+    # Some modules may be kept in fp32 by the model (e.g. _keep_in_fp32_modules).
+    model.to(param_dtype)
+
     if dist.is_initialized():
         model = shard_fn(model)
     else:
-        model.to(param_dtype)
         model.to(device)
 
     return model
diff --git a/wan_va/modules/model.py b/wan_va/modules/model.py
index 25b45e5..0c409ae 100644
--- a/wan_va/modules/model.py
+++ b/wan_va/modules/model.py
@@ -26,10 +26,16 @@
 )
 from functools import partial
 
+flash_attn_func = None
 try:
-    from flash_attn_interface import flash_attn_func
-except:
-    from flash_attn import flash_attn_func
+    from flash_attn_interface import flash_attn_func as _flash_attn_func
+    flash_attn_func = _flash_attn_func
+except Exception:
+    try:
+        from flash_attn import flash_attn_func as _flash_attn_func
+        flash_attn_func = _flash_attn_func
+    except Exception:
+        flash_attn_func = None
 
 __all__ = ['WanTransformer3DModel']
 
@@ -302,12 +308,18 @@ def __init__(
         if attn_mode == 'torch':
             self.attn_op = custom_sdpa
         elif attn_mode == 'flashattn':
+            if flash_attn_func is None:
+                raise ImportError(
+                    "attn_mode='flashattn' requires flash-attn, but it is not available "
+                    "or failed to import. Please install a torch-compatible flash-attn "
+                    "or switch attn_mode to 'torch'."
+                )
             self.attn_op = flash_attn_func
         elif attn_mode == 'flex':
             self.attn_op = FlexAttnFunc(cross_attention_dim_head is not None)
         else:
             raise ValueError(
-                f"Unsupported attention mode: {attn_mode}, only support torch and flashattn"
+                f"Unsupported attention mode: {attn_mode}, only support torch, flashattn, and flex"
             )
 
         self.inner_dim = dim_head * heads
diff --git a/wan_va/wan_va_server.py b/wan_va/wan_va_server.py
index 4abaf46..ba0aa28 100644
--- a/wan_va/wan_va_server.py
+++ b/wan_va/wan_va_server.py
@@ -321,13 +321,20 @@ def _prepare_latent_input(self,
                                                           action_mask] *= 0
         return input_dict
 
-    def _encode_obs(self, obs):
+    def _encode_obs(self, obs, profile=False):
+        detail = {
+            'cpu_preprocess': 0.0,
+            'to_vae_device': 0.0,
+            'vae_encode': 0.0,
+            'latent_postprocess': 0.0,
+        }
         images = obs['obs']
         if not isinstance(images, list):
             images = [images]
         if len(images) < 1:
-            return None
+            return (None, detail) if profile else None
         videos = []
+        t0_cpu = time.perf_counter()
         for k_i, k in enumerate(self.job_config.obs_cam_keys):
             if self.env_type == 'robotwin_tshape':
                 if k_i == 0:  # camera high
@@ -345,16 +352,27 @@ def _encode_obs(self, obs):
                                             mode='bilinear',
                                             align_corners=False).unsqueeze(0)
             videos.append(history_video_k)
+        detail['cpu_preprocess'] = time.perf_counter() - t0_cpu
 
         if self.env_type == 'robotwin_tshape':
             videos_high = videos[0] / 255.0 * 2.0 - 1.0
             videos_left_and_right = torch.cat(videos[1:],
                                               dim=0) / 255.0 * 2.0 - 1.0
             vae_device = next(self.streaming_vae.vae.parameters()).device
+            t0_to = time.perf_counter()
+            videos_high = videos_high.to(vae_device).to(self.dtype)
+            videos_left_and_right = videos_left_and_right.to(vae_device).to(self.dtype)
+            if profile and vae_device.type == 'cuda':
+                torch.cuda.synchronize(vae_device)
+            detail['to_vae_device'] = time.perf_counter() - t0_to
+            t0_vae = time.perf_counter()
             enc_out_high = self.streaming_vae.encode_chunk(
-                videos_high.to(vae_device).to(self.dtype))
+                videos_high)
             enc_out_left_and_right = self.streaming_vae_half.encode_chunk(
-                videos_left_and_right.to(vae_device).to(self.dtype))
+                videos_left_and_right)
+            if profile and vae_device.type == 'cuda':
+                torch.cuda.synchronize(vae_device)
+            detail['vae_encode'] = time.perf_counter() - t0_vae
             enc_out = torch.cat([
                 torch.cat(enc_out_left_and_right.split(1, dim=0), dim=-1),
                 enc_out_high
@@ -363,15 +381,28 @@ def _encode_obs(self, obs):
         else:
             videos = torch.cat(videos, dim=0) / 255.0 * 2.0 - 1.0
             vae_device = next(self.streaming_vae.vae.parameters()).device
+            t0_to = time.perf_counter()
             videos_chunk = videos.to(vae_device).to(self.dtype)
+            if profile and vae_device.type == 'cuda':
+                torch.cuda.synchronize(vae_device)
+            detail['to_vae_device'] = time.perf_counter() - t0_to
+            t0_vae = time.perf_counter()
             enc_out = self.streaming_vae.encode_chunk(videos_chunk)
+            if profile and vae_device.type == 'cuda':
+                torch.cuda.synchronize(vae_device)
+            detail['vae_encode'] = time.perf_counter() - t0_vae
 
+        t0_post = time.perf_counter()
         mu, logvar = torch.chunk(enc_out, 2, dim=1)
         latents_mean = torch.tensor(self.vae.config.latents_mean).to(mu.device)
         latents_std = torch.tensor(self.vae.config.latents_std).to(mu.device)
         mu_norm = self.normalize_latents(mu, latents_mean, 1.0 / latents_std)
         video_latent = torch.cat(mu_norm.split(1, dim=0), dim=-1)
-        return video_latent.to(self.device)
+        video_latent = video_latent.to(self.device)
+        if profile and self.device.type == 'cuda':
+            torch.cuda.synchronize(self.device)
+        detail['latent_postprocess'] = time.perf_counter() - t0_post
+        return (video_latent, detail) if profile else video_latent
 
     def _reset(self, prompt=None):
         logger.info('Reset.')
@@ -379,6 +410,7 @@ def _reset(self, prompt=None):
         #### Reset all parameters
         self.frame_st_id = 0
         self.init_latent = None
+        self.last_latents = None  # video latents of the previous round, reused when the video pass is skipped (visualization etc.)
         #### clean vae and transformer cache
         self.transformer.clear_cache(self.cache_name)
         self.streaming_vae.clear_cache()
@@ -440,18 +472,41 @@ def _reset(self, prompt=None):
         torch.cuda.empty_cache()
 
     def _infer(self, obs, frame_st_id=0):
+        timing = dict(encode_obs=0.0, video_denoise=0.0, action_denoise=0.0, kv_cache=0.0, other=0.0)
         frame_chunk_size = self.job_config.frame_chunk_size
+        # Whether this round skips video prediction and runs only the action pass (reusing the previous round's KV cache / last_latents).
+        # Only takes effect when frame_st_id != 0 and world_steps_override=0 is passed explicitly; the first frame must run video to build the cache.
+        world_steps_override = obs.get('world_steps_override', None)
+        skip_video = (
+            frame_st_id != 0
+            and world_steps_override is not None
+            and world_steps_override == 0
+            and self.last_latents is not None  # a previous video pass must have stored latents to reuse
+        )
+        if skip_video:
+            logger.info(f"[Skip Video] frame_st_id={frame_st_id}, 本回合仅跑 action，复用上轮 KV cache")
+
         if frame_st_id == 0:
-            init_latent = self._encode_obs(obs)
+            init_latent, encode_detail = self._encode_obs(obs, profile=True)
+            timing['encode_obs'] = sum(encode_detail.values())
+            timing['encode_obs_cpu_preprocess'] = encode_detail['cpu_preprocess']
+            timing['encode_obs_to_vae_device'] = encode_detail['to_vae_device']
+            timing['encode_obs_vae_encode'] = encode_detail['vae_encode']
+            timing['encode_obs_latent_postprocess'] = encode_detail['latent_postprocess']
             self.init_latent = init_latent
+        else:
+            init_latent = self.init_latent
 
-        latents = torch.randn(1,
-                              48,
-                              frame_chunk_size,
-                              self.latent_height,
-                              self.latent_width,
-                              device=self.device,
-                              dtype=self.dtype)
+        if skip_video:
+            latents = self.last_latents  # reuse the previous round's video latents; they are only returned/saved, not re-denoised
+        else:
+            latents = torch.randn(1,
+                                  48,
+                                  frame_chunk_size,
+                                  self.latent_height,
+                                  self.latent_width,
+                                  device=self.device,
+                                  dtype=self.dtype)
         actions = torch.randn(1,
                               self.job_config.action_dim,
                               frame_chunk_size,
@@ -484,42 +539,47 @@ def _infer(self, obs, frame_st_id=0):
         with (
                 torch.no_grad(),
         ):
-            # 1. Video Generation Loop
-            for i, t in enumerate(tqdm(timesteps)):
-                last_step = i == len(timesteps) - 1
-                latent_cond = init_latent[:, :, 0:1].to(
-                    self.dtype) if frame_st_id == 0 else None
-                input_dict = self._prepare_latent_input(
-                    latents,
-                    None,
-                    t,
-                    t,
-                    latent_cond,
-                    None,
-                    frame_st_id=frame_st_id)
-
-                video_noise_pred = self.transformer(
-                    self._repeat_input_for_cfg(input_dict['latent_res_lst']),
-                    update_cache=1 if last_step else 0,
-                    cache_name=self.cache_name,
-                    action_mode=False)
-
-                if not last_step or video_step != -1:
-                    video_noise_pred = data_seq_to_patch(
-                        self.job_config.patch_size, video_noise_pred,
-                        frame_chunk_size, self.latent_height,
-                        self.latent_width, batch_size=2 if self.use_cfg else 1)
-                    if self.job_config.guidance_scale > 1:
-                        video_noise_pred = video_noise_pred[1:] + self.job_config.guidance_scale * (video_noise_pred[:1] - video_noise_pred[1:])
-                    else:
-                        video_noise_pred = video_noise_pred[:1]
-                    latents = self.scheduler.step(video_noise_pred,
-                                                  t,
-                                                  latents,
-                                                  return_dict=False)
-
-                latents[:, :, 0:1] = latent_cond if frame_st_id == 0 else latents[:, :, 0:1]
-
+            # 1. Video Generation Loop (skipped when skip_video is set: the previous round's cache is used as-is and only the action loop runs)
+            t0_video = time.perf_counter()
+            if not skip_video:
+                for i, t in enumerate(tqdm(timesteps)):
+                    last_step = i == len(timesteps) - 1
+                    latent_cond = init_latent[:, :, 0:1].to(
+                        self.dtype) if frame_st_id == 0 else None
+                    input_dict = self._prepare_latent_input(
+                        latents,
+                        None,
+                        t,
+                        t,
+                        latent_cond,
+                        None,
+                        frame_st_id=frame_st_id)
+
+                    video_noise_pred = self.transformer(
+                        self._repeat_input_for_cfg(input_dict['latent_res_lst']),
+                        update_cache=1 if last_step else 0,
+                        cache_name=self.cache_name,
+                        action_mode=False)
+
+                    if not last_step or video_step != -1:
+                        video_noise_pred = data_seq_to_patch(
+                            self.job_config.patch_size, video_noise_pred,
+                            frame_chunk_size, self.latent_height,
+                            self.latent_width, batch_size=2 if self.use_cfg else 1)
+                        if self.job_config.guidance_scale > 1:
+                            video_noise_pred = video_noise_pred[1:] + self.job_config.guidance_scale * (video_noise_pred[:1] - video_noise_pred[1:])
+                        else:
+                            video_noise_pred = video_noise_pred[:1]
+                        latents = self.scheduler.step(video_noise_pred,
+                                                      t,
+                                                      latents,
+                                                      return_dict=False)
+
+                    latents[:, :, 0:1] = latent_cond if frame_st_id == 0 else latents[:, :, 0:1]
+                self.last_latents = latents.detach().clone()  # private copy so a later skip_video round can reuse it
+            timing['video_denoise'] = time.perf_counter() - t0_video  # NOTE(review): no torch.cuda.synchronize() before this read (unlike _encode_obs), so on CUDA this may under-report — confirm
+
+            t0_action = time.perf_counter()
             for i, t in enumerate(tqdm(action_timesteps)):
                 last_step = i == len(action_timesteps) - 1
                 action_cond = torch.zeros(
@@ -558,26 +618,38 @@ def _infer(self, obs, frame_st_id=0):
                                                          return_dict=False)
 
                 actions[:, :, 0:1] = action_cond if frame_st_id == 0 else actions[:, :, 0:1]
+            timing['action_denoise'] = time.perf_counter() - t0_action
 
         actions[:, ~self.action_mask] *= 0
 
+        t0_other = time.perf_counter()
         save_async(latents, os.path.join(self.exp_save_root, f'latents_{frame_st_id}.pt'))
         save_async(actions, os.path.join(self.exp_save_root, f'actions_{frame_st_id}.pt'))
 
         actions = self.postprocess_action(actions)
         torch.cuda.empty_cache()
-        return actions, latents
+        timing['other'] = time.perf_counter() - t0_other
+        return actions, latents, timing
 
     def _compute_kv_cache(self, obs):
+        timing = dict(encode_obs=0.0, video_denoise=0.0, action_denoise=0.0, kv_cache=0.0, other=0.0)
         ### optional async save obs for debug
         self.transformer.clear_pred_cache(self.cache_name)
         save_async(obs['obs'], os.path.join(self.exp_save_root, f'obs_data_{self.frame_st_id}.pt'))
-        latent_model_input = self._encode_obs(obs)
+        logger.info("[KV Cache] 阶段 1/2: 编码观测 (encode_obs, VAE)...")
+        latent_model_input, encode_detail = self._encode_obs(obs, profile=True)
+        timing['encode_obs'] = sum(encode_detail.values())
+        timing['encode_obs_cpu_preprocess'] = encode_detail['cpu_preprocess']
+        timing['encode_obs_to_vae_device'] = encode_detail['to_vae_device']
+        timing['encode_obs_vae_encode'] = encode_detail['vae_encode']
+        timing['encode_obs_latent_postprocess'] = encode_detail['latent_postprocess']
+        logger.info(f"[KV Cache] encode_obs 完成, 耗时 {timing['encode_obs']:.2f}s")
         if self.frame_st_id == 0:
             latent_model_input = torch.cat(
                 [self.init_latent, latent_model_input],
                 dim=2) if latent_model_input is not None else self.init_latent
 
+        t0_prep = time.perf_counter()
         action_model_input = self.preprocess_action(obs['state'])
         action_model_input = action_model_input.to(latent_model_input)
         logger.info(
@@ -586,10 +658,13 @@ def _compute_kv_cache(self, obs):
         input_dict = self._prepare_latent_input(latent_model_input,
                                                 action_model_input,
                                                 frame_st_id=self.frame_st_id)
+        timing['other'] = time.perf_counter() - t0_prep
 
         with (
                 torch.no_grad(),
         ):
+            logger.info("[KV Cache] 阶段 2/2: 更新 KV cache (transformer)...")
+            t0_kv = time.perf_counter()
             self.transformer(self._repeat_input_for_cfg(input_dict['latent_res_lst']),
                              update_cache=2,
                              cache_name=self.cache_name,
@@ -599,8 +674,11 @@ def _compute_kv_cache(self, obs):
                              update_cache=2,
                              cache_name=self.cache_name,
                              action_mode=True)
+            timing['kv_cache'] = time.perf_counter() - t0_kv
+            logger.info(f"[KV Cache] kv_cache 完成, 耗时 {timing['kv_cache']:.2f}s (本请求总耗时 encode_obs+kv_cache 即上述两阶段)")
         torch.cuda.empty_cache()
         self.frame_st_id += latent_model_input.shape[2]
+        return timing
 
     @torch.no_grad()
     def infer(self, obs):
@@ -614,14 +692,31 @@ def infer(self, obs):
             return dict()
         elif compute_kv_cache:
             logger.info(
-                f"################# Compute KV Cache #################")
-            self._compute_kv_cache(obs)
-            return dict()
+                "################# Compute KV Cache（本请求先做 encode_obs 再做 kv_cache）#################")
+            kv_timing = self._compute_kv_cache(obs)
+            return dict(timing=kv_timing)
         else:
             logger.info(f"################# Infer One Chunk #################")
-            action, _ = self._infer(obs, frame_st_id=self.frame_st_id)
-            return dict(action=action)
-    
+            action, latents, infer_timing = self._infer(obs, frame_st_id=self.frame_st_id)
+            out = dict(action=action, timing=infer_timing)
+            # Optional: return the decoded predicted video of the current chunk so the eval side can build comparison visualizations (decode time is added to 'other')
+            save_vis = obs.get('save_visualization', False) or obs.get(b'save_visualization', False)  # accept the flag under either a str or a bytes key
+            if save_vis and latents is not None:
+                if getattr(self, 'video_processor', None) is None:  # create the processor lazily on first use
+                    self.video_processor = VideoProcessor(vae_scale_factor=1)
+                if self.enable_offload:
+                    self.vae = self.vae.to(self.device)
+                try:
+                    t0_decode = time.perf_counter()
+                    video = self.decode_one_video(latents, 'np')[0]
+                    out['video'] = video.cpu().numpy() if hasattr(video, 'cpu') else np.asarray(video)
+                    infer_timing['other'] = infer_timing.get('other', 0) + (time.perf_counter() - t0_decode)  # same dict object as out['timing'], so the response sees the decode time
+                    logger.info(f"Returning predicted video chunk, shape {out['video'].shape}")
+                finally:  # move the VAE back to CPU even if decoding raised
+                    if self.enable_offload:
+                        self.vae = self.vae.to('cpu')
+            return out
+
     def decode_one_video(self, latents, output_type):
         latents = latents.to(self.vae.dtype)
         latents_mean = (
@@ -651,7 +746,7 @@ def generate(self):
         pred_latent_lst = []
         pred_action_lst = []
         for chunk_id in range(self.job_config.num_chunks_to_infer):
-            actions, latents = self._infer(init_obs, frame_st_id=(chunk_id * self.job_config.frame_chunk_size))
+            actions, latents, _ = self._infer(init_obs, frame_st_id=(chunk_id * self.job_config.frame_chunk_size))
             actions = torch.from_numpy(actions)
             pred_latent_lst.append(latents)
             pred_action_lst.append(actions)
@@ -671,11 +766,28 @@ def generate(self):
             self.vae = self.vae.to(self.device).to(self.dtype)
         
         decoded_video = self.decode_one_video(pred_latent, 'np')[0]
+        # Optional upscale for viewing (job_config.export_scale, default 2): model outputs at 256x320 (robotwin), upscale so demo is less pixelated
+        export_scale = getattr(self.job_config, 'export_scale', 2)
+        if export_scale > 1:
+            _, h, w, _ = decoded_video.shape
+            vid = torch.from_numpy(decoded_video).float().permute(0, 3, 1, 2)  # (T, C, H, W)
+            vid = F.interpolate(
+                vid,
+                size=(int(h * export_scale), int(w * export_scale)),  # int() keeps fractional scales valid
+                mode='bilinear',
+                align_corners=False)
+            # Cast back to the source dtype: a hard uint8 cast would floor float frames in [0, 1] to 0 (black video); presumably 'np' output is float — TODO confirm
+            decoded_video = vid.permute(0, 2, 3, 1).numpy().astype(decoded_video.dtype)
         export_to_video(decoded_video, os.path.join(self.save_root, "demo.mp4"), fps=10)
 
 def run(args):    
     
     config = VA_CONFIGS[args.config_name]
+    # Force-control offload through the ENABLE_OFFLOAD environment variable: 0/false/no/off turns offload off (VAE stays resident on the GPU, usually faster)
+    env_enable_offload = os.environ.get("ENABLE_OFFLOAD")
+    if env_enable_offload is not None:
+        config.enable_offload = str(env_enable_offload).lower() not in ("0", "false", "no", "off")  # any other value enables offload
+        logger.info(f"[Config Override] enable_offload <- {config.enable_offload} (from ENABLE_OFFLOAD={env_enable_offload})")
     port = config.port if args.port is None else args.port
     if args.save_root is not None:
         config.save_root = args.save_root