Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# 不纳入版本控制:模型与测评结果(仅保留核心代码)
models/
results/

# 日志与临时
logs/
*.log

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
*.egg-info/
*.egg
.eggs/
dist/
build/

# 虚拟环境 / conda
.venv/
venv/
env/

# IDE / 编辑
.idea/
.vscode/
*.swp
*.swo

# Jupyter
.ipynb_checkpoints/

# 系统
.DS_Store
Thumbs.db
160 changes: 123 additions & 37 deletions evaluation/robotwin/eval_polict_client_openpi.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import sys
import os
import subprocess
import time
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
import cv2
from pathlib import Path

robowin_root = Path("/path/to/your/robowin")
robowin_root = Path(os.environ.get("ROBOTWIN_ROOT", "/path/to/your/robowin"))
if str(robowin_root) not in sys.path:
sys.path.insert(0, str(robowin_root))

Expand Down Expand Up @@ -205,57 +206,76 @@ def save_comparison_video(real_obs_list, imagined_video, action_history, save_pa

print(f"Saving video: Real {n_real} frames, Imagined {n_imagined} frames...")

final_frames = []
# 首帧确定统一尺寸,保证整段视频每帧一致,避免 imageio "All images should have same size"
obs0 = real_obs_list[0]
base_h = obs0["observation.images.cam_high"].shape[0]

def resize_h(img, h):
if img.shape[0] != h:
w = int(img.shape[1] * h / img.shape[0])
img = cv2.resize(img, (w, h))
img = np.ascontiguousarray(img)
if img.dtype != np.uint8:
img = (img * 255).astype(np.uint8)
return img

# Real 行为左-中-右:High | Left wrist | Right wrist,与 obs_cam_keys 顺序一致
part_high_0 = resize_h(obs0["observation.images.cam_high"], base_h)
part_left_0 = resize_h(obs0["observation.images.cam_left_wrist"], base_h)
part_right_0 = resize_h(obs0["observation.images.cam_right_wrist"], base_h)
w_high, w_left, w_right = part_high_0.shape[1], part_left_0.shape[1], part_right_0.shape[1]
row_real0 = np.hstack([part_high_0, part_left_0, part_right_0])
row_real0 = add_title_bar(row_real0, "Real Observation (High / Left / Right)")
target_width = row_real0.shape[1]
real_row_h = row_real0.shape[0]
imagined_row_h = 256

final_frames = []
for i in range(n_frames):
obs = real_obs_list[i]
cam_high = obs["observation.images.cam_high"]
cam_left = obs["observation.images.cam_left_wrist"]
cam_right = obs["observation.images.cam_right_wrist"]

base_h = cam_high.shape[0]

def resize_h(img, h):
if img.shape[0] != h:
w = int(img.shape[1] * h / img.shape[0])
img = cv2.resize(img, (w, h))
img = np.ascontiguousarray(img)
if img.dtype != np.uint8:
img = (img * 255).astype(np.uint8)
return img

row_real = np.hstack([
resize_h(cam_high, base_h),
resize_h(cam_left, base_h),
resize_h(cam_right, base_h)
resize_h(obs["observation.images.cam_high"], base_h),
resize_h(obs["observation.images.cam_left_wrist"], base_h),
resize_h(obs["observation.images.cam_right_wrist"], base_h),
])

row_real = np.ascontiguousarray(row_real)

row_real = add_title_bar(row_real, "Real Observation (High / Left / Right)")

target_width = row_real.shape[1]
if row_real.shape[1] != target_width or row_real.shape[0] != real_row_h:
row_real = cv2.resize(row_real, (target_width, real_row_h))

if imagined_video is not None and i < n_imagined:
img_frame = imagined_video[i]
if img_frame.dtype != np.uint8 and img_frame.max() <= 1.0001:
img_frame = (img_frame * 255).astype(np.uint8)
elif img_frame.dtype != np.uint8:
img_frame = img_frame.astype(np.uint8)

h = int(img_frame.shape[0] * target_width / img_frame.shape[1])
row_imagined = cv2.resize(img_frame, (target_width, h))
# 与 real 一致:左-中-右 = High | Left wrist | Right wrist(模型输出顺序与 obs_cam_keys 一致)
H_im, W_im = img_frame.shape[0], img_frame.shape[1]
if W_im >= 3:
third = W_im // 3
im_high = cv2.resize(img_frame[:, 0:third], (w_high, imagined_row_h))
im_left = cv2.resize(img_frame[:, third : 2 * third], (w_left, imagined_row_h))
im_right = cv2.resize(img_frame[:, 2 * third :], (w_right, imagined_row_h))
row_imagined = np.hstack([im_high, im_left, im_right])
else:
row_imagined = cv2.resize(img_frame, (target_width, imagined_row_h))
else:
row_imagined = np.zeros((300, target_width, 3), dtype=np.uint8)
cv2.putText(row_imagined, "Coming soon", (target_width//2 - 100, 150),
row_imagined = np.zeros((imagined_row_h, target_width, 3), dtype=np.uint8)
cv2.putText(row_imagined, "Coming soon", (target_width//2 - 100, imagined_row_h//2 - 20),
cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 100, 100), 2)

row_imagined = np.ascontiguousarray(row_imagined)
row_imagined = add_title_bar(row_imagined, "Imagined Video Stream")
row_imagined = add_title_bar(row_imagined, "Imagined Video (High / Left / Right)")
full_frame = np.vstack([row_real, row_imagined])
full_frame = np.ascontiguousarray(full_frame)
final_frames.append(full_frame)

# 统一为第一帧尺寸,防止 add_title_bar 等导致细微差异
ref_h, ref_w = final_frames[0].shape[0], final_frames[0].shape[1]
for idx in range(len(final_frames)):
if final_frames[idx].shape[0] != ref_h or final_frames[idx].shape[1] != ref_w:
final_frames[idx] = cv2.resize(final_frames[idx], (ref_w, ref_h))

imageio.mimsave(save_path, final_frames, fps=fps)
print(f"Combined video saved to: {save_path}")

Expand Down Expand Up @@ -305,6 +325,7 @@ def main(usr_args):
policy_name = usr_args["policy_name"]
video_guidance_scale = usr_args["video_guidance_scale"]
action_guidance_scale = usr_args["action_guidance_scale"]
save_visualization = bool(usr_args.get("save_visualization", True))
instruction_type = 'seen'
save_dir = None
video_save_dir = None
Expand Down Expand Up @@ -397,7 +418,7 @@ def get_embodiment_file(embodiment_type):
test_num = usr_args["test_num"]


model = WebsocketClientPolicy(port=usr_args['port'])
model = WebsocketClientPolicy(host="127.0.0.1", port=usr_args['port'])

st_seed, suc_num = eval_policy(task_name,
TASK_ENV,
Expand All @@ -407,7 +428,7 @@ def get_embodiment_file(embodiment_type):
test_num=test_num,
video_size=video_size,
instruction_type=instruction_type,
save_visualization=True,
save_visualization=save_visualization,
video_guidance_scale=video_guidance_scale,
action_guidance_scale=action_guidance_scale)
suc_nums.append(suc_num)
Expand Down Expand Up @@ -462,7 +483,7 @@ def eval_policy(task_name,
now_id = 0
succ_seed = 0
suc_test_seed_list = []

all_trajectory_timings = [] # 每条 trajectory 的耗时汇总,用于最后样本级统计

now_seed = st_seed
clear_cache_freq = args["clear_cache_freq"]
Expand Down Expand Up @@ -545,6 +566,7 @@ def eval_policy(task_name,
full_obs_list = []
gen_video_list = []
full_action_history = []
trajectory_timings = [] # 当前 trajectory 每次 infer 的 timing

initial_obs = TASK_ENV.get_obs()
inint_eef_pose = initial_obs['endpose']['left_endpose'] + \
Expand All @@ -561,16 +583,21 @@ def eval_policy(task_name,
first_obs = format_obs(observation, prompt)

ret = model.infer(dict(obs=first_obs, prompt=prompt, save_visualization=save_visualization, video_guidance_scale=video_guidance_scale, action_guidance_scale=action_guidance_scale)) #(TASK_ENV, model, observation)
if 'timing' in ret:
trajectory_timings.append(ret['timing'])
action = ret['action']
if 'video' in ret:
imagined_video = ret['video']
gen_video_list.append(imagined_video)
print(f" [eval] received predicted video chunk, shape {getattr(imagined_video, 'shape', '?')}")
key_frame_list = []

assert action.shape[2] % 4 == 0
action_per_frame = action.shape[2] // 4

start_idx = 1 if first else 0
t0_env_step = time.perf_counter()
env_step_count = 0
for i in range(start_idx, action.shape[1]):
for j in range(action.shape[2]):
raw_action_step = action[:, i, j].flatten()
Expand All @@ -597,28 +624,66 @@ def eval_policy(task_name,
else:
raise NotImplementedError
TASK_ENV.take_action(ee_action, action_type='ee')
env_step_count += 1

if (j+1) % action_per_frame == 0:
obs = format_obs(TASK_ENV.get_obs(), prompt)
full_obs_list.append(obs)
key_frame_list.append(obs)
trajectory_timings.append(
dict(env_step_update=(time.perf_counter() - t0_env_step), env_step_count=env_step_count)
)

first = False

model.infer(dict(obs = key_frame_list, compute_kv_cache=True, imagine=False, save_visualization=save_visualization, state=action))

ret_kv = model.infer(dict(obs = key_frame_list, compute_kv_cache=True, imagine=False, save_visualization=save_visualization, state=action))
if 'timing' in ret_kv:
trajectory_timings.append(ret_kv['timing'])

if TASK_ENV.eval_success:
succ = True
break


# 当前 trajectory 耗时汇总与占比(以 trajectory 为单位输出)
if trajectory_timings:
keys = ['encode_obs', 'video_denoise', 'action_denoise', 'kv_cache', 'env_step_update', 'other']
summed = {k: sum(t.get(k, 0.0) for t in trajectory_timings) for k in keys}
total = sum(summed.values()) or 1e-9
pct = {k: 100.0 * summed[k] / total for k in keys}
total_env_steps = int(sum(t.get('env_step_count', 0) for t in trajectory_timings))
avg_env_step = summed['env_step_update'] / max(total_env_steps, 1)
print(f"\033[90m[Trajectory {TASK_ENV.test_num + 1}] 耗时(秒): encode_obs={summed['encode_obs']:.2f}, video_denoise={summed['video_denoise']:.2f}, action_denoise={summed['action_denoise']:.2f}, kv_cache={summed['kv_cache']:.2f}, env_step_update={summed['env_step_update']:.2f}, other={summed['other']:.2f} | 占比(%): encode_obs={pct['encode_obs']:.1f}%, video_denoise={pct['video_denoise']:.1f}%, action_denoise={pct['action_denoise']:.1f}%, kv_cache={pct['kv_cache']:.1f}%, env_step_update={pct['env_step_update']:.1f}%, other={pct['other']:.1f}% | env_step均值={avg_env_step*1000:.1f}ms ({total_env_steps} steps)\033[0m")
detail_keys = [
'encode_obs_cpu_preprocess',
'encode_obs_to_vae_device',
'encode_obs_vae_encode',
'encode_obs_latent_postprocess',
]
detail_summed = {k: sum(t.get(k, 0.0) for t in trajectory_timings) for k in detail_keys}
encode_total = summed['encode_obs'] or 1e-9
detail_pct = {k: 100.0 * detail_summed[k] / encode_total for k in detail_keys}
print(
"\033[90m"
f" └─ encode_obs细分(秒): cpu_preprocess={detail_summed['encode_obs_cpu_preprocess']:.2f}, "
f"to_vae_device={detail_summed['encode_obs_to_vae_device']:.2f}, "
f"vae_encode={detail_summed['encode_obs_vae_encode']:.2f}, "
f"latent_postprocess={detail_summed['encode_obs_latent_postprocess']:.2f} "
f"| 占encode_obs(%): cpu_preprocess={detail_pct['encode_obs_cpu_preprocess']:.1f}%, "
f"to_vae_device={detail_pct['encode_obs_to_vae_device']:.1f}%, "
f"vae_encode={detail_pct['encode_obs_vae_encode']:.1f}%, "
f"latent_postprocess={detail_pct['encode_obs_latent_postprocess']:.1f}%"
"\033[0m"
)
summed['env_step_count'] = total_env_steps
all_trajectory_timings.append(summed)

vis_dir = Path(args['save_root']) / f'stseed-{st_seed}' / 'visualization' / task_name
vis_dir.mkdir(parents=True, exist_ok=True)
video_name = f"{TASK_ENV.test_num}_{prompt.replace(' ', '_')}_{succ}.mp4"
out_img_file = vis_dir / video_name
save_comparison_video(
real_obs_list=full_obs_list,
imagined_video=None, #gen_video_list,
imagined_video=gen_video_list if gen_video_list else None,
action_history=full_action_history,
save_path=str(out_img_file),
fps=15 # Suggest adjusting fps based on simulation step
Expand Down Expand Up @@ -655,6 +720,25 @@ def eval_policy(task_name,
)
now_seed += 1

# 以样本为单位输出时间占比统计
if all_trajectory_timings:
keys = ['encode_obs', 'video_denoise', 'action_denoise', 'kv_cache', 'env_step_update', 'other']
total_summed = {k: sum(t.get(k, 0.0) for t in all_trajectory_timings) for k in keys}
total_sec = sum(total_summed.values()) or 1e-9
total_pct = {k: 100.0 * total_summed[k] / total_sec for k in keys}
n_samples = len(all_trajectory_timings)
total_env_steps = int(sum(t.get('env_step_count', 0) for t in all_trajectory_timings))
avg_env_step = total_summed['env_step_update'] / max(total_env_steps, 1)
print("\n\033[97m======== 样本级时间占比统计 ({} 条 trajectory) ========\033[0m".format(n_samples))
print("\033[97m总耗时(秒): encode_obs={:.2f}, video_denoise={:.2f}, action_denoise={:.2f}, kv_cache={:.2f}, env_step_update={:.2f}, other={:.2f}\033[0m".format(
total_summed['encode_obs'], total_summed['video_denoise'], total_summed['action_denoise'],
total_summed['kv_cache'], total_summed['env_step_update'], total_summed['other']))
print("\033[97m占比(%): encode_obs={:.1f}%, video_denoise={:.1f}%, action_denoise={:.1f}%, kv_cache={:.1f}%, env_step_update={:.1f}%, other={:.1f}%\033[0m".format(
total_pct['encode_obs'], total_pct['video_denoise'], total_pct['action_denoise'],
total_pct['kv_cache'], total_pct['env_step_update'], total_pct['other']))
print("\033[97menv_step 平均耗时: {:.1f} ms/step ({} steps)\033[0m".format(avg_env_step * 1000.0, total_env_steps))
print("\033[97m========================================================\033[0m\n")

return now_seed, TASK_ENV.suc


Expand All @@ -667,6 +751,8 @@ def parse_args_and_config():
parser.add_argument("--video_guidance_scale", type=float, default=5.0)
parser.add_argument("--action_guidance_scale", type=float, default=5.0)
parser.add_argument("--test_num", type=int, default=100)
parser.add_argument("--save_visualization", type=lambda x: str(x).lower() not in ('0', 'false', 'no', 'off'), default=True,
help='是否渲染并保存预测视频(VAE 解码+对比视频),关闭可显著加速。传 0 或 false 关闭。')
args = parser.parse_args()

with open(args.config, "r", encoding="utf-8") as f:
Expand Down
11 changes: 11 additions & 0 deletions example/robotwin/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# 本目录用于 Image-to-Video-Action (i2va) 推理的「首帧图像」输入。
# 使用 robotwin_i2av 配置时,需要以下 3 个 PNG 文件(与 obs_cam_keys 对应):
#
# observation.images.cam_high.png
# observation.images.cam_left_wrist.png
# observation.images.cam_right_wrist.png
#
# 图像尺寸会被代码自动 resize(如 256x320),用任意尺寸的 RGB 图即可。
#
# 生成占位图(便于先跑通流程):
# python create_dummy_images.py
36 changes: 36 additions & 0 deletions example/robotwin/create_dummy_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python3
"""Generate the placeholder first-frame images required by robotwin i2va, so the inference pipeline can be smoke-tested before real observations exist."""
import os

try:
from PIL import Image
import numpy as np
except ImportError:
print("请先安装: pip install Pillow numpy")
raise

# Must stay consistent with va_robotwin_cfg.obs_cam_keys (same keys, same order).
OBS_CAM_KEYS = [
    "observation.images.cam_high",
    "observation.images.cam_left_wrist",
    "observation.images.cam_right_wrist",
]
# robotwin main-camera frame size (pixels): HEIGHT x WIDTH.
HEIGHT, WIDTH = 256, 320

def main():
    """Create one flat-color placeholder PNG per camera key, next to this script.

    The images let the i2va inference flow run end-to-end before real
    first-frame observations are available. File names follow OBS_CAM_KEYS.
    """
    out_dir = os.path.dirname(os.path.abspath(__file__))
    os.makedirs(out_dir, exist_ok=True)
    for idx, cam_key in enumerate(OBS_CAM_KEYS):
        # Distinct, non-zero flat color per camera so no image is all zeros
        # (an all-zero frame could trigger numeric edge cases downstream).
        channels = np.array(
            [30 + idx * 60, 60 + idx * 40, 90 + idx * 30], dtype=np.uint8
        )
        # Broadcast (H, W, 1) * (3,) -> (H, W, 3) uint8 frame.
        frame = np.ones((HEIGHT, WIDTH, 1), dtype=np.uint8) * channels
        out_path = os.path.join(out_dir, cam_key + ".png")
        Image.fromarray(frame).save(out_path)
        print(f"Written: {out_path}")
    print("Done. 可用 script/run_i2va_single_gpu.sh 跑 i2va 推理.")

# Script entry point: only generate the placeholder images when run directly.
if __name__ == "__main__":
    main()
Binary file modified example/robotwin/observation.images.cam_high.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example/robotwin/observation.images.cam_left_wrist.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example/robotwin/observation.images.cam_right_wrist.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading