在正式测试前添加warm up

2026-05-16 14:53:58 +00:00
parent d86aeb2df0
commit 02080e2564
5 changed files with 74 additions and 12 deletions
--- a/config/eval/cube.yaml
+++ b/config/eval/cube.yaml
@@ -38,6 +38,9 @@ eval:
  eval_budget: 50
  img_size: 224
  save_video: false
  compile_warmup:
    enabled: true
    num_eval: 1
  dataset_name: ogbench/cube_single_expert
  callables:
    # -- set state
--- a/config/eval/pusht.yaml
+++ b/config/eval/pusht.yaml
@@ -33,6 +33,9 @@ eval:
  eval_budget: 50
  img_size: 224
  save_video: false
  compile_warmup:
    enabled: true
    num_eval: 1
  dataset_name: pusht_expert_train
  callables:
    # -- set state
--- a/config/eval/reacher.yaml
+++ b/config/eval/reacher.yaml
@@ -32,6 +32,9 @@ eval:
  eval_budget: 50
  img_size: 224
  save_video: false
  compile_warmup:
    enabled: true
    num_eval: 1
  dataset_name: dmc/reacher_random
  callables:
    # -- set state
--- a/config/eval/tworoom.yaml
+++ b/config/eval/tworoom.yaml
@@ -32,6 +32,9 @@ eval:
  eval_budget: 50
  img_size: 224
  save_video: false
  compile_warmup:
    enabled: true
    num_eval: 1
  dataset_name: tworoom
  callables:
    # -- set state
--- a/eval.py
+++ b/eval.py
@@ -7,6 +7,7 @@ import time
 import traceback
 from contextlib import nullcontext
 from pathlib import Path
 import tempfile
 import hydra
 import numpy as np
@@ -84,6 +85,17 @@ def get_compile_cfg(cfg):
    return compile_cfg
 def get_compile_warmup_cfg(cfg):
    """Return the effective compile warm-up settings as a plain dict.

    Starts from the defaults ``{"enabled": True, "num_eval": 1}`` and overlays
    every ``compile_warmup`` section found in ``cfg``. Two locations are
    honoured, applied in this order:

    1. ``cfg.eval.compile_warmup`` -- the eval YAML configs nest the section
       under ``eval``, next to ``eval_budget`` and ``save_video``.
    2. ``cfg.compile_warmup`` -- the top-level location; its values win when
       both sections define the same key.

    Sections may be OmegaConf containers (converted with interpolations
    resolved) or plain ``dict`` objects.
    """
    warmup_cfg = {
        "enabled": True,
        "num_eval": 1,
    }
    eval_section = cfg.get("eval")
    sections = (
        eval_section.get("compile_warmup") if eval_section is not None else None,
        cfg.get("compile_warmup"),
    )
    for section in sections:
        if section is None:
            continue
        if isinstance(section, dict):
            # Already a plain mapping; nothing to resolve.
            warmup_cfg.update(section)
        else:
            warmup_cfg.update(OmegaConf.to_container(section, resolve=True))
    return warmup_cfg
 def maybe_compile_inference_target(model, cfg, device):
    compile_cfg = get_compile_cfg(cfg)
    compile_target = "disabled"
@@ -363,23 +375,27 @@ def run_eval_subset(
    if str(device).startswith("cuda") and torch.cuda.is_available():
        torch.cuda.synchronize()
    def evaluate_subset(episodes, start_indices, *, eval_cfg=local_cfg):
        """Evaluate the given episodes from ``dataset`` and return the result.

        Forwards to ``world.evaluate_from_dataset``. The goal offset, budget,
        callables and video flag are read from ``eval_cfg.eval``, and
        ``output_dir`` is passed as the video path.
        """
        settings = eval_cfg.eval
        forwarded = {
            "start_steps": list(start_indices),
            "goal_offset_steps": settings.goal_offset_steps,
            "eval_budget": settings.eval_budget,
            "episodes_idx": list(episodes),
            "callables": OmegaConf.to_container(
                settings.get("callables"), resolve=True
            ),
            "save_video": bool(settings.get("save_video", False)),
            "video_path": output_dir,
        }
        return world.evaluate_from_dataset(dataset, **forwarded)
    start_time = time.time()
    with get_eval_grad_context(solver):
        with profiler_ctx as profiler:
            with inference_ctx:
                with torch.profiler.record_function("eval.world_evaluate_from_dataset"):
-                    metrics = world.evaluate_from_dataset(
+                    metrics = evaluate_subset(eval_episodes, eval_start_idx)
                        dataset,
                        start_steps=list(eval_start_idx),
                        goal_offset_steps=local_cfg.eval.goal_offset_steps,
                        eval_budget=local_cfg.eval.eval_budget,
                        episodes_idx=list(eval_episodes),
                        callables=OmegaConf.to_container(
                            local_cfg.eval.get("callables"), resolve=True
                        ),
                        save_video=bool(local_cfg.eval.get("save_video", False)),
                        video_path=output_dir,
                    )
    if str(device).startswith("cuda") and torch.cuda.is_available():
        torch.cuda.synchronize()
    evaluation_time = time.time() - start_time
@@ -396,6 +412,38 @@ def run_eval_subset(
    }
 def maybe_run_compile_warmup(cfg, eval_episodes, eval_start_idx):
    """Run a small throwaway evaluation ahead of the real one.

    Returns without doing anything when the warm-up settings have ``enabled``
    falsy, when multi-GPU evaluation is enabled (a notice is printed in that
    case), or when fewer than one warm-up episode would be run. Otherwise the
    first ``num_eval`` entries (capped at ``len(eval_episodes)``) are passed
    to ``run_eval_subset`` together with a copy of ``cfg`` that has video
    saving and profiling switched off, and a temporary directory that is
    removed afterwards. Whatever ``run_eval_subset`` returns is discarded.

    ``eval_episodes`` and ``eval_start_idx`` must support slicing followed by
    ``.tolist()`` (presumably NumPy arrays -- confirm against the caller).
    """
    settings = get_compile_warmup_cfg(cfg)
    if not settings["enabled"]:
        return

    if get_multi_gpu_cfg(cfg)["enabled"]:
        print("Skipping compile warmup because multi_gpu.enabled=true uses spawned workers.")
        return

    n_warmup = min(int(settings["num_eval"]), len(eval_episodes))
    if n_warmup < 1:
        return

    # Work on an independent copy (interpolations left unresolved) so the
    # overrides below never touch the caller's config.
    cfg_copy = OmegaConf.create(OmegaConf.to_container(cfg, resolve=False))
    cfg_copy.eval.num_eval = n_warmup
    cfg_copy.eval.save_video = False
    if cfg_copy.get("profile") is not None:
        cfg_copy.profile.enabled = False
    else:
        cfg_copy.profile = OmegaConf.create({"enabled": False})

    with tempfile.TemporaryDirectory(prefix="lewm_compile_warmup_") as scratch:
        warmup_episodes = eval_episodes[:n_warmup].tolist()
        warmup_starts = eval_start_idx[:n_warmup].tolist()
        run_eval_subset(
            cfg_copy,
            warmup_episodes,
            warmup_starts,
            Path(scratch),
            enable_profile=False,
        )
 def _multi_gpu_eval_worker(
    cfg_container,
    eval_episodes,
@@ -515,6 +563,8 @@ def run(cfg: DictConfig):
    output_dir = Path.cwd().resolve()
    profile_cfg = get_profile_cfg(cfg)
    maybe_run_compile_warmup(cfg, eval_episodes, eval_start_idx)
    if get_multi_gpu_cfg(cfg)["enabled"]:
        if profile_cfg["enabled"]:
            raise ValueError("Profiling is not supported together with multi_gpu.enabled=true")