diff --git a/config/eval/cube.yaml b/config/eval/cube.yaml index a580c2f..3ed5e24 100644 --- a/config/eval/cube.yaml +++ b/config/eval/cube.yaml @@ -24,6 +24,7 @@ dataset: seed: 42 policy: random # ckpt name or random +inference_precision: fp16 plan_config: horizon: 5 diff --git a/config/eval/reacher.yaml b/config/eval/reacher.yaml index e5af58f..ddda397 100644 --- a/config/eval/reacher.yaml +++ b/config/eval/reacher.yaml @@ -18,6 +18,7 @@ dataset: seed: 42 policy: random # ckpt name or random +inference_precision: fp16 plan_config: horizon: 5 diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py new file mode 100644 index 0000000..309d286 --- /dev/null +++ b/scripts/convert_hf_checkpoint.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +"""Convert LeWM HuggingFace weights into eval-compatible object checkpoints.""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +import stable_pretraining as spt +import torch + +REPO_ROOT = Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from jepa import JEPA +from module import ARPredictor, Embedder, MLP + + +def _load_json(path: Path) -> dict: + with path.open("r", encoding="utf-8") as f: + return json.load(f) + + +def _strip_target(config: dict) -> dict: + return {key: value for key, value in config.items() if key != "_target_"} + + +def infer_config_from_state_dict(state_dict: dict) -> dict: + action_dim = state_dict["action_encoder.patch_embed.weight"].shape[1] + return { + "encoder": { + "size": "tiny", + "patch_size": 14, + "image_size": 224, + "pretrained": False, + "use_mask_token": False, + }, + "predictor": { + "num_frames": 3, + "input_dim": 192, + "hidden_dim": 192, + "output_dim": 192, + "depth": 6, + "heads": 16, + "mlp_dim": 2048, + "dim_head": 64, + "dropout": 0.1, + "emb_dropout": 0.0, + }, + "action_encoder": { + "input_dim": action_dim, + "emb_dim": 192, + }, + "projector": { + "input_dim": 192, + "output_dim": 192, + "hidden_dim": 2048, + }, + "pred_proj": { + "input_dim": 192, + "output_dim": 192, + "hidden_dim": 2048, + }, + } + + +def build_model(config: dict) -> JEPA: + encoder = spt.backbone.utils.vit_hf(**_strip_target(config["encoder"])) + predictor = ARPredictor(**_strip_target(config["predictor"])) + action_encoder = Embedder(**_strip_target(config["action_encoder"])) + + projector_cfg = _strip_target(config["projector"]) + projector_cfg["norm_fn"] = torch.nn.BatchNorm1d + projector = MLP(**projector_cfg) + + pred_proj_cfg = _strip_target(config["pred_proj"]) + pred_proj_cfg["norm_fn"] = torch.nn.BatchNorm1d + pred_proj = MLP(**pred_proj_cfg) + + return JEPA( + encoder=encoder, + predictor=predictor, + action_encoder=action_encoder, + projector=projector, + pred_proj=pred_proj, + ) + + +def convert_checkpoint(input_dir: Path, output_name: str) -> tuple[Path, Path]: + config_path = input_dir / "config.json" + weights_path = input_dir / "weights.pt" + if not weights_path.exists(): + raise FileNotFoundError(f"Missing weights file: {weights_path}") + + state_dict = torch.load(weights_path, map_location="cpu") + config = _load_json(config_path) if config_path.exists() else infer_config_from_state_dict(state_dict) + model = build_model(config) + missing, unexpected = model.load_state_dict(state_dict, strict=True) + if missing or unexpected: + raise RuntimeError( + f"State dict mismatch: missing={missing}, unexpected={unexpected}" + ) + model.eval() + + object_path = input_dir / f"{output_name}_object.ckpt" + weight_path = input_dir / f"{output_name}_weight.ckpt" + torch.save(model, object_path) + torch.save(model.state_dict(), weight_path) + return object_path, weight_path + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "input_dir", + type=Path, + help="Directory containing weights.pt and optionally config.json.", + ) + parser.add_argument("--output-name", default="lewm") + args = parser.parse_args() + + object_path, weight_path = convert_checkpoint(args.input_dir, args.output_name) + print(f"wrote {object_path}") + print(f"wrote {weight_path}") + + +if __name__ == "__main__": + main() diff --git a/scripts/warmup_eval.sh b/scripts/warmup_eval.sh new file mode 100755 index 0000000..83f0cfa --- /dev/null +++ b/scripts/warmup_eval.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Warm up LeWM evaluation before a formal run. +# +# This script intentionally does a small eval for each task so ROCm/PyTorch can +# initialize GPU contexts, compile predictor graphs, populate kernel caches, and +# touch dataset/checkpoint paths before the timed run. +# +# Site-specific things to check before using this at the competition: +# 1. STABLEWM_HOME points to the directory containing datasets/checkpoints. +# 2. The policy names below match the checkpoint folders at STABLEWM_HOME. +# 3. The dataset names in config/eval/*.yaml match the onsite dataset files. +# 4. The GPU visibility variables match the GPUs allocated to this job. +# 5. WARMUP_NUM_EVAL is close enough to the formal shape to trigger useful +# compilation, but small enough not to waste much time. + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "${REPO_ROOT}" + +PYTHON_BIN="${PYTHON_BIN:-${REPO_ROOT}/.venv/bin/python}" +STABLEWM_HOME="${STABLEWM_HOME:-/mnt/ASC1637/stablewm}" +export STABLEWM_HOME + +# If Slurm allocates multiple GPUs, set these to the allocated physical GPU ids. +# Example for physical GPU 2 and 3: +# ROCR_VISIBLE_DEVICES=2,3 HIP_VISIBLE_DEVICES=0,1 CUDA_VISIBLE_DEVICES=0,1 +# +# Important ROCm detail: +# ROCR_VISIBLE_DEVICES uses physical ids. +# HIP_VISIBLE_DEVICES/CUDA_VISIBLE_DEVICES use ids after ROCR remapping. +export ROCR_VISIBLE_DEVICES="${ROCR_VISIBLE_DEVICES:-0}" +export HIP_VISIBLE_DEVICES="${HIP_VISIBLE_DEVICES:-0}" +export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}" + +WARMUP_NUM_EVAL="${WARMUP_NUM_EVAL:-10}" +INFERENCE_PRECISION="${INFERENCE_PRECISION:-fp16}" +OUTPUT_DIR="${OUTPUT_DIR:-/tmp/lewm_warmup}" +mkdir -p "${OUTPUT_DIR}" + +# Enable multi-GPU warmup by setting MULTI_GPU=1. +# MULTI_GPU_DEVICES are process-local ids, not physical ids after ROCR remapping. +# Example: +# ROCR_VISIBLE_DEVICES=2,3 HIP_VISIBLE_DEVICES=0,1 MULTI_GPU=1 MULTI_GPU_DEVICES='[0,1]' +MULTI_GPU="${MULTI_GPU:-0}" +MULTI_GPU_DEVICES="${MULTI_GPU_DEVICES:-[0,1]}" + +COMMON_ARGS=( + "eval.num_eval=${WARMUP_NUM_EVAL}" + "inference_precision=${INFERENCE_PRECISION}" +) + +if [[ "${MULTI_GPU}" == "1" ]]; then + COMMON_ARGS+=( + "+multi_gpu.enabled=true" + "+multi_gpu.devices=${MULTI_GPU_DEVICES}" + ) +fi + +run_warmup() { + local config_name="$1" + local policy="$2" + local output_name="$3" + + echo + echo "== Warmup ${config_name} policy=${policy} ==" + "${PYTHON_BIN}" eval.py \ + "--config-name=${config_name}" \ + "policy=${policy}" \ + "output.filename=${OUTPUT_DIR}/${output_name}" \ + "${COMMON_ARGS[@]}" +} + +echo "LeWM warmup" +echo " repo: ${REPO_ROOT}" +echo " python: ${PYTHON_BIN}" +echo " STABLEWM_HOME: ${STABLEWM_HOME}" +echo " ROCR_VISIBLE_DEVICES: ${ROCR_VISIBLE_DEVICES}" +echo " HIP_VISIBLE_DEVICES: ${HIP_VISIBLE_DEVICES}" +echo " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}" +echo " WARMUP_NUM_EVAL: ${WARMUP_NUM_EVAL}" +echo " INFERENCE_PRECISION: ${INFERENCE_PRECISION}" +echo " MULTI_GPU: ${MULTI_GPU}" +if [[ "${MULTI_GPU}" == "1" ]]; then + echo " MULTI_GPU_DEVICES: ${MULTI_GPU_DEVICES}" +fi + +# Defaults match the checkpoint names used in this repo. If onsite checkpoint +# folders differ, override by editing these calls or passing the equivalent +# eval.py command manually. +run_warmup "pusht.yaml" "pusht/lewm" "warmup_pusht.txt" +run_warmup "reacher.yaml" "reacher/lewm" "warmup_reacher.txt" +run_warmup "cube.yaml" "cube/lewm" "warmup_cube.txt" +run_warmup "tworoom.yaml" "tworoom/lewm" "warmup_tworoom.txt" + +echo +echo "Warmup complete. Logs were appended under ${OUTPUT_DIR}."