def load_eval_config(path: str | Path) -> EvalSuiteConfig:
"""Load and validate an eval suite config from a TOML file.
Args:
path: Path to the TOML config file.
Returns:
Validated EvalSuiteConfig.
Raises:
EvalConfigError: On structural validation failures.
FileNotFoundError: If the config file doesn't exist.
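
    Example:
        A minimal, illustrative call (the config path below is hypothetical)::

            config = load_eval_config("configs/suite.toml")
            for model in config.models:
                print(model.name, len(config.benchmarks))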
"""
path = Path(path)
if not path.exists():
raise FileNotFoundError(f"Config file not found: {path}")
with open(path, "rb") as f:
raw = tomllib.load(f)
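    # The loader expects a document shaped roughly like the sketch below.
    # All values are illustrative; only [[models]] and [[benchmarks]] are
    # required, every other table falls back to the defaults used in this
    # function.
    #
    #   [meta]
    #   name = "nightly-suite"
    #
    #   [defaults]
    #   temperature = 0.0
    #   max_tokens = 2048
    #
    #   [judge]
    #   model = "gpt-5-mini-2025-08-07"
    #
    #   [run]
    #   max_workers = 4
    #   output_dir = "results/"
    #
    #   [[models]]
    #   name = "example-model"
    #
    #   [[benchmarks]]
    #   name = "example-benchmark"
    #   backend = "jarvis-direct"
    #
    #   [backend.external]
    #   base_url = "https://example.invalid/v1"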
# Parse [meta]
meta_raw = raw.get("meta", {})
meta = MetaConfig(
name=meta_raw.get("name", ""),
description=meta_raw.get("description", ""),
)
# Parse [defaults]
defaults_raw = raw.get("defaults", {})
defaults = DefaultsConfig(
temperature=float(defaults_raw.get("temperature", 0.0)),
max_tokens=int(defaults_raw.get("max_tokens", 2048)),
)
# Parse [judge]
judge_raw = raw.get("judge", {})
judge = JudgeConfig(
model=judge_raw.get("model", "gpt-5-mini-2025-08-07"),
engine=judge_raw.get("engine"),
provider=judge_raw.get("provider"),
temperature=float(judge_raw.get("temperature", 0.0)),
max_tokens=int(judge_raw.get("max_tokens", 1024)),
)
# Parse [run]
run_raw = raw.get("run", {})
execution = ExecutionConfig(
max_workers=int(run_raw.get("max_workers", 4)),
output_dir=run_raw.get("output_dir", "results/"),
seed=int(run_raw.get("seed", 42)),
telemetry=bool(run_raw.get("telemetry", False)),
gpu_metrics=bool(run_raw.get("gpu_metrics", False)),
warmup_samples=int(run_raw.get("warmup_samples", 0)),
energy_vendor=run_raw.get("energy_vendor", ""),
wandb_project=run_raw.get("wandb_project", ""),
wandb_entity=run_raw.get("wandb_entity", ""),
wandb_tags=run_raw.get("wandb_tags", ""),
wandb_group=run_raw.get("wandb_group", ""),
sheets_spreadsheet_id=run_raw.get("sheets_spreadsheet_id", ""),
sheets_worksheet=run_raw.get("sheets_worksheet", "Results"),
sheets_credentials_path=run_raw.get("sheets_credentials_path", ""),
max_turns=(int(run_raw["max_turns"]) if "max_turns" in run_raw else None),
)
# Parse [[models]]
models_raw = raw.get("models", [])
if not models_raw:
raise EvalConfigError("Config must define at least one [[models]] entry")
models: List[ModelConfig] = []
for m in models_raw:
if not m.get("name"):
raise EvalConfigError("Each [[models]] entry must have a 'name' field")
models.append(
ModelConfig(
name=m["name"],
engine=m.get("engine"),
provider=m.get("provider"),
temperature=float(m["temperature"]) if "temperature" in m else None,
max_tokens=int(m["max_tokens"]) if "max_tokens" in m else None,
param_count_b=float(m.get("param_count_b", 0.0)),
active_params_b=(
float(m["active_params_b"]) if "active_params_b" in m else None
),
gpu_peak_tflops=float(m.get("gpu_peak_tflops", 0.0)),
gpu_peak_bandwidth_gb_s=float(m.get("gpu_peak_bandwidth_gb_s", 0.0)),
num_gpus=int(m.get("num_gpus", 1)),
)
)
# Parse [[benchmarks]]
benchmarks_raw = raw.get("benchmarks", [])
if not benchmarks_raw:
raise EvalConfigError("Config must define at least one [[benchmarks]] entry")
benchmarks: List[BenchmarkConfig] = []
for b in benchmarks_raw:
if not b.get("name"):
raise EvalConfigError("Each [[benchmarks]] entry must have a 'name' field")
backend = b.get("backend", "jarvis-direct")
if backend not in VALID_BACKENDS:
raise EvalConfigError(
f"Invalid backend '{backend}' for benchmark '{b['name']}'. "
f"Must be one of: {', '.join(sorted(VALID_BACKENDS))}"
)
bench_name = b["name"]
if bench_name not in KNOWN_BENCHMARKS:
logger.warning("Unknown benchmark name: '%s'", bench_name)
tools_raw = b.get("tools", [])
record_ids_raw = b.get("record_ids")
record_ids = (
[str(r) for r in record_ids_raw]
if isinstance(record_ids_raw, list) and record_ids_raw
else None
)
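    # Anything other than a non-empty list is normalized to None, the same
    # as omitting record_ids entirely.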
benchmarks.append(
BenchmarkConfig(
name=bench_name,
backend=backend,
max_samples=int(b["max_samples"]) if "max_samples" in b else None,
split=b.get("split"),
agent=b.get("agent"),
tools=list(tools_raw),
judge_model=b.get("judge_model"),
temperature=float(b["temperature"]) if "temperature" in b else None,
max_tokens=int(b["max_tokens"]) if "max_tokens" in b else None,
subset=b.get("subset"),
record_ids=record_ids,
)
)
# Parse optional [backend.external] section (for hermes/openclaw backends).
# Env vars override TOML values; either source may be empty.
external_raw = raw.get("backend", {}).get("external", {})
backend_external_base_url = (
os.environ.get("JARVIS_BACKEND_BASE_URL")
or external_raw.get("base_url")
or None
)
backend_external_api_key = (
os.environ.get("JARVIS_BACKEND_API_KEY") or external_raw.get("api_key") or None
)
return EvalSuiteConfig(
meta=meta,
defaults=defaults,
judge=judge,
run=execution,
models=models,
benchmarks=benchmarks,
backend_external_base_url=backend_external_base_url,
backend_external_api_key=backend_external_api_key,
)