@eval_group.command("run")
@click.option(
"-c",
"--config",
"config_path",
default=None,
type=click.Path(),
help="TOML config file for suite runs.",
)
@click.option(
"-b",
"--benchmark",
"benchmark",
default=None,
help="Benchmark to run (e.g. supergpqa, gaia, frames, wildchat).",
)
@click.option(
"-m",
"--model",
"model",
default=None,
help="Model identifier.",
)
@click.option(
"-n",
"--max-samples",
"max_samples",
type=int,
default=None,
help="Maximum samples to evaluate.",
)
@click.option(
"--backend",
"backend",
default="jarvis-direct",
type=click.Choice(
["jarvis-direct", "jarvis-agent", "hermes", "openclaw", "terminalbench-native"]
),
help=(
"Inference backend. For hermes/openclaw, also pass --base-url and "
"--api-key (or set JARVIS_BACKEND_BASE_URL/JARVIS_BACKEND_API_KEY)."
),
)
@click.option(
"--base-url",
"base_url",
default=None,
help=(
"OpenAI-compat endpoint URL for hermes/openclaw backends "
"(env: JARVIS_BACKEND_BASE_URL)."
),
)
@click.option(
"--api-key",
"api_key",
default=None,
help=("API key for the hermes/openclaw endpoint (env: JARVIS_BACKEND_API_KEY)."),
)
@click.option(
"--agent",
"agent_name",
default=None,
help="Agent name for jarvis-agent backend.",
)
@click.option(
"-e",
"--engine",
"engine_key",
default=None,
help="Engine key (ollama, vllm, cloud, ...).",
)
@click.option(
"--tools",
"tools",
default="",
help="Comma-separated tool names.",
)
@click.option(
"--telemetry/--no-telemetry",
"telemetry",
default=False,
help="Enable telemetry collection during eval.",
)
@click.option(
"--gpu-metrics/--no-gpu-metrics",
"gpu_metrics",
default=False,
help="Enable GPU metrics collection.",
)
@click.option(
"--seed",
"seed",
type=int,
default=42,
help="Random seed.",
)
@click.option(
"--temperature",
"temperature",
type=float,
default=0.0,
help="Generation temperature.",
)
@click.option(
"--max-tokens",
"max_tokens",
type=int,
default=2048,
help="Max output tokens.",
)
@click.option(
"--model-filter",
"model_filter",
default=None,
help="Filter models by name substring (for multi-model configs).",
)
@click.option(
"-o",
"--output",
"output_path",
default=None,
type=click.Path(),
help="Output JSONL path.",
)
@click.option(
"--wandb-project",
"wandb_project",
default="",
help="W&B project name (enables W&B tracking).",
)
@click.option(
"--wandb-entity",
"wandb_entity",
default="",
help="W&B entity (team or user).",
)
@click.option(
"--wandb-tags",
"wandb_tags",
default="",
help="Comma-separated W&B tags.",
)
@click.option(
"--wandb-group",
"wandb_group",
default="",
help="W&B run group.",
)
@click.option(
"--sheets-id",
"sheets_spreadsheet_id",
default="",
help="Google Sheets spreadsheet ID.",
)
@click.option(
"--sheets-worksheet",
"sheets_worksheet",
default="Results",
help="Google Sheets worksheet name.",
)
@click.option(
"--sheets-creds",
"sheets_credentials_path",
default="",
help="Path to Google service account JSON.",
)
@click.option(
"-v",
"--verbose",
"verbose",
is_flag=True,
default=False,
help="Verbose logging.",
)
def eval_run(
config_path: Optional[str],
benchmark: Optional[str],
model: Optional[str],
max_samples: Optional[int],
backend: str,
base_url: Optional[str],
api_key: Optional[str],
agent_name: Optional[str],
engine_key: Optional[str],
tools: str,
telemetry: bool,
gpu_metrics: bool,
seed: int,
temperature: float,
max_tokens: int,
model_filter: Optional[str],
output_path: Optional[str],
wandb_project: str,
wandb_entity: str,
wandb_tags: str,
wandb_group: str,
sheets_spreadsheet_id: str,
sheets_worksheet: str,
sheets_credentials_path: str,
verbose: bool,
) -> None:
"""Run evaluation benchmarks."""
console = Console(stderr=True)
# Config-driven mode: load TOML suite, expand, run all
if config_path is not None:
try:
from openjarvis.evals.core.config import expand_suite, load_eval_config
except ImportError:
console.print(
"[red]Eval framework not available. "
"Ensure the evals package is importable.[/red]"
)
sys.exit(1)
try:
suite = load_eval_config(config_path)
run_configs = expand_suite(suite)
except Exception as exc:
console.print(f"[red]Error loading config: {exc}[/red]")
sys.exit(1)
# Filter by model name substring if requested
if model_filter:
run_configs = [rc for rc in run_configs if model_filter in rc.model]
if not run_configs:
console.print(f"[red]No models match filter '{model_filter}'[/red]")
sys.exit(1)
console.print(
f"[cyan]Suite:[/cyan] {suite.meta.name or Path(config_path).stem}"
)
        console.print(
            f"[cyan]Matrix:[/cyan] {len(suite.models)} model(s) x "
            f"{len(suite.benchmarks)} benchmark(s); {len(run_configs)} run(s) selected"
        )
try:
from openjarvis.evals.cli import _run_single
except ImportError:
console.print("[red]Eval CLI module not available.[/red]")
sys.exit(1)
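        # Execute each expanded run sequentially; a failure in one run is
        # reported but does not abort the remaining runs.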
for i, rc in enumerate(run_configs, 1):
console.print(
f"\n[bold]Run {i}/{len(run_configs)}:[/bold] "
f"{rc.benchmark} / {rc.model}"
)
try:
summary = _run_single(rc, console=console)
console.print(
f" [green]{summary.accuracy:.4f}[/green] "
f"({summary.correct}/{summary.scored_samples})"
)
except Exception as exc:
console.print(f" [red bold]FAILED:[/red bold] {exc}")
return
# CLI-driven mode: require --benchmark and --model
if benchmark is None or model is None:
raise click.UsageError(
"Provide either --config/-c for suite mode, "
"or both --benchmark/-b and --model/-m for single-run mode."
)
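    # Unknown benchmark names only trigger a warning; the value is passed
    # through to the eval framework unchanged.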
if benchmark not in KNOWN_BENCHMARKS:
console.print(f"[yellow]Warning: unknown benchmark '{benchmark}'[/yellow]")
try:
from openjarvis.evals.core.types import RunConfig
except ImportError:
console.print(
"[red]Eval framework not available. "
"Ensure the evals package is importable.[/red]"
)
sys.exit(1)
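    # Normalize the comma-separated --tools string into a list of names.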
    tool_list = [t.strip() for t in tools.split(",") if t.strip()]
config = RunConfig(
benchmark=benchmark,
backend=backend,
model=model,
max_samples=max_samples,
agent_name=agent_name,
engine_key=engine_key,
tools=tool_list,
output_path=output_path,
seed=seed,
temperature=temperature,
max_tokens=max_tokens,
telemetry=telemetry,
gpu_metrics=gpu_metrics,
wandb_project=wandb_project,
wandb_entity=wandb_entity,
wandb_tags=wandb_tags,
wandb_group=wandb_group,
sheets_spreadsheet_id=sheets_spreadsheet_id,
sheets_worksheet=sheets_worksheet,
sheets_credentials_path=sheets_credentials_path,
# Spec §6.2 — for hermes/openclaw external backends. Falls back to env vars
# so users can also set JARVIS_BACKEND_BASE_URL/JARVIS_BACKEND_API_KEY.
base_url=base_url or os.environ.get("JARVIS_BACKEND_BASE_URL"),
api_key=api_key or os.environ.get("JARVIS_BACKEND_API_KEY"),
)
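    # Single-run mode: announce the run, execute it, and report accuracy.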
try:
from openjarvis.evals.cli import _run_single
console.print(
f"[cyan]Benchmark:[/cyan] {benchmark}\n"
f"[cyan]Model:[/cyan] {model}\n"
f"[cyan]Backend:[/cyan] {backend}"
)
summary = _run_single(config, console=console)
console.print(
f"\n[green]Accuracy: {summary.accuracy:.4f}[/green] "
f"({summary.correct}/{summary.scored_samples})"
)
except ImportError:
console.print("[red]Eval CLI module not available.[/red]")
sys.exit(1)
except Exception as exc:
console.print(f"[red]Error: {exc}[/red]")
sys.exit(1)