@main.command()
@click.option(
"-c",
"--config",
"config_path",
default=None,
type=click.Path(),
help="TOML config file for suite runs",
)
@click.option(
"-b",
"--benchmark",
default=None,
type=click.Choice(list(BENCHMARKS.keys())),
help="Benchmark to run",
)
@click.option(
"--backend",
default="jarvis-direct",
type=click.Choice(list(BACKENDS.keys())),
help="Inference backend",
)
@click.option("-m", "--model", default=None, help="Model identifier")
@click.option(
"-e",
"--engine",
"engine_key",
default=None,
help="Engine key (ollama, vllm, cloud, ...)",
)
@click.option(
"--agent",
"agent_name",
default="orchestrator",
help="Agent name for jarvis-agent backend",
)
@click.option("--tools", default="", help="Comma-separated tool names")
@click.option(
"-n", "--max-samples", type=int, default=None, help="Maximum samples to evaluate"
)
@click.option("-w", "--max-workers", type=int, default=4, help="Parallel workers")
@click.option("--judge-model", default="gpt-5-mini-2025-08-07", help="LLM judge model")
@click.option("-o", "--output", "output_path", default=None, help="Output JSONL path")
@click.option("--seed", type=int, default=42, help="Random seed")
@click.option("--split", "dataset_split", default=None, help="Dataset split override")
@click.option("--temperature", type=float, default=0.0, help="Generation temperature")
@click.option("--max-tokens", type=int, default=2048, help="Max output tokens")
@click.option(
"--telemetry/--no-telemetry",
default=False,
help="Enable telemetry collection during eval",
)
@click.option(
"--gpu-metrics/--no-gpu-metrics",
default=False,
help="Enable GPU metrics collection",
)
@click.option(
"--compact",
is_flag=True,
default=False,
help="Dense single-table output",
)
@click.option(
"--trace-detail",
is_flag=True,
default=False,
help="Full per-step trace listing",
)
@click.option("--wandb-project", default="", help="W&B project name (enables tracking)")
@click.option("--wandb-entity", default="", help="W&B entity (team or user)")
@click.option("--wandb-tags", default="", help="Comma-separated W&B tags")
@click.option("--wandb-group", default="", help="W&B run group")
@click.option(
"--sheets-id",
"sheets_spreadsheet_id",
default="",
help="Google Sheets spreadsheet ID",
)
@click.option(
"--sheets-worksheet", default="Results", help="Google Sheets worksheet name"
)
@click.option(
"--sheets-creds",
"sheets_credentials_path",
default="",
help="Service account JSON path",
)
@click.option(
"--model-filter",
default=None,
help="Filter models by name substring (for multi-model configs)",
)
@click.option(
"--judge-engine",
default="cloud",
help="Engine key for LLM judge (default: cloud). Use 'vllm' to judge locally.",
)
@click.option(
"--agentic",
is_flag=True,
default=False,
help="Use AgenticRunner for multi-turn agent execution",
)
@click.option(
"--episode-mode",
is_flag=True,
default=False,
help="Sequential episode processing with lifelong learning "
"(required for lifelong-agent and similar benchmarks)",
)
@click.option(
"--concurrency",
type=int,
default=1,
help="Parallel query execution (AgenticRunner only)",
)
@click.option(
"--query-timeout",
type=float,
default=None,
help="Per-query wall-clock timeout in seconds (AgenticRunner only)",
)
@click.option("-v", "--verbose", is_flag=True, help="Verbose logging")
@click.pass_context
def run(
ctx,
config_path,
benchmark,
backend,
model,
engine_key,
agent_name,
tools,
max_samples,
max_workers,
judge_model,
output_path,
seed,
dataset_split,
temperature,
max_tokens,
telemetry,
gpu_metrics,
compact,
trace_detail,
wandb_project,
wandb_entity,
wandb_tags,
wandb_group,
sheets_spreadsheet_id,
sheets_worksheet,
sheets_credentials_path,
model_filter,
judge_engine,
agentic,
episode_mode,
concurrency,
query_timeout,
verbose,
):
"""Run a single benchmark evaluation, or a full suite from a TOML config."""
_setup_logging(verbose)
console = Console()
# Config-driven mode
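    # A suite TOML might look like the sketch below; the exact schema (table
    # names, nesting) is an assumption here and is defined by _run_from_config.
    # Keys mirror the CLI options / RunConfig fields:
    #
    #   [[runs]]
    #   benchmark = "<benchmark>"
    #   model = "<model>"
    #   backend = "jarvis-direct"
    #   max_samples = 100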
if config_path is not None:
_run_from_config(config_path, verbose, model_filter=model_filter)
return
# CLI-driven mode: validate required args
if benchmark is None:
raise click.UsageError(
"Missing option '-b' / '--benchmark' "
"(required when --config is not provided)"
)
if model is None:
raise click.UsageError(
"Missing option '-m' / '--model' (required when --config is not provided)"
)
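    # Deferred import: RunConfig is only needed for CLI-driven runs, which
    # begin here (config-driven runs returned above).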
from openjarvis.evals.core.types import RunConfig
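    # Normalize the comma-separated --tools value into a clean list of names.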
tool_list = [t.strip() for t in tools.split(",") if t.strip()] if tools else []
# --episode-mode takes precedence over --agentic.
# Note: EvalRunner also auto-detects episode_mode from the dataset
# (via iter_episodes), so passing --episode-mode here is optional for
# benchmarks like lifelong-agent that always require it.
if episode_mode and agentic:
LOGGER.warning(
"--episode-mode and --agentic both set; using --episode-mode "
"(provides proper multi-turn interaction via EvalRunner)"
)
agentic = False
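    # Assemble the run configuration shared by the agentic and standard paths.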
config = RunConfig(
benchmark=benchmark,
backend=backend,
model=model,
max_samples=max_samples,
max_workers=max_workers,
temperature=temperature,
max_tokens=max_tokens,
judge_model=judge_model,
judge_engine=judge_engine,
engine_key=engine_key,
agent_name=agent_name,
tools=tool_list,
output_path=output_path,
seed=seed,
dataset_split=dataset_split,
telemetry=telemetry,
gpu_metrics=gpu_metrics,
wandb_project=wandb_project,
wandb_entity=wandb_entity,
wandb_tags=wandb_tags,
wandb_group=wandb_group,
sheets_spreadsheet_id=sheets_spreadsheet_id,
sheets_worksheet=sheets_worksheet,
sheets_credentials_path=sheets_credentials_path,
episode_mode=episode_mode,
)
# Banner + config
print_banner(console)
print_section(console, "Configuration")
print_run_header(
console,
benchmark=benchmark,
model=model,
backend=backend,
samples=max_samples,
workers=max_workers,
)
if episode_mode:
console.print(
" [cyan]Mode:[/cyan] episode (sequential + lifelong learning)"
)
if agentic:
# --- Agentic runner path ---
print_section(console, "Agentic Evaluation")
console.print(f" [cyan]Concurrency:[/cyan] {concurrency}")
if query_timeout:
console.print(f" [cyan]Timeout:[/cyan] {query_timeout}s per query")
_run_agentic(
config,
console=console,
concurrency=concurrency,
query_timeout=query_timeout,
)
return
# Evaluation
print_section(console, "Evaluation")
summary = _run_single(config, console=console)
# Results
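    # _run_single may attach resolved output/trace locations to the summary;
    # fall back to None when those attributes are absent.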
_output_path = getattr(summary, "_output_path", None)
_traces_dir = getattr(summary, "_traces_dir", None)
_print_summary(
summary,
console=console,
output_path=_output_path,
traces_dir=_traces_dir,
compact=compact,
trace_detail=trace_detail,
)