@main.command()
@click.option("-c", "--config", "config_path", default=None,
              type=click.Path(), help="TOML config file for suite runs")
@click.option("-b", "--benchmark", default=None,
              type=click.Choice(list(BENCHMARKS.keys())),
              help="Benchmark to run")
@click.option("--backend", default="jarvis-direct",
              type=click.Choice(list(BACKENDS.keys())),
              help="Inference backend")
@click.option("-m", "--model", default=None, help="Model identifier")
@click.option("-e", "--engine", "engine_key", default=None,
              help="Engine key (ollama, vllm, cloud, ...)")
@click.option("--agent", "agent_name", default="orchestrator",
              help="Agent name for jarvis-agent backend")
@click.option("--tools", default="", help="Comma-separated tool names")
@click.option("-n", "--max-samples", type=int, default=None,
              help="Maximum samples to evaluate")
@click.option("-w", "--max-workers", type=int, default=4,
              help="Parallel workers")
@click.option("--judge-model", default="gpt-5-mini-2025-08-07",
              help="LLM judge model")
@click.option("-o", "--output", "output_path", default=None,
              help="Output JSONL path")
@click.option("--seed", type=int, default=42, help="Random seed")
@click.option("--split", "dataset_split", default=None,
              help="Dataset split override")
@click.option("--temperature", type=float, default=0.0,
              help="Generation temperature")
@click.option("--max-tokens", type=int, default=2048,
              help="Max output tokens")
@click.option("--telemetry/--no-telemetry", default=False,
              help="Enable telemetry collection during eval")
@click.option("--gpu-metrics/--no-gpu-metrics", default=False,
              help="Enable GPU metrics collection")
@click.option(
    "--compact", is_flag=True, default=False,
    help="Dense single-table output",
)
@click.option(
    "--trace-detail", is_flag=True, default=False,
    help="Full per-step trace listing",
)
@click.option("--wandb-project", default="",
              help="W&B project name (enables tracking)")
@click.option("--wandb-entity", default="",
              help="W&B entity (team or user)")
@click.option("--wandb-tags", default="",
              help="Comma-separated W&B tags")
@click.option("--wandb-group", default="",
              help="W&B run group")
@click.option("--sheets-id", "sheets_spreadsheet_id", default="",
              help="Google Sheets spreadsheet ID")
@click.option("--sheets-worksheet", default="Results",
              help="Google Sheets worksheet name")
@click.option("--sheets-creds", "sheets_credentials_path",
              default="",
              help="Service account JSON path")
@click.option("--model-filter", default=None,
              help="Filter models by name substring (for multi-model configs)")
@click.option("--judge-engine", default="cloud",
              help="Engine key for LLM judge (default: cloud). "
                   "Use 'vllm' to judge locally.")
@click.option("--agentic", is_flag=True, default=False,
              help="Use AgenticRunner for multi-turn agent execution")
@click.option("--episode-mode", is_flag=True, default=False,
              help="Sequential episode processing with lifelong learning "
                   "(required for lifelong-agent and similar benchmarks)")
@click.option("--concurrency", type=int, default=1,
              help="Parallel query execution (AgenticRunner only)")
@click.option("--query-timeout", type=float, default=None,
              help="Per-query wall-clock timeout in seconds (AgenticRunner only)")
@click.option("-v", "--verbose", is_flag=True, help="Verbose logging")
@click.pass_context
def run(ctx, config_path, benchmark, backend, model, engine_key, agent_name,
        tools, max_samples, max_workers, judge_model, output_path, seed,
        dataset_split, temperature, max_tokens, telemetry, gpu_metrics,
        compact, trace_detail,
        wandb_project, wandb_entity, wandb_tags, wandb_group,
        sheets_spreadsheet_id, sheets_worksheet, sheets_credentials_path,
        model_filter, judge_engine, agentic, episode_mode,
        concurrency, query_timeout, verbose):
    """Run a single benchmark evaluation, or a full suite from a TOML config.

    Two mutually exclusive modes:

    * Config-driven: when ``--config`` is given, the suite described by the
      TOML file is executed via ``_run_from_config`` and the remaining CLI
      options (except ``--model-filter`` and ``--verbose``) are ignored.
    * CLI-driven: otherwise ``--benchmark`` and ``--model`` are required; a
      ``RunConfig`` is built from the options and dispatched to either the
      agentic runner (``--agentic``) or the standard evaluation path.

    Raises:
        click.UsageError: in CLI-driven mode when ``--benchmark`` or
            ``--model`` is missing.
    """
    _setup_logging(verbose)
    console = Console()

    # Config-driven mode: the TOML file supplies everything; exit early.
    if config_path is not None:
        _run_from_config(config_path, verbose, model_filter=model_filter)
        return

    # CLI-driven mode: validate required args up front for a clear error.
    if benchmark is None:
        raise click.UsageError(
            "Missing option '-b' / '--benchmark' "
            "(required when --config is not provided)"
        )
    if model is None:
        raise click.UsageError(
            "Missing option '-m' / '--model' "
            "(required when --config is not provided)"
        )

    # Imported lazily so `--help` and config-driven runs stay fast.
    from openjarvis.evals.core.types import RunConfig

    # Blank entries (e.g. "a,,b" or a trailing comma) are dropped; an empty
    # --tools string therefore yields [].
    tool_list = [t.strip() for t in tools.split(",") if t.strip()]

    # --episode-mode takes precedence over --agentic.
    # Note: EvalRunner also auto-detects episode_mode from the dataset
    # (via iter_episodes), so passing --episode-mode here is optional for
    # benchmarks like lifelong-agent that always require it.
    if episode_mode and agentic:
        LOGGER.warning(
            "--episode-mode and --agentic both set; using --episode-mode "
            "(provides proper multi-turn interaction via EvalRunner)"
        )
        agentic = False

    config = RunConfig(
        benchmark=benchmark,
        backend=backend,
        model=model,
        max_samples=max_samples,
        max_workers=max_workers,
        temperature=temperature,
        max_tokens=max_tokens,
        judge_model=judge_model,
        judge_engine=judge_engine,
        engine_key=engine_key,
        agent_name=agent_name,
        tools=tool_list,
        output_path=output_path,
        seed=seed,
        dataset_split=dataset_split,
        telemetry=telemetry,
        gpu_metrics=gpu_metrics,
        wandb_project=wandb_project,
        wandb_entity=wandb_entity,
        wandb_tags=wandb_tags,
        wandb_group=wandb_group,
        sheets_spreadsheet_id=sheets_spreadsheet_id,
        sheets_worksheet=sheets_worksheet,
        sheets_credentials_path=sheets_credentials_path,
        episode_mode=episode_mode,
    )

    # Banner + config
    print_banner(console)
    print_section(console, "Configuration")
    print_run_header(
        console,
        benchmark=benchmark,
        model=model,
        backend=backend,
        samples=max_samples,
        workers=max_workers,
    )
    if episode_mode:
        console.print(
            "  [cyan]Mode:[/cyan] episode "
            "(sequential + lifelong learning)"
        )

    if agentic:
        # --- Agentic runner path ---
        print_section(console, "Agentic Evaluation")
        console.print(
            f"  [cyan]Concurrency:[/cyan] {concurrency}"
        )
        # Compare against None (the option's default) rather than relying on
        # truthiness, so an explicit `--query-timeout 0` is still displayed.
        if query_timeout is not None:
            console.print(
                f"  [cyan]Timeout:[/cyan] {query_timeout}s per query"
            )
        _run_agentic(
            config, console=console,
            concurrency=concurrency,
            query_timeout=query_timeout,
        )
        return

    # Evaluation
    print_section(console, "Evaluation")
    summary = _run_single(config, console=console)

    # Results: _run_single may stash artifact paths on the summary object;
    # tolerate their absence via getattr.
    _output_path = getattr(summary, "_output_path", None)
    _traces_dir = getattr(summary, "_traces_dir", None)
    _print_summary(
        summary,
        console=console,
        output_path=_output_path,
        traces_dir=_traces_dir,
        compact=compact,
        trace_detail=trace_detail,
    )