# cli

CLI for the OpenJarvis evaluation framework.

Functions

main

main()

OpenJarvis Evaluation Framework.

Source code in src/openjarvis/evals/cli.py
# Root command group for the `openjarvis-evals` CLI.  Subcommands (run,
# run-all, summarize, list) attach themselves via @main.command() below.
# The docstring doubles as the group's --help text, so it is left untouched.
@click.group()
def main():
    """OpenJarvis Evaluation Framework."""

run

run(ctx, config_path, benchmark, backend, model, engine_key, agent_name, tools, max_samples, max_workers, judge_model, output_path, seed, dataset_split, temperature, max_tokens, telemetry, gpu_metrics, compact, trace_detail, wandb_project, wandb_entity, wandb_tags, wandb_group, sheets_spreadsheet_id, sheets_worksheet, sheets_credentials_path, model_filter, judge_engine, agentic, episode_mode, concurrency, query_timeout, verbose)

Run a single benchmark evaluation, or a full suite from a TOML config.

Source code in src/openjarvis/evals/cli.py
# `run` accepts either a TOML config file (--config, suite mode) or an ad-hoc
# benchmark/model pair on the command line.  Each option below maps onto the
# identically-named parameter of run(); click passes them by keyword.
@main.command()
@click.option(
    "-c",
    "--config",
    "config_path",
    default=None,
    type=click.Path(),
    help="TOML config file for suite runs",
)
# --- Benchmark / backend / model selection --------------------------------
@click.option(
    "-b",
    "--benchmark",
    default=None,
    type=click.Choice(list(BENCHMARKS.keys())),
    help="Benchmark to run",
)
@click.option(
    "--backend",
    default="jarvis-direct",
    type=click.Choice(list(BACKENDS.keys())),
    help="Inference backend",
)
@click.option("-m", "--model", default=None, help="Model identifier")
@click.option(
    "-e",
    "--engine",
    "engine_key",
    default=None,
    help="Engine key (ollama, vllm, cloud, ...)",
)
@click.option(
    "--agent",
    "agent_name",
    default="orchestrator",
    help="Agent name for jarvis-agent backend",
)
@click.option("--tools", default="", help="Comma-separated tool names")
# --- Sampling / generation parameters -------------------------------------
@click.option(
    "-n", "--max-samples", type=int, default=None, help="Maximum samples to evaluate"
)
@click.option("-w", "--max-workers", type=int, default=4, help="Parallel workers")
@click.option("--judge-model", default="gpt-5-mini-2025-08-07", help="LLM judge model")
@click.option("-o", "--output", "output_path", default=None, help="Output JSONL path")
@click.option("--seed", type=int, default=42, help="Random seed")
@click.option("--split", "dataset_split", default=None, help="Dataset split override")
@click.option("--temperature", type=float, default=0.0, help="Generation temperature")
@click.option("--max-tokens", type=int, default=2048, help="Max output tokens")
# --- Telemetry and console output formatting ------------------------------
@click.option(
    "--telemetry/--no-telemetry",
    default=False,
    help="Enable telemetry collection during eval",
)
@click.option(
    "--gpu-metrics/--no-gpu-metrics",
    default=False,
    help="Enable GPU metrics collection",
)
@click.option(
    "--compact",
    is_flag=True,
    default=False,
    help="Dense single-table output",
)
@click.option(
    "--trace-detail",
    is_flag=True,
    default=False,
    help="Full per-step trace listing",
)
# --- External result trackers (W&B, Google Sheets) ------------------------
@click.option("--wandb-project", default="", help="W&B project name (enables tracking)")
@click.option("--wandb-entity", default="", help="W&B entity (team or user)")
@click.option("--wandb-tags", default="", help="Comma-separated W&B tags")
@click.option("--wandb-group", default="", help="W&B run group")
@click.option(
    "--sheets-id",
    "sheets_spreadsheet_id",
    default="",
    help="Google Sheets spreadsheet ID",
)
@click.option(
    "--sheets-worksheet", default="Results", help="Google Sheets worksheet name"
)
@click.option(
    "--sheets-creds",
    "sheets_credentials_path",
    default="",
    help="Service account JSON path",
)
@click.option(
    "--model-filter",
    default=None,
    help="Filter models by name substring (for multi-model configs)",
)
@click.option(
    "--judge-engine",
    default="cloud",
    help="Engine key for LLM judge (default: cloud). Use 'vllm' to judge locally.",
)
# --- Agentic / episodic execution modes -----------------------------------
@click.option(
    "--agentic",
    is_flag=True,
    default=False,
    help="Use AgenticRunner for multi-turn agent execution",
)
@click.option(
    "--episode-mode",
    is_flag=True,
    default=False,
    help="Sequential episode processing with lifelong learning "
    "(required for lifelong-agent and similar benchmarks)",
)
@click.option(
    "--concurrency",
    type=int,
    default=1,
    help="Parallel query execution (AgenticRunner only)",
)
@click.option(
    "--query-timeout",
    type=float,
    default=None,
    help="Per-query wall-clock timeout in seconds (AgenticRunner only)",
)
@click.option("-v", "--verbose", is_flag=True, help="Verbose logging")
@click.pass_context
def run(
    ctx,  # NOTE(review): injected by @click.pass_context but never used here
    config_path,
    benchmark,
    backend,
    model,
    engine_key,
    agent_name,
    tools,
    max_samples,
    max_workers,
    judge_model,
    output_path,
    seed,
    dataset_split,
    temperature,
    max_tokens,
    telemetry,
    gpu_metrics,
    compact,
    trace_detail,
    wandb_project,
    wandb_entity,
    wandb_tags,
    wandb_group,
    sheets_spreadsheet_id,
    sheets_worksheet,
    sheets_credentials_path,
    model_filter,
    judge_engine,
    agentic,
    episode_mode,
    concurrency,
    query_timeout,
    verbose,
):
    """Run a single benchmark evaluation, or a full suite from a TOML config."""
    _setup_logging(verbose)

    console = Console()

    # Config-driven mode: the TOML file drives everything; of the CLI options,
    # only --model-filter and -v are forwarded on this path.
    if config_path is not None:
        _run_from_config(config_path, verbose, model_filter=model_filter)
        return

    # CLI-driven mode: validate required args
    if benchmark is None:
        raise click.UsageError(
            "Missing option '-b' / '--benchmark' "
            "(required when --config is not provided)"
        )
    if model is None:
        raise click.UsageError(
            "Missing option '-m' / '--model' (required when --config is not provided)"
        )

    from openjarvis.evals.core.types import RunConfig

    # "" -> []; "a, b" -> ["a", "b"] (whitespace and empty entries dropped).
    tool_list = [t.strip() for t in tools.split(",") if t.strip()] if tools else []

    # --episode-mode takes precedence over --agentic.
    # Note: EvalRunner also auto-detects episode_mode from the dataset
    # (via iter_episodes), so passing --episode-mode here is optional for
    # benchmarks like lifelong-agent that always require it.
    if episode_mode and agentic:
        LOGGER.warning(
            "--episode-mode and --agentic both set; using --episode-mode "
            "(provides proper multi-turn interaction via EvalRunner)"
        )
        agentic = False

    # NOTE(review): wandb_tags is forwarded as the raw comma-separated string
    # (unlike --tools, which is split above) — presumably RunConfig or the
    # tracker parses it; confirm against RunConfig.
    config = RunConfig(
        benchmark=benchmark,
        backend=backend,
        model=model,
        max_samples=max_samples,
        max_workers=max_workers,
        temperature=temperature,
        max_tokens=max_tokens,
        judge_model=judge_model,
        judge_engine=judge_engine,
        engine_key=engine_key,
        agent_name=agent_name,
        tools=tool_list,
        output_path=output_path,
        seed=seed,
        dataset_split=dataset_split,
        telemetry=telemetry,
        gpu_metrics=gpu_metrics,
        wandb_project=wandb_project,
        wandb_entity=wandb_entity,
        wandb_tags=wandb_tags,
        wandb_group=wandb_group,
        sheets_spreadsheet_id=sheets_spreadsheet_id,
        sheets_worksheet=sheets_worksheet,
        sheets_credentials_path=sheets_credentials_path,
        episode_mode=episode_mode,
    )

    # Banner + config
    print_banner(console)
    print_section(console, "Configuration")
    print_run_header(
        console,
        benchmark=benchmark,
        model=model,
        backend=backend,
        samples=max_samples,
        workers=max_workers,
    )
    if episode_mode:
        console.print(
            "  [cyan]Mode:[/cyan]       episode (sequential + lifelong learning)"
        )

    if agentic:
        # --- Agentic runner path ---
        # Delegates to _run_agentic and returns; the standard summary printing
        # below is skipped on this path.
        print_section(console, "Agentic Evaluation")
        console.print(f"  [cyan]Concurrency:[/cyan] {concurrency}")
        if query_timeout:
            console.print(f"  [cyan]Timeout:[/cyan]     {query_timeout}s per query")
        _run_agentic(
            config,
            console=console,
            concurrency=concurrency,
            query_timeout=query_timeout,
        )
        return

    # Evaluation
    print_section(console, "Evaluation")
    summary = _run_single(config, console=console)

    # Results: _run_single stashes output/trace locations as private
    # attributes on the summary; getattr degrades to None when absent.
    _output_path = getattr(summary, "_output_path", None)
    _traces_dir = getattr(summary, "_traces_dir", None)
    _print_summary(
        summary,
        console=console,
        output_path=_output_path,
        traces_dir=_traces_dir,
        compact=compact,
        trace_detail=trace_detail,
    )

run_all

run_all(model, engine_key, max_samples, max_workers, judge_model, output_dir, seed, verbose)

Run all benchmarks.

Source code in src/openjarvis/evals/cli.py
# Suite wrapper: runs every registered benchmark sequentially for one model,
# writing one JSONL file per benchmark into --output-dir.
@main.command("run-all")
@click.option("-m", "--model", required=True, help="Model identifier")
@click.option("-e", "--engine", "engine_key", default=None, help="Engine key")
@click.option(
    "-n", "--max-samples", type=int, default=None, help="Max samples per benchmark"
)
@click.option("-w", "--max-workers", type=int, default=4, help="Parallel workers")
@click.option("--judge-model", default="gpt-5-mini-2025-08-07", help="LLM judge model")
@click.option("--output-dir", default="results/", help="Output directory for results")
@click.option("--seed", type=int, default=42, help="Random seed")
@click.option("-v", "--verbose", is_flag=True, help="Verbose logging")
def run_all(
    model, engine_key, max_samples, max_workers, judge_model, output_dir, seed, verbose
):
    """Run all benchmarks."""
    _setup_logging(verbose)

    # Local imports — presumably to keep module import (and `--help`) light.
    from openjarvis.evals.core.runner import EvalRunner
    from openjarvis.evals.core.types import RunConfig

    console = Console()

    print_banner(console)
    print_section(console, "Suite Configuration")
    console.print(
        f"  [cyan]Model:[/cyan]      {model}\n"
        f"  [cyan]Benchmarks:[/cyan] {', '.join(BENCHMARKS.keys())}\n"
        f"  [cyan]Samples:[/cyan]    {max_samples if max_samples else 'all'}"
    )

    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)

    # Filesystem-safe model name for the per-benchmark output filenames.
    model_slug = model.replace("/", "-").replace(":", "-")
    summaries = []

    for i, bench_name in enumerate(BENCHMARKS, 1):
        print_section(console, f"Run {i}/{len(BENCHMARKS)}: {bench_name}")
        output_path = output_dir_path / f"{bench_name}_{model_slug}.jsonl"

        # Fixed choices on this path: jarvis-direct backend, orchestrator
        # agent, no tools, and a "cloud" judge engine (unlike `run`, which
        # exposes all of these as options).
        config = RunConfig(
            benchmark=bench_name,
            backend="jarvis-direct",
            model=model,
            max_samples=max_samples,
            max_workers=max_workers,
            judge_model=judge_model,
            engine_key=engine_key,
            output_path=str(output_path),
            seed=seed,
        )

        eval_backend = _build_backend("jarvis-direct", engine_key, "orchestrator", [])
        dataset = _build_dataset(bench_name)
        judge_backend = _build_judge_backend(judge_model, engine_key="cloud")
        scorer = _build_scorer(bench_name, judge_backend, judge_model)

        trackers = _build_trackers(config)
        runner = EvalRunner(config, dataset, eval_backend, scorer, trackers=trackers)
        try:
            if max_samples and max_samples > 0:
                # Known sample count -> full progress bar.
                with Progress(
                    SpinnerColumn(),
                    TextColumn("[progress.description]{task.description}"),
                    BarColumn(),
                    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                    TimeRemainingColumn(),
                    console=console,
                ) as progress:
                    task = progress.add_task(
                        f"Evaluating {bench_name}...",
                        total=max_samples,
                    )
                    summary = runner.run(
                        # `total` from the callback is ignored; the bar total
                        # was fixed to max_samples when the task was added.
                        progress_callback=lambda done, total: progress.update(
                            task,
                            completed=done,
                        ),
                    )
            else:
                # Unknown total -> spinner only.
                with console.status(f"Evaluating {bench_name}..."):
                    summary = runner.run()
            summaries.append(summary)
            console.print(
                f"  [green]{summary.accuracy:.4f}[/green] "
                f"({summary.correct}/{summary.scored_samples})"
            )
        except Exception as exc:
            # A failing benchmark is reported but does not abort the suite.
            console.print(f"  [red bold]FAILED:[/red bold] {exc}")
        finally:
            # Always release backend resources, success or failure.
            eval_backend.close()
            if judge_backend is not None:
                judge_backend.close()

    # Print overall summary
    if summaries:
        print_section(console, "Suite Results")
        print_suite_summary(console, summaries, f"All Benchmarks / {model}")

summarize

summarize(jsonl_path)

Summarize results from a JSONL output file.

Source code in src/openjarvis/evals/cli.py
# Offline report: re-derives accuracy/error counts from a results JSONL file
# without re-running anything.
@main.command()
@click.argument("jsonl_path", type=click.Path(exists=True))
def summarize(jsonl_path):
    """Summarize results from a JSONL output file."""
    # Load one JSON object per non-blank line.  Results are written as UTF-8
    # JSONL; pass the encoding explicitly so the summary does not depend on
    # the host locale (the previous bare open() used the platform default,
    # which breaks on non-ASCII model output under e.g. cp1252).
    records = []
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))

    if not records:
        click.echo("No records found.")
        return

    console = Console()
    total = len(records)
    # A record counts as "scored" only when the scorer produced a verdict;
    # is_correct=None means the sample was skipped or failed before scoring.
    scored = [r for r in records if r.get("is_correct") is not None]
    correct = [r for r in scored if r["is_correct"]]
    errors = [r for r in records if r.get("error")]
    accuracy = len(correct) / len(scored) if scored else 0.0

    # Benchmark/model are taken from the first record; all records in one
    # file are expected to share them (presumably — confirm against writer).
    console.print(f"[cyan]File:[/cyan]      {jsonl_path}")
    console.print(f"[cyan]Benchmark:[/cyan] {records[0].get('benchmark', '?')}")
    console.print(f"[cyan]Model:[/cyan]     {records[0].get('model', '?')}")
    console.print(f"[cyan]Total:[/cyan]     {total}")
    console.print(f"[cyan]Scored:[/cyan]    {len(scored)}")
    console.print(f"[cyan]Correct:[/cyan]   {len(correct)}")
    console.print(f"[cyan]Accuracy:[/cyan]  [bold]{accuracy:.4f}[/bold]")
    console.print(f"[cyan]Errors:[/cyan]    {len(errors)}")

list_cmd

list_cmd()

List available benchmarks and backends.

Source code in src/openjarvis/evals/cli.py
# Registry browser: renders the BENCHMARKS and BACKENDS registries as tables.
@main.command("list")
def list_cmd():
    """List available benchmarks and backends."""
    console = Console()
    print_banner(console)

    from rich.table import Table

    # Benchmarks: name / category / description.
    benchmarks_tbl = Table(
        title="[bold]Available Benchmarks[/bold]",
        border_style="bright_blue",
        title_style="bold cyan",
    )
    for heading, col_kwargs in (
        ("Name", {"style": "cyan", "no_wrap": True}),
        ("Category", {"style": "white"}),
        ("Description", {}),
    ):
        benchmarks_tbl.add_column(heading, **col_kwargs)
    for bench_name, meta in BENCHMARKS.items():
        benchmarks_tbl.add_row(bench_name, meta["category"], meta["description"])
    console.print(benchmarks_tbl)

    # Backends: name / description.
    backends_tbl = Table(
        title="[bold]Available Backends[/bold]",
        border_style="bright_blue",
        title_style="bold cyan",
    )
    backends_tbl.add_column("Name", style="cyan", no_wrap=True)
    backends_tbl.add_column("Description")
    for backend_name, description in BACKENDS.items():
        backends_tbl.add_row(backend_name, description)
    console.print(backends_tbl)