Skip to content

cli

cli

CLI for the OpenJarvis evaluation framework.

Functions

main

main()

OpenJarvis Evaluation Framework.

Source code in src/openjarvis/evals/cli.py
@click.group()
def main():
    # Click group entry point. Subcommands (run, run-all, summarize, list)
    # attach themselves via @main.command(); the docstring below doubles as
    # the top-level `--help` text shown to users, so it is kept verbatim.
    """OpenJarvis Evaluation Framework."""

run

run(ctx, config_path, benchmark, backend, model, engine_key, agent_name, tools, max_samples, max_workers, judge_model, output_path, seed, dataset_split, temperature, max_tokens, telemetry, gpu_metrics, compact, trace_detail, wandb_project, wandb_entity, wandb_tags, wandb_group, sheets_spreadsheet_id, sheets_worksheet, sheets_credentials_path, model_filter, judge_engine, agentic, episode_mode, concurrency, query_timeout, verbose)

Run a single benchmark evaluation, or a full suite from a TOML config.

Source code in src/openjarvis/evals/cli.py
@main.command()
# --- Run source: a TOML suite config, OR an explicit benchmark+model pair ---
@click.option("-c", "--config", "config_path", default=None,
              type=click.Path(), help="TOML config file for suite runs")
@click.option("-b", "--benchmark", default=None,
              type=click.Choice(list(BENCHMARKS.keys())),
              help="Benchmark to run")
@click.option("--backend", default="jarvis-direct",
              type=click.Choice(list(BACKENDS.keys())),
              help="Inference backend")
@click.option("-m", "--model", default=None, help="Model identifier")
@click.option("-e", "--engine", "engine_key", default=None,
              help="Engine key (ollama, vllm, cloud, ...)")
@click.option("--agent", "agent_name", default="orchestrator",
              help="Agent name for jarvis-agent backend")
@click.option("--tools", default="", help="Comma-separated tool names")
# --- Sampling / execution knobs ---
@click.option("-n", "--max-samples", type=int, default=None,
              help="Maximum samples to evaluate")
@click.option("-w", "--max-workers", type=int, default=4,
              help="Parallel workers")
@click.option("--judge-model", default="gpt-5-mini-2025-08-07",
              help="LLM judge model")
@click.option("-o", "--output", "output_path", default=None,
              help="Output JSONL path")
@click.option("--seed", type=int, default=42, help="Random seed")
@click.option("--split", "dataset_split", default=None,
              help="Dataset split override")
@click.option("--temperature", type=float, default=0.0,
              help="Generation temperature")
@click.option("--max-tokens", type=int, default=2048,
              help="Max output tokens")
# --- Observability ---
@click.option("--telemetry/--no-telemetry", default=False,
              help="Enable telemetry collection during eval")
@click.option("--gpu-metrics/--no-gpu-metrics", default=False,
              help="Enable GPU metrics collection")
@click.option(
    "--compact", is_flag=True, default=False,
    help="Dense single-table output",
)
@click.option(
    "--trace-detail", is_flag=True, default=False,
    help="Full per-step trace listing",
)
# --- Experiment tracking (W&B + Google Sheets); empty string disables ---
@click.option("--wandb-project", default="",
              help="W&B project name (enables tracking)")
@click.option("--wandb-entity", default="",
              help="W&B entity (team or user)")
@click.option("--wandb-tags", default="",
              help="Comma-separated W&B tags")
@click.option("--wandb-group", default="",
              help="W&B run group")
@click.option("--sheets-id", "sheets_spreadsheet_id", default="",
              help="Google Sheets spreadsheet ID")
@click.option("--sheets-worksheet", default="Results",
              help="Google Sheets worksheet name")
@click.option("--sheets-creds", "sheets_credentials_path",
              default="",
              help="Service account JSON path")
@click.option("--model-filter", default=None,
              help="Filter models by name substring (for multi-model configs)")
@click.option("--judge-engine", default="cloud",
              help="Engine key for LLM judge (default: cloud). "
              "Use 'vllm' to judge locally.")
# --- Agentic / episode execution modes ---
@click.option("--agentic", is_flag=True, default=False,
              help="Use AgenticRunner for multi-turn agent execution")
@click.option("--episode-mode", is_flag=True, default=False,
              help="Sequential episode processing with lifelong learning "
                   "(required for lifelong-agent and similar benchmarks)")
@click.option("--concurrency", type=int, default=1,
              help="Parallel query execution (AgenticRunner only)")
@click.option("--query-timeout", type=float, default=None,
              help="Per-query wall-clock timeout in seconds (AgenticRunner only)")
@click.option("-v", "--verbose", is_flag=True, help="Verbose logging")
@click.pass_context
def run(ctx, config_path, benchmark, backend, model, engine_key, agent_name,
        tools, max_samples, max_workers, judge_model, output_path, seed,
        dataset_split, temperature, max_tokens, telemetry, gpu_metrics,
        compact, trace_detail,
        wandb_project, wandb_entity, wandb_tags, wandb_group,
        sheets_spreadsheet_id, sheets_worksheet, sheets_credentials_path,
        model_filter, judge_engine, agentic, episode_mode,
        concurrency, query_timeout, verbose):
    """Run a single benchmark evaluation, or a full suite from a TOML config."""
    _setup_logging(verbose)

    console = Console()

    # Config-driven mode: the TOML file fully specifies the run(s); every
    # other CLI option except --model-filter and -v is ignored here.
    if config_path is not None:
        _run_from_config(config_path, verbose, model_filter=model_filter)
        return

    # CLI-driven mode: validate required args.
    # --benchmark and --model have default=None precisely so this manual
    # check can report "required unless --config" (click's required=True
    # cannot express that conditionality).
    if benchmark is None:
        raise click.UsageError(
            "Missing option '-b' / '--benchmark' "
            "(required when --config is not provided)"
        )
    if model is None:
        raise click.UsageError(
            "Missing option '-m' / '--model' "
            "(required when --config is not provided)"
        )

    from openjarvis.evals.core.types import RunConfig

    # "" -> []; otherwise split on commas and drop empty/whitespace entries.
    tool_list = [t.strip() for t in tools.split(",") if t.strip()] if tools else []

    # --episode-mode takes precedence over --agentic.
    # Note: EvalRunner also auto-detects episode_mode from the dataset
    # (via iter_episodes), so passing --episode-mode here is optional for
    # benchmarks like lifelong-agent and similar benchmarks that always require it.
    if episode_mode and agentic:
        LOGGER.warning(
            "--episode-mode and --agentic both set; using --episode-mode "
            "(provides proper multi-turn interaction via EvalRunner)"
        )
        agentic = False

    # Collect everything the runner needs into a single RunConfig value.
    config = RunConfig(
        benchmark=benchmark,
        backend=backend,
        model=model,
        max_samples=max_samples,
        max_workers=max_workers,
        temperature=temperature,
        max_tokens=max_tokens,
        judge_model=judge_model,
        judge_engine=judge_engine,
        engine_key=engine_key,
        agent_name=agent_name,
        tools=tool_list,
        output_path=output_path,
        seed=seed,
        dataset_split=dataset_split,
        telemetry=telemetry,
        gpu_metrics=gpu_metrics,
        wandb_project=wandb_project,
        wandb_entity=wandb_entity,
        wandb_tags=wandb_tags,
        wandb_group=wandb_group,
        sheets_spreadsheet_id=sheets_spreadsheet_id,
        sheets_worksheet=sheets_worksheet,
        sheets_credentials_path=sheets_credentials_path,
        episode_mode=episode_mode,
    )

    # Banner + config
    print_banner(console)
    print_section(console, "Configuration")
    print_run_header(
        console,
        benchmark=benchmark,
        model=model,
        backend=backend,
        samples=max_samples,
        workers=max_workers,
    )
    if episode_mode:
        console.print(
            "  [cyan]Mode:[/cyan]       episode "
            "(sequential + lifelong learning)"
        )

    if agentic:
        # --- Agentic runner path ---
        print_section(console, "Agentic Evaluation")
        console.print(
            f"  [cyan]Concurrency:[/cyan] {concurrency}"
        )
        # NOTE(review): truthiness check — a --query-timeout of 0 is falsy
        # and would not be echoed here; confirm that is intended.
        if query_timeout:
            console.print(
                f"  [cyan]Timeout:[/cyan]     {query_timeout}s per query"
            )
        _run_agentic(
            config, console=console,
            concurrency=concurrency,
            query_timeout=query_timeout,
        )
        return

    # Evaluation
    print_section(console, "Evaluation")
    summary = _run_single(config, console=console)

    # Results. _run_single appears to stash the resolved output/trace paths
    # as private attributes on the summary; getattr tolerates their absence.
    _output_path = getattr(summary, "_output_path", None)
    _traces_dir = getattr(summary, "_traces_dir", None)
    _print_summary(
        summary,
        console=console,
        output_path=_output_path,
        traces_dir=_traces_dir,
        compact=compact,
        trace_detail=trace_detail,
    )

run_all

run_all(model, engine_key, max_samples, max_workers, judge_model, output_dir, seed, verbose)

Run all benchmarks.

Source code in src/openjarvis/evals/cli.py
@main.command("run-all")
@click.option("-m", "--model", required=True, help="Model identifier")
@click.option("-e", "--engine", "engine_key", default=None,
              help="Engine key")
@click.option("-n", "--max-samples", type=int, default=None,
              help="Max samples per benchmark")
@click.option("-w", "--max-workers", type=int, default=4,
              help="Parallel workers")
@click.option("--judge-model", default="gpt-5-mini-2025-08-07", help="LLM judge model")
@click.option("--output-dir", default="results/",
              help="Output directory for results")
@click.option("--seed", type=int, default=42, help="Random seed")
@click.option("-v", "--verbose", is_flag=True, help="Verbose logging")
def run_all(model, engine_key, max_samples, max_workers, judge_model,
            output_dir, seed, verbose):
    """Run all benchmarks."""
    _setup_logging(verbose)

    from openjarvis.evals.core.runner import EvalRunner
    from openjarvis.evals.core.types import RunConfig

    console = Console()

    print_banner(console)
    print_section(console, "Suite Configuration")
    console.print(
        f"  [cyan]Model:[/cyan]      {model}\n"
        f"  [cyan]Benchmarks:[/cyan] {', '.join(BENCHMARKS.keys())}\n"
        f"  [cyan]Samples:[/cyan]    {max_samples if max_samples else 'all'}"
    )

    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)

    # Make the model id filesystem-safe for use in output filenames
    # (e.g. "org/model:tag" -> "org-model-tag").
    model_slug = model.replace("/", "-").replace(":", "-")
    summaries = []

    for i, bench_name in enumerate(BENCHMARKS, 1):
        print_section(console, f"Run {i}/{len(BENCHMARKS)}: {bench_name}")
        output_path = output_dir_path / f"{bench_name}_{model_slug}.jsonl"

        # One RunConfig per benchmark; backend is fixed to jarvis-direct
        # here (unlike `run`, which exposes --backend).
        config = RunConfig(
            benchmark=bench_name,
            backend="jarvis-direct",
            model=model,
            max_samples=max_samples,
            max_workers=max_workers,
            judge_model=judge_model,
            engine_key=engine_key,
            output_path=str(output_path),
            seed=seed,
        )

        # Fresh backend/dataset/judge/scorer per benchmark; the backends are
        # closed in the finally block below regardless of success or failure.
        eval_backend = _build_backend("jarvis-direct", engine_key, "orchestrator", [])
        dataset = _build_dataset(bench_name)
        judge_backend = _build_judge_backend(judge_model, engine_key="cloud")
        scorer = _build_scorer(bench_name, judge_backend, judge_model)

        trackers = _build_trackers(config)
        runner = EvalRunner(config, dataset, eval_backend, scorer, trackers=trackers)
        try:
            if max_samples and max_samples > 0:
                # Known total -> full progress bar with ETA.
                with Progress(
                    SpinnerColumn(),
                    TextColumn("[progress.description]{task.description}"),
                    BarColumn(),
                    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                    TimeRemainingColumn(),
                    console=console,
                ) as progress:
                    task = progress.add_task(
                        f"Evaluating {bench_name}...", total=max_samples,
                    )
                    summary = runner.run(
                        progress_callback=lambda done, total: progress.update(
                            task, completed=done,
                        ),
                    )
            else:
                # Unknown total -> simple spinner status instead of a bar.
                with console.status(f"Evaluating {bench_name}..."):
                    summary = runner.run()
            summaries.append(summary)
            console.print(
                f"  [green]{summary.accuracy:.4f}[/green] "
                f"({summary.correct}/{summary.scored_samples})"
            )
        except Exception as exc:
            # Deliberately broad: one failing benchmark must not abort the
            # rest of the suite. The failure is reported and we move on.
            console.print(f"  [red bold]FAILED:[/red bold] {exc}")
        finally:
            eval_backend.close()
            if judge_backend is not None:
                judge_backend.close()

    # Print overall summary
    if summaries:
        print_section(console, "Suite Results")
        print_suite_summary(console, summaries, f"All Benchmarks / {model}")

summarize

summarize(jsonl_path)

Summarize results from a JSONL output file.

Source code in src/openjarvis/evals/cli.py
@main.command()
@click.argument("jsonl_path", type=click.Path(exists=True))
def summarize(jsonl_path):
    """Summarize results from a JSONL output file.

    Reads one JSON record per non-blank line and prints aggregate stats:
    benchmark/model (taken from the first record), total record count,
    scored and correct counts, accuracy, and error count.
    """
    # Explicit UTF-8: result files written on one platform must summarize
    # identically on another (the default encoding is locale-dependent).
    with open(jsonl_path, encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    if not records:
        click.echo("No records found.")
        return

    console = Console()
    total = len(records)
    # A record is "scored" when is_correct is a boolean verdict;
    # None marks unscored samples and is excluded from accuracy.
    scored = [r for r in records if r.get("is_correct") is not None]
    correct = [r for r in scored if r["is_correct"]]
    errors = [r for r in records if r.get("error")]
    accuracy = len(correct) / len(scored) if scored else 0.0

    console.print(f"[cyan]File:[/cyan]      {jsonl_path}")
    console.print(f"[cyan]Benchmark:[/cyan] {records[0].get('benchmark', '?')}")
    console.print(f"[cyan]Model:[/cyan]     {records[0].get('model', '?')}")
    console.print(f"[cyan]Total:[/cyan]     {total}")
    console.print(f"[cyan]Scored:[/cyan]    {len(scored)}")
    console.print(f"[cyan]Correct:[/cyan]   {len(correct)}")
    console.print(f"[cyan]Accuracy:[/cyan]  [bold]{accuracy:.4f}[/bold]")
    console.print(f"[cyan]Errors:[/cyan]    {len(errors)}")

list_cmd

list_cmd()

List available benchmarks and backends.

Source code in src/openjarvis/evals/cli.py
@main.command("list")
def list_cmd():
    """List available benchmarks and backends."""
    console = Console()
    print_banner(console)

    from rich.table import Table

    def _styled_table(title):
        # Both listings share the same border and title styling.
        return Table(
            title=f"[bold]{title}[/bold]",
            border_style="bright_blue",
            title_style="bold cyan",
        )

    benchmarks = _styled_table("Available Benchmarks")
    benchmarks.add_column("Name", style="cyan", no_wrap=True)
    benchmarks.add_column("Category", style="white")
    benchmarks.add_column("Description")
    for bench_name, meta in BENCHMARKS.items():
        benchmarks.add_row(bench_name, meta["category"], meta["description"])
    console.print(benchmarks)

    backends = _styled_table("Available Backends")
    backends.add_column("Name", style="cyan", no_wrap=True)
    backends.add_column("Description")
    for backend_name, description in BACKENDS.items():
        backends.add_row(backend_name, description)
    console.print(backends)