Skip to content

trial_runner

trial_runner

TrialRunner -- evaluates a proposed config against a benchmark.

Classes

BenchmarkSpec dataclass

BenchmarkSpec(benchmark: str, max_samples: int = 200, weight: float = 1.0)

Specification for one benchmark in a multi-benchmark optimization.

TrialRunner

TrialRunner(benchmark: str, max_samples: int = 50, judge_model: str = 'gpt-5-mini-2025-08-07', output_dir: str = 'results/optimize/')

Evaluates a proposed config against a benchmark.

Bridges the optimization types (`TrialConfig`) to the eval framework (`EvalRunner`) so the optimizer can score candidate configurations end-to-end.

Source code in src/openjarvis/learning/optimize/trial_runner.py
def __init__(
    self,
    benchmark: str,
    max_samples: int = 50,
    judge_model: str = "gpt-5-mini-2025-08-07",
    output_dir: str = "results/optimize/",
) -> None:
    """Store the evaluation settings used by every trial run.

    Args:
        benchmark: Name of the benchmark this runner evaluates against.
        max_samples: Cap on the number of samples evaluated per trial.
        judge_model: Model identifier used for judge-based scoring.
        output_dir: Directory where trial artifacts are written.
    """
    # Plain configuration holder -- no I/O or validation happens here.
    self.output_dir = output_dir
    self.judge_model = judge_model
    self.max_samples = max_samples
    self.benchmark = benchmark
Functions
run_trial
run_trial(trial: TrialConfig) -> TrialResult

Run trial against the configured benchmark and return a result.

Steps:

1. Convert trial to a `Recipe` and extract params.
2. Build a `RunConfig` from recipe + benchmark settings.
3. Lazily import eval-framework registries to resolve the benchmark -> dataset + scorer, and build the backend.
4. Execute via `EvalRunner.run()` -> `RunSummary`.
5. Map the summary into a `TrialResult`.

Source code in src/openjarvis/learning/optimize/trial_runner.py
def run_trial(self, trial: TrialConfig) -> TrialResult:
    """Run *trial* against the configured benchmark and return a result.

    Steps:
    1. Convert ``trial`` to a :class:`Recipe` and extract params.
    2. Build a :class:`RunConfig` from recipe + benchmark settings.
    3. Lazily import eval-framework registries to resolve the
       benchmark -> dataset + scorer, and build the backend.
    4. Execute via ``EvalRunner.run()`` -> :class:`RunSummary`.
    5. Map the summary into a :class:`TrialResult`.
    """
    recipe = trial.to_recipe()
    run_config = self._build_run_config(trial, recipe)

    # Lazy imports so the optimize package stays lightweight
    from openjarvis.evals.cli import (
        _build_backend,
        _build_dataset,
        _build_judge_backend,
        _build_scorer,
    )
    from openjarvis.evals.core.runner import EvalRunner

    dataset = _build_dataset(self.benchmark)
    backend = _build_backend(
        run_config.backend,
        run_config.engine_key,
        run_config.agent_name or "orchestrator",
        run_config.tools,
    )
    # Everything after the model backend exists must run under try/finally:
    # previously, a failure in _build_judge_backend/_build_scorer leaked the
    # backend, and a raising backend.close() skipped judge_backend.close().
    judge_backend = None
    try:
        judge_backend = _build_judge_backend(run_config.judge_model)
        scorer = _build_scorer(
            self.benchmark, judge_backend, run_config.judge_model,
        )
        eval_runner = EvalRunner(
            run_config, dataset, backend, scorer,
        )
        summary: RunSummary = eval_runner.run()
        eval_results = eval_runner.results
    finally:
        # Close in the original order (backend first), but nest the finally
        # so a failure closing one resource cannot skip the other.
        try:
            backend.close()
        finally:
            if judge_backend is not None:
                judge_backend.close()

    return self._summary_to_result(trial, summary, eval_results=eval_results)

MultiBenchTrialRunner

MultiBenchTrialRunner(benchmark_specs: List[BenchmarkSpec], judge_model: str = 'gpt-5-mini-2025-08-07', output_dir: str = 'results/optimize/')

Evaluates a proposed config across multiple benchmarks.

Delegates to `TrialRunner` per benchmark, then aggregates results into a single composite `TrialResult` with weighted metrics and per-benchmark breakdowns.

Source code in src/openjarvis/learning/optimize/trial_runner.py
def __init__(
    self,
    benchmark_specs: List[BenchmarkSpec],
    judge_model: str = "gpt-5-mini-2025-08-07",
    output_dir: str = "results/optimize/",
) -> None:
    """Store the multi-benchmark evaluation settings.

    Args:
        benchmark_specs: One spec per benchmark to evaluate, including
            its sample cap and aggregation weight.
        judge_model: Model identifier used for judge-based scoring.
        output_dir: Directory where trial artifacts are written.
    """
    # Plain configuration holder -- no I/O or validation happens here.
    self.output_dir = output_dir
    self.judge_model = judge_model
    self.benchmark_specs = benchmark_specs
Functions
run_trial
run_trial(trial: TrialConfig) -> TrialResult

Run trial against all benchmarks and return a composite result.

Source code in src/openjarvis/learning/optimize/trial_runner.py
def run_trial(self, trial: TrialConfig) -> TrialResult:
    """Run *trial* against all benchmarks and return a composite result."""

    def _score_for(spec: BenchmarkSpec) -> BenchmarkScore:
        # terminalbench-native has its own dedicated execution path.
        if spec.benchmark == "terminalbench-native":
            return self._run_terminalbench_native(trial, spec)

        # All other benchmarks delegate to a per-benchmark TrialRunner.
        sub_runner = TrialRunner(
            benchmark=spec.benchmark,
            max_samples=spec.max_samples,
            judge_model=self.judge_model,
            output_dir=self.output_dir,
        )
        sub_result = sub_runner.run_trial(trial)
        return BenchmarkScore(
            benchmark=spec.benchmark,
            accuracy=sub_result.accuracy,
            mean_latency_seconds=sub_result.mean_latency_seconds,
            total_cost_usd=sub_result.total_cost_usd,
            total_energy_joules=sub_result.total_energy_joules,
            total_tokens=sub_result.total_tokens,
            samples_evaluated=sub_result.samples_evaluated,
            errors=sum(1 for s in sub_result.sample_scores if s.error),
            weight=spec.weight,
            summary=sub_result.summary,
            sample_scores=sub_result.sample_scores,
        )

    scores = [_score_for(spec) for spec in self.benchmark_specs]
    return self._aggregate(trial, scores)