Skip to content

Index

optimize

Backward-compatibility shim -- optimize moved to learning.optimize.

Classes

LLMOptimizer

LLMOptimizer(search_space: SearchSpace, optimizer_model: str = 'claude-sonnet-4-6', optimizer_backend: Optional[InferenceBackend] = None)

Uses a cloud LLM to propose optimal OpenJarvis configs.

Inspired by DSPy's GEPA: uses textual feedback from execution traces rather than just scalar rewards.

Source code in src/openjarvis/learning/optimize/llm_optimizer.py
def __init__(
    self,
    search_space: SearchSpace,
    optimizer_model: str = "claude-sonnet-4-6",
    optimizer_backend: Optional[InferenceBackend] = None,
) -> None:
    """Store the search space and the LLM used to propose configs.

    Args:
        search_space: The tunable config space the optimizer explores.
        optimizer_model: Model name passed to the backend on every
            ``generate`` call.
        optimizer_backend: Backend used to call the optimizer LLM. May be
            ``None``; the propose/analyze methods then raise ``ValueError``.
    """
    # Plain attribute assignment; the backend is validated lazily by each
    # propose_*/analyze_* method rather than here.
    self.search_space = search_space
    self.optimizer_model = optimizer_model
    self.optimizer_backend = optimizer_backend
Functions
propose_initial
propose_initial() -> TrialConfig

Propose a reasonable starting config from the search space.

Source code in src/openjarvis/learning/optimize/llm_optimizer.py
def propose_initial(self) -> TrialConfig:
    """Ask the optimizer LLM for a sensible first config to evaluate.

    Raises:
        ValueError: If no optimizer backend was configured.
    """
    backend = self.optimizer_backend
    if backend is None:
        raise ValueError(
            "optimizer_backend is required to propose configurations"
        )

    # One-shot generation from the search-space description, then parse
    # the model's reply into a TrialConfig.
    reply = backend.generate(
        self._build_initial_prompt(),
        model=self.optimizer_model,
        system="You are an expert AI systems optimizer.",
        temperature=0.7,
        max_tokens=2048,
    )
    return self._parse_config_response(reply)
propose_next
propose_next(history: List[TrialResult], traces: Optional[List[Trace]] = None, frontier_ids: Optional[set] = None) -> TrialConfig

Ask the LLM to propose the next config to evaluate.

Source code in src/openjarvis/learning/optimize/llm_optimizer.py
def propose_next(
    self,
    history: List[TrialResult],
    traces: Optional[List[Trace]] = None,
    frontier_ids: Optional[set] = None,
) -> TrialConfig:
    """Propose the next candidate config given the trial *history*.

    Args:
        history: All completed trials so far, in order.
        traces: Optional execution traces used as textual feedback.
        frontier_ids: Trial ids currently on the Pareto frontier.

    Raises:
        ValueError: If no optimizer backend was configured.
    """
    backend = self.optimizer_backend
    if backend is None:
        raise ValueError(
            "optimizer_backend is required to propose configurations"
        )

    # Build the history-aware prompt, then delegate generation + parsing.
    prompt = self._build_propose_prompt(history, traces, frontier_ids=frontier_ids)
    reply = backend.generate(
        prompt,
        model=self.optimizer_model,
        system="You are an expert AI systems optimizer.",
        temperature=0.7,
        max_tokens=2048,
    )
    return self._parse_config_response(reply)
analyze_trial
analyze_trial(trial: TrialConfig, summary: RunSummary, traces: Optional[List[Trace]] = None, sample_scores: Optional[List[SampleScore]] = None, per_benchmark: Optional[List[BenchmarkScore]] = None) -> TrialFeedback

Ask the LLM to analyze a completed trial. Returns structured feedback.

Source code in src/openjarvis/learning/optimize/llm_optimizer.py
def analyze_trial(
    self,
    trial: TrialConfig,
    summary: RunSummary,
    traces: Optional[List[Trace]] = None,
    sample_scores: Optional[List[SampleScore]] = None,
    per_benchmark: Optional[List[BenchmarkScore]] = None,
) -> TrialFeedback:
    """Have the optimizer LLM critique a finished trial.

    Returns structured feedback parsed from the model's reply.

    Raises:
        ValueError: If no optimizer backend was configured.
    """
    backend = self.optimizer_backend
    if backend is None:
        raise ValueError(
            "optimizer_backend is required to analyze trials"
        )

    analysis_prompt = self._build_analyze_prompt(
        trial, summary, traces, sample_scores, per_benchmark,
    )
    # Lower temperature than the propose_* paths: analysis should be
    # consistent rather than exploratory.
    reply = backend.generate(
        analysis_prompt,
        model=self.optimizer_model,
        system="You are an expert AI systems analyst.",
        temperature=0.3,
        max_tokens=2048,
    )
    return self._parse_feedback_response(reply)
propose_targeted
propose_targeted(history: List[TrialResult], base_config: TrialConfig, target_primitive: str, frontier_ids: Optional[set] = None) -> TrialConfig

Propose a config that only changes one primitive.

Source code in src/openjarvis/learning/optimize/llm_optimizer.py
def propose_targeted(
    self,
    history: List[TrialResult],
    base_config: TrialConfig,
    target_primitive: str,
    frontier_ids: Optional[set] = None,
) -> TrialConfig:
    """Propose a config that only changes one primitive.

    Args:
        history: All completed trials so far.
        base_config: Config whose non-target params must be preserved.
        target_primitive: Primitive name whose params may change.
        frontier_ids: Trial ids currently on the Pareto frontier.

    Returns:
        The LLM's proposal with every non-target param reset to the
        value from *base_config*.

    Raises:
        ValueError: If no optimizer backend was configured.
    """
    if self.optimizer_backend is None:
        raise ValueError(
            "optimizer_backend is required to propose configurations"
        )

    prompt = self._build_targeted_prompt(
        history, base_config, target_primitive, frontier_ids,
    )
    response = self.optimizer_backend.generate(
        prompt,
        model=self.optimizer_model,
        system="You are an expert AI systems optimizer.",
        temperature=0.7,
        max_tokens=2048,
    )
    proposed = self._parse_config_response(response)

    # Enforce constraint: keep every non-target param from base_config and
    # accept only the LLM's changes under the targeted primitive, matching
    # both plural and singular key prefixes (e.g. "tools." and "tool.").
    # BUG FIX: the previous code used ``rstrip("s")``, which strips *all*
    # trailing "s" characters ("process" -> "proce") and could admit
    # unrelated keys; ``removesuffix("s")`` drops at most one.
    allowed_prefixes = (
        target_primitive + ".",
        target_primitive.removesuffix("s") + ".",
    )
    merged_params = dict(base_config.params)
    for key, value in proposed.params.items():
        if key.startswith(allowed_prefixes):
            merged_params[key] = value
    proposed.params = merged_params
    return proposed
propose_merge
propose_merge(candidates: List[TrialResult], history: List[TrialResult], frontier_ids: Optional[set] = None) -> TrialConfig

Combine best aspects of frontier members into one config.

Source code in src/openjarvis/learning/optimize/llm_optimizer.py
def propose_merge(
    self,
    candidates: List[TrialResult],
    history: List[TrialResult],
    frontier_ids: Optional[set] = None,
) -> TrialConfig:
    """Ask the LLM to fuse the strongest aspects of *candidates* into one config.

    Raises:
        ValueError: If no optimizer backend was configured.
    """
    backend = self.optimizer_backend
    if backend is None:
        raise ValueError(
            "optimizer_backend is required to propose configurations"
        )

    merge_prompt = self._build_merge_prompt(candidates, history, frontier_ids)
    reply = backend.generate(
        merge_prompt,
        model=self.optimizer_model,
        system="You are an expert AI systems optimizer.",
        temperature=0.7,
        max_tokens=2048,
    )
    return self._parse_config_response(reply)

OptimizationEngine

OptimizationEngine(search_space: SearchSpace, llm_optimizer: LLMOptimizer, trial_runner: TrialRunner, store: Optional[OptimizationStore] = None, max_trials: int = 20, early_stop_patience: int = 5)

Orchestrates the optimize loop: propose -> evaluate -> analyze -> repeat.

Source code in src/openjarvis/learning/optimize/optimizer.py
def __init__(
    self,
    search_space: SearchSpace,
    llm_optimizer: LLMOptimizer,
    trial_runner: TrialRunner,
    store: Optional[OptimizationStore] = None,
    max_trials: int = 20,
    early_stop_patience: int = 5,
) -> None:
    """Wire together the components of the optimization loop.

    Args:
        search_space: Space of configs the optimizer may propose.
        llm_optimizer: Proposes and analyzes candidate configs.
        trial_runner: Evaluates one config and returns a TrialResult.
        store: Optional persistence layer; when ``None`` nothing is saved.
        max_trials: Upper bound on the number of evaluated trials.
        early_stop_patience: Stop after this many consecutive trials
            without an accuracy improvement.
    """
    self.search_space = search_space
    self.llm_optimizer = llm_optimizer
    self.trial_runner = trial_runner
    self.store = store
    self.max_trials = max_trials
    self.early_stop_patience = early_stop_patience
Functions
run
run(progress_callback: Optional[Callable[[int, int], None]] = None) -> OptimizationRun

Execute the full optimization loop.

  1. Generate a run_id via uuid.
  2. llm_optimizer.propose_initial() -> first config.
  3. Loop up to max_trials:
     a. trial_runner.run_trial(config) -> TrialResult
     b. llm_optimizer.analyze_trial(config, summary, traces)
     c. Update TrialResult with analysis text
     d. Append to history
     e. If store, store.save_trial(result)
     f. Update best_trial if accuracy improved
     g. Check early stopping (no improvement for patience trials)
     h. If not stopped, llm_optimizer.propose_next(history)
  4. Set run status to "completed".
  5. If store, store.save_run(optimization_run).
  6. Return the :class:OptimizationRun.

Args:
    progress_callback: Optional (trial_num, max_trials) -> None callback, called after each trial completes.

Source code in src/openjarvis/learning/optimize/optimizer.py
def run(
    self,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> OptimizationRun:
    """Execute the full optimization loop.

    1. Generate a run_id via uuid.
    2. ``llm_optimizer.propose_initial()`` -> first config.
    3. Loop up to ``max_trials``:
       a. ``trial_runner.run_trial(config)`` -> TrialResult
       b. ``llm_optimizer.analyze_trial(config, summary, traces)``
       c. Update TrialResult with analysis text
       d. Append to history
       e. If store, ``store.save_trial(result)``
       f. Update best_trial if accuracy improved
       g. Check early stopping (no improvement for *patience* trials)
       h. If not stopped, ``llm_optimizer.propose_next(history)``
    4. Set run status to ``"completed"``.
    5. If store, ``store.save_run(optimization_run)``.
    6. Return the :class:`OptimizationRun`.

    Args:
        progress_callback: Optional ``(trial_num, max_trials) -> None``
            called after each trial completes.
    """
    # 16-hex-char run id derived from a random UUID.
    run_id = uuid.uuid4().hex[:16]
    # Detect benchmark name(s) from the trial runner
    from openjarvis.learning.optimize.trial_runner import MultiBenchTrialRunner

    benchmark_name = getattr(self.trial_runner, "benchmark", "")
    benchmark_names: List[str] = []
    if isinstance(self.trial_runner, MultiBenchTrialRunner):
        benchmark_names = [
            s.benchmark for s in self.trial_runner.benchmark_specs
        ]
        # Composite label, e.g. "gsm8k+mmlu".
        benchmark_name = "+".join(benchmark_names)

    optimization_run = OptimizationRun(
        run_id=run_id,
        search_space=self.search_space,
        status="running",
        optimizer_model=self.llm_optimizer.optimizer_model,
        benchmark=benchmark_name,
        benchmarks=benchmark_names,
    )

    history: List[TrialResult] = []
    # -1.0 guarantees the first trial always becomes the initial best.
    best_accuracy = -1.0
    trials_without_improvement = 0

    # First config
    config = self.llm_optimizer.propose_initial()

    for trial_num in range(1, self.max_trials + 1):
        LOGGER.info(
            "Trial %d/%d (id=%s)",
            trial_num,
            self.max_trials,
            config.trial_id,
        )

        # Evaluate
        result = self.trial_runner.run_trial(config)

        # Analyze — returns TrialFeedback
        if result.summary is not None:
            feedback = self.llm_optimizer.analyze_trial(
                config,
                result.summary,
                sample_scores=result.sample_scores or None,
                per_benchmark=result.per_benchmark or None,
            )
            result.structured_feedback = feedback
            result.analysis = feedback.summary_text
        elif result.per_benchmark:
            # Multi-benchmark composite: build a synthetic summary
            # for analysis from per_benchmark data
            from openjarvis.evals.core.types import RunSummary as _RS

            synth = _RS(
                benchmark="multi",
                category="multi",
                backend="jarvis-agent",
                model=result.config.params.get("intelligence.model", ""),
                accuracy=result.accuracy,
                mean_latency_seconds=result.mean_latency_seconds,
                total_cost_usd=result.total_cost_usd,
                total_energy_joules=result.total_energy_joules,
                total_samples=result.samples_evaluated,
                scored_samples=result.samples_evaluated,
                # NOTE(review): int() truncates; accuracy * samples may
                # undercount by one due to float rounding — confirm acceptable.
                correct=int(
                    result.accuracy * result.samples_evaluated
                ),
                errors=0,
                total_input_tokens=0,
                total_output_tokens=result.total_tokens,
            )
            feedback = self.llm_optimizer.analyze_trial(
                config,
                synth,
                per_benchmark=result.per_benchmark,
            )
            result.structured_feedback = feedback
            result.analysis = feedback.summary_text
        else:
            # Neither a summary nor per-benchmark data: nothing to analyze.
            result.analysis = ""

        # Record
        history.append(result)
        optimization_run.trials.append(result)

        # Recompute Pareto frontier
        optimization_run.pareto_frontier = compute_pareto_frontier(
            history, optimization_run.objectives,
        )
        frontier_ids = {t.trial_id for t in optimization_run.pareto_frontier}

        # Persist trial
        if self.store is not None:
            self.store.save_trial(run_id, result)

        # Track best (scalar accuracy only; frontier handles the rest)
        if result.accuracy > best_accuracy:
            best_accuracy = result.accuracy
            optimization_run.best_trial = result
            trials_without_improvement = 0
        else:
            trials_without_improvement += 1

        # Progress callback
        if progress_callback is not None:
            progress_callback(trial_num, self.max_trials)

        # Early stopping
        if trials_without_improvement >= self.early_stop_patience:
            LOGGER.info(
                "Early stopping after %d trials without improvement.",
                self.early_stop_patience,
            )
            break

        # Propose next (unless this was the last trial)
        if trial_num < self.max_trials:
            # Decide proposal strategy
            target_primitive = ""
            if result.structured_feedback:
                target_primitive = result.structured_feedback.target_primitive

            if (
                trial_num % 5 == 0
                and len(optimization_run.pareto_frontier) >= 2
            ):
                # Merge frontier members periodically
                candidates = optimization_run.pareto_frontier[:3]
                config = self.llm_optimizer.propose_merge(
                    candidates, history, frontier_ids=frontier_ids,
                )
            elif target_primitive and trial_num > 2:
                # Targeted mutation on the suggested primitive
                config = self.llm_optimizer.propose_targeted(
                    history,
                    result.config,
                    target_primitive,
                    frontier_ids=frontier_ids,
                )
            else:
                config = self.llm_optimizer.propose_next(
                    history, frontier_ids=frontier_ids,
                )

    # "completed" is also set after an early stop — only the loop exit
    # path differs, not the recorded status.
    optimization_run.status = "completed"

    if self.store is not None:
        self.store.save_run(optimization_run)

    return optimization_run
export_best_recipe
export_best_recipe(run: OptimizationRun, path: Path) -> Path

Export the best trial's config as a TOML recipe file.

Args:
    run: A completed :class:`OptimizationRun`.
    path: Destination path for the TOML file.

Returns: The path written to.

Raises: ValueError: If there is no best trial in the run.

Source code in src/openjarvis/learning/optimize/optimizer.py
def export_best_recipe(
    self, run: OptimizationRun, path: Path
) -> Path:
    """Write the best trial's configuration to *path* as a TOML recipe.

    Args:
        run: A completed :class:`OptimizationRun`.
        path: Destination path for the TOML file.

    Returns:
        The path written to.

    Raises:
        ValueError: If there is no best trial in the run.
    """
    best = run.best_trial
    if best is None:
        raise ValueError("No best trial to export.")

    payload = self._trial_to_recipe_dict(best)
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    if tomli_w is None:
        # tomli_w unavailable: emit TOML via the hand-rolled writer.
        self._write_toml_fallback(payload, destination)
    else:
        with open(destination, "wb") as fh:
            tomli_w.dump(payload, fh)

    run.best_recipe_path = str(destination)
    return destination

OptimizationStore

OptimizationStore(db_path: Union[str, Path])

SQLite-backed storage for optimization runs and trials.

Source code in src/openjarvis/learning/optimize/store.py
def __init__(self, db_path: Union[str, Path]) -> None:
    """Open (or create) the SQLite database at *db_path* and ensure schema.

    Args:
        db_path: Filesystem path to the SQLite file; a ``Path`` is
            accepted and stringified.
    """
    self._db_path = str(db_path)
    self._conn = sqlite3.connect(self._db_path)
    # WAL mode allows concurrent readers while a writer is active.
    self._conn.execute("PRAGMA journal_mode=WAL")
    self._conn.execute(_CREATE_RUNS)
    self._conn.execute(_CREATE_TRIALS)
    self._conn.commit()
    # Apply in-place schema migrations (defined elsewhere in store.py).
    self._migrate()
Functions
save_run
save_run(run: OptimizationRun) -> None

Persist an optimization run (insert or update).

Source code in src/openjarvis/learning/optimize/store.py
def save_run(self, run: OptimizationRun) -> None:
    """Persist an optimization run (insert or update)."""
    timestamp = time.time()
    best_id = None if run.best_trial is None else run.best_trial.trial_id

    self._conn.execute(
        _INSERT_RUN,
        (
            run.run_id,
            self._search_space_to_json(run.search_space),
            run.status,
            run.optimizer_model,
            run.benchmark,
            best_id,
            run.best_recipe_path,
            timestamp,
            timestamp,
        ),
    )

    # pareto_frontier_ids and benchmarks are written via a separate UPDATE.
    frontier_json = json.dumps([t.trial_id for t in run.pareto_frontier])
    self._conn.execute(
        "UPDATE optimization_runs SET pareto_frontier_ids = ?, "
        "benchmarks = ? WHERE run_id = ?",
        (frontier_json, json.dumps(run.benchmarks), run.run_id),
    )
    self._conn.commit()
get_run
get_run(run_id: str) -> Optional[OptimizationRun]

Retrieve an optimization run by id, or None.

Source code in src/openjarvis/learning/optimize/store.py
def get_run(self, run_id: str) -> Optional[OptimizationRun]:
    """Look up a persisted run by *run_id*; ``None`` when absent."""
    cursor = self._conn.execute(
        "SELECT * FROM optimization_runs WHERE run_id = ?",
        (run_id,),
    )
    record = cursor.fetchone()
    return None if record is None else self._row_to_run(record)
list_runs
list_runs(limit: int = 50) -> List[Dict[str, Any]]

Return summary dicts of recent optimization runs.

Source code in src/openjarvis/learning/optimize/store.py
def list_runs(self, limit: int = 50) -> List[Dict[str, Any]]:
    """Return summary dicts of recent optimization runs."""
    rows = self._conn.execute(
        "SELECT * FROM optimization_runs ORDER BY created_at DESC LIMIT ?",
        (limit,),
    ).fetchall()
    # Column positions mirror the optimization_runs table layout; indices
    # 0 and 2 (surrogate id and search-space JSON) are deliberately skipped.
    columns = (
        ("run_id", 1),
        ("status", 3),
        ("optimizer_model", 4),
        ("benchmark", 5),
        ("best_trial_id", 6),
        ("best_recipe_path", 7),
        ("created_at", 8),
        ("updated_at", 9),
    )
    return [
        {name: row[index] for name, index in columns}
        for row in rows
    ]
save_trial
save_trial(run_id: str, trial: TrialResult) -> None

Persist a single trial result.

Source code in src/openjarvis/learning/optimize/store.py
def save_trial(self, run_id: str, trial: TrialResult) -> None:
    """Persist a single trial result."""
    created_at = time.time()

    # Sample scores are flattened attribute-for-attribute into JSON.
    score_fields = (
        "record_id", "is_correct", "score", "latency_seconds",
        "prompt_tokens", "completion_tokens", "cost_usd", "error",
        "ttft", "energy_joules", "power_watts", "gpu_utilization_pct",
        "throughput_tok_per_sec", "mfu_pct", "mbu_pct", "ipw", "ipj",
        "energy_per_output_token_joules", "throughput_per_watt",
        "mean_itl_ms",
    )
    scores_json = json.dumps([
        {name: getattr(s, name) for name in score_fields}
        for s in trial.sample_scores
    ])

    # Preserve the original truthiness check: a falsy feedback object
    # serializes as "{}" just like None.
    feedback = trial.structured_feedback
    if feedback:
        fb_json = json.dumps({
            "summary_text": feedback.summary_text,
            "failure_patterns": feedback.failure_patterns,
            "primitive_ratings": feedback.primitive_ratings,
            "suggested_changes": feedback.suggested_changes,
            "target_primitive": feedback.target_primitive,
        })
    else:
        fb_json = "{}"

    self._conn.execute(
        _INSERT_TRIAL,
        (
            trial.trial_id,
            run_id,
            json.dumps(trial.config.params),
            trial.config.reasoning,
            trial.accuracy,
            trial.mean_latency_seconds,
            trial.total_cost_usd,
            trial.total_energy_joules,
            trial.total_tokens,
            trial.samples_evaluated,
            trial.analysis,
            json.dumps(trial.failure_modes),
            created_at,
        ),
    )

    bench_fields = (
        "benchmark", "accuracy", "mean_latency_seconds", "total_cost_usd",
        "total_energy_joules", "total_tokens", "samples_evaluated",
        "errors", "weight",
    )
    pb_json = json.dumps([
        {name: getattr(b, name) for name in bench_fields}
        for b in trial.per_benchmark
    ])

    # sample_scores / structured_feedback / per_benchmark live in newer
    # columns and are written with a follow-up UPDATE.
    self._conn.execute(
        "UPDATE trial_results SET sample_scores = ?, "
        "structured_feedback = ?, per_benchmark = ? "
        "WHERE trial_id = ? AND run_id = ?",
        (scores_json, fb_json, pb_json, trial.trial_id, run_id),
    )
    self._conn.commit()
get_trials
get_trials(run_id: str) -> List[TrialResult]

Retrieve all trial results for a given run.

Source code in src/openjarvis/learning/optimize/store.py
def get_trials(self, run_id: str) -> List[TrialResult]:
    """Retrieve every trial result for *run_id*, ordered by insertion id."""
    cursor = self._conn.execute(
        "SELECT * FROM trial_results WHERE run_id = ? ORDER BY id",
        (run_id,),
    )
    trials = []
    for record in cursor.fetchall():
        trials.append(self._row_to_trial(record))
    return trials
close
close() -> None

Close the underlying SQLite connection.

Source code in src/openjarvis/learning/optimize/store.py
def close(self) -> None:
    """Close the underlying SQLite connection.

    After this call, further store methods will fail on the closed
    connection (``sqlite3.ProgrammingError``).
    """
    self._conn.close()

BenchmarkSpec dataclass

BenchmarkSpec(benchmark: str, max_samples: int = 200, weight: float = 1.0)

Specification for one benchmark in a multi-benchmark optimization.

MultiBenchTrialRunner

MultiBenchTrialRunner(benchmark_specs: List[BenchmarkSpec], judge_model: str = 'gpt-5-mini-2025-08-07', output_dir: str = 'results/optimize/')

Evaluates a proposed config across multiple benchmarks.

Delegates to :class:TrialRunner per benchmark, then aggregates results into a single composite :class:TrialResult with weighted metrics and per-benchmark breakdowns.

Source code in src/openjarvis/learning/optimize/trial_runner.py
def __init__(
    self,
    benchmark_specs: List[BenchmarkSpec],
    judge_model: str = "gpt-5-mini-2025-08-07",
    output_dir: str = "results/optimize/",
) -> None:
    """Configure the multi-benchmark runner.

    Args:
        benchmark_specs: One :class:`BenchmarkSpec` per benchmark to run.
        judge_model: Model name forwarded to each per-benchmark runner
            for LLM-judged scoring.
        output_dir: Directory forwarded to each per-benchmark runner for
            its eval outputs.
    """
    self.benchmark_specs = benchmark_specs
    self.judge_model = judge_model
    self.output_dir = output_dir
Functions
run_trial
run_trial(trial: TrialConfig) -> TrialResult

Run trial against all benchmarks and return a composite result.

Source code in src/openjarvis/learning/optimize/trial_runner.py
def run_trial(self, trial: TrialConfig) -> TrialResult:
    """Evaluate *trial* on every configured benchmark, then aggregate."""
    scores: List[BenchmarkScore] = []

    for spec in self.benchmark_specs:
        if spec.benchmark != "terminalbench-native":
            # Ordinary benchmark: delegate to a single-benchmark runner
            # and repackage its TrialResult as a BenchmarkScore.
            sub_runner = TrialRunner(
                benchmark=spec.benchmark,
                max_samples=spec.max_samples,
                judge_model=self.judge_model,
                output_dir=self.output_dir,
            )
            sub_result = sub_runner.run_trial(trial)
            entry = BenchmarkScore(
                benchmark=spec.benchmark,
                accuracy=sub_result.accuracy,
                mean_latency_seconds=sub_result.mean_latency_seconds,
                total_cost_usd=sub_result.total_cost_usd,
                total_energy_joules=sub_result.total_energy_joules,
                total_tokens=sub_result.total_tokens,
                samples_evaluated=sub_result.samples_evaluated,
                errors=sum(1 for s in sub_result.sample_scores if s.error),
                weight=spec.weight,
                summary=sub_result.summary,
                sample_scores=sub_result.sample_scores,
            )
        else:
            # terminal-bench runs through its own native harness.
            entry = self._run_terminalbench_native(trial, spec)
        scores.append(entry)

    return self._aggregate(trial, scores)

TrialRunner

TrialRunner(benchmark: str, max_samples: int = 50, judge_model: str = 'gpt-5-mini-2025-08-07', output_dir: str = 'results/optimize/')

Evaluates a proposed config against a benchmark.

Bridges the optimization types (:class:TrialConfig) to the eval framework (:class:EvalRunner) so the optimizer can score candidate configurations end-to-end.

Source code in src/openjarvis/learning/optimize/trial_runner.py
def __init__(
    self,
    benchmark: str,
    max_samples: int = 50,
    judge_model: str = "gpt-5-mini-2025-08-07",
    output_dir: str = "results/optimize/",
) -> None:
    """Configure the single-benchmark trial runner.

    Args:
        benchmark: Benchmark key resolved via the eval-framework registries.
        max_samples: Cap on the number of dataset samples evaluated.
        judge_model: Model name used to build the judge backend.
        output_dir: Directory for eval outputs.
    """
    self.benchmark = benchmark
    self.max_samples = max_samples
    self.judge_model = judge_model
    self.output_dir = output_dir
Functions
run_trial
run_trial(trial: TrialConfig) -> TrialResult

Run trial against the configured benchmark and return a result.

Steps:
  1. Convert trial to a :class:`Recipe` and extract params.
  2. Build a :class:`RunConfig` from recipe + benchmark settings.
  3. Lazily import eval-framework registries to resolve the benchmark -> dataset + scorer, and build the backend.
  4. Execute via EvalRunner.run() -> :class:`RunSummary`.
  5. Map the summary into a :class:`TrialResult`.

Source code in src/openjarvis/learning/optimize/trial_runner.py
def run_trial(self, trial: TrialConfig) -> TrialResult:
    """Run *trial* against the configured benchmark and return a result.

    Steps:
    1. Convert ``trial`` to a :class:`Recipe` and extract params.
    2. Build a :class:`RunConfig` from recipe + benchmark settings.
    3. Lazily import eval-framework registries to resolve the
       benchmark -> dataset + scorer, and build the backend.
    4. Execute via ``EvalRunner.run()`` -> :class:`RunSummary`.
    5. Map the summary into a :class:`TrialResult`.
    """
    recipe = trial.to_recipe()
    run_config = self._build_run_config(trial, recipe)

    # Lazy imports so the optimize package stays lightweight
    from openjarvis.evals.cli import (
        _build_backend,
        _build_dataset,
        _build_judge_backend,
        _build_scorer,
    )
    from openjarvis.evals.core.runner import EvalRunner

    dataset = _build_dataset(self.benchmark)
    backend = _build_backend(
        run_config.backend,
        run_config.engine_key,
        run_config.agent_name or "orchestrator",
        run_config.tools,
    )
    # NOTE(review): if _build_judge_backend raises here, `backend` is never
    # closed — consider widening the try block to cover its construction.
    judge_backend = _build_judge_backend(run_config.judge_model)
    scorer = _build_scorer(
        self.benchmark, judge_backend, run_config.judge_model,
    )

    # Ensure both backends are closed even if the eval run raises.
    try:
        eval_runner = EvalRunner(
            run_config, dataset, backend, scorer,
        )
        summary: RunSummary = eval_runner.run()
        eval_results = eval_runner.results
    finally:
        backend.close()
        judge_backend.close()

    return self._summary_to_result(trial, summary, eval_results=eval_results)

BenchmarkScore dataclass

BenchmarkScore(benchmark: str, accuracy: float = 0.0, mean_latency_seconds: float = 0.0, total_cost_usd: float = 0.0, total_energy_joules: float = 0.0, total_tokens: int = 0, samples_evaluated: int = 0, errors: int = 0, weight: float = 1.0, summary: Optional[Any] = None, sample_scores: List['SampleScore'] = list())

Per-benchmark metrics from a multi-benchmark evaluation trial.

ObjectiveSpec dataclass

ObjectiveSpec(metric: str, direction: str, weight: float = 1.0)

A single optimization objective.

OptimizationRun dataclass

OptimizationRun(run_id: str, search_space: SearchSpace, trials: List[TrialResult] = list(), best_trial: Optional[TrialResult] = None, best_recipe_path: Optional[str] = None, status: str = 'running', optimizer_model: str = '', benchmark: str = '', benchmarks: List[str] = list(), pareto_frontier: List[TrialResult] = list(), objectives: List[ObjectiveSpec] = (lambda: list(DEFAULT_OBJECTIVES))())

Complete optimization session.

SampleScore dataclass

SampleScore(record_id: str, is_correct: Optional[bool] = None, score: Optional[float] = None, latency_seconds: float = 0.0, prompt_tokens: int = 0, completion_tokens: int = 0, cost_usd: float = 0.0, error: Optional[str] = None, ttft: float = 0.0, energy_joules: float = 0.0, power_watts: float = 0.0, gpu_utilization_pct: float = 0.0, throughput_tok_per_sec: float = 0.0, mfu_pct: float = 0.0, mbu_pct: float = 0.0, ipw: float = 0.0, ipj: float = 0.0, energy_per_output_token_joules: float = 0.0, throughput_per_watt: float = 0.0, mean_itl_ms: float = 0.0)

Per-sample metrics from an evaluation trial.

SearchDimension dataclass

SearchDimension(name: str, dim_type: str, values: List[Any] = list(), low: Optional[float] = None, high: Optional[float] = None, description: str = '', primitive: str = '')

One tunable dimension in the config space.

SearchSpace dataclass

SearchSpace(dimensions: List[SearchDimension] = list(), fixed: Dict[str, Any] = dict(), constraints: List[str] = list())

The full space of configs the optimizer can propose.

Functions
to_prompt_description
to_prompt_description() -> str

Render search space as structured text for the LLM optimizer.

Source code in src/openjarvis/learning/optimize/types.py
def to_prompt_description(self) -> str:
    """Render the search space as structured markdown text for the LLM."""
    out: List[str] = ["# Search Space", ""]

    # Bucket dimensions by their owning primitive ("other" when unset).
    grouped: Dict[str, List[SearchDimension]] = {}
    for dimension in self.dimensions:
        grouped.setdefault(dimension.primitive or "other", []).append(dimension)

    for primitive_name in sorted(grouped):
        out.append(f"## {primitive_name.title()}")
        for dimension in grouped[primitive_name]:
            out.append(f"- **{dimension.name}** ({dimension.dim_type})")
            if dimension.description:
                out.append(f"  Description: {dimension.description}")
            kind = dimension.dim_type
            if kind == "categorical" or kind == "subset":
                out.append(f"  Options: {dimension.values}")
            elif kind == "continuous" or kind == "integer":
                out.append(f"  Range: [{dimension.low}, {dimension.high}]")
            elif kind == "text":
                out.append("  Free-form text")
        out.append("")

    if self.fixed:
        out.append("## Fixed Parameters")
        out.extend(f"- {k} = {v}" for k, v in sorted(self.fixed.items()))
        out.append("")

    if self.constraints:
        out.append("## Constraints")
        out.extend(f"- {c}" for c in self.constraints)
        out.append("")

    return "\n".join(out)

TrialConfig dataclass

TrialConfig(trial_id: str, params: Dict[str, Any] = dict(), reasoning: str = '')

A single candidate configuration proposed by the optimizer.

Functions
to_recipe
to_recipe() -> Recipe

Map params back to Recipe fields.

Source code in src/openjarvis/learning/optimize/types.py
def to_recipe(self) -> Recipe:
    """Translate dotted param keys back into Recipe constructor kwargs."""
    # Only params with a known mapping make it into the recipe; unknown
    # dotted keys are silently dropped, exactly as before.
    recipe_kwargs: Dict[str, Any] = {
        field: value
        for dotted_key, value in self.params.items()
        if (field := _PARAM_TO_RECIPE.get(dotted_key)) is not None
    }
    return Recipe(name=f"trial-{self.trial_id}", **recipe_kwargs)

TrialFeedback dataclass

TrialFeedback(summary_text: str = '', failure_patterns: List[str] = list(), primitive_ratings: Dict[str, str] = dict(), suggested_changes: List[str] = list(), target_primitive: str = '')

Structured feedback from trial analysis.

TrialResult dataclass

TrialResult(trial_id: str, config: TrialConfig, accuracy: float = 0.0, mean_latency_seconds: float = 0.0, total_cost_usd: float = 0.0, total_energy_joules: float = 0.0, total_tokens: int = 0, samples_evaluated: int = 0, analysis: str = '', failure_modes: List[str] = list(), per_sample_feedback: List[Dict[str, Any]] = list(), summary: Optional[RunSummary] = None, sample_scores: List[SampleScore] = list(), structured_feedback: Optional[TrialFeedback] = None, per_benchmark: List[BenchmarkScore] = list())

Result of evaluating a trial, with both scalar and textual feedback.

Functions

load_benchmark_specs

load_benchmark_specs(data: Dict[str, Any]) -> List[Any]

Extract benchmark specs from a loaded optimization config.

Supports two formats: - Multi-benchmark: [[optimize.benchmarks]] array of tables - Single-benchmark fallback: optimize.benchmark string

Returns a list of :class:BenchmarkSpec (from trial_runner). Returns an empty list if no benchmarks are configured (caller should fall back to CLI --benchmark).

Source code in src/openjarvis/learning/optimize/config.py
def load_benchmark_specs(data: Dict[str, Any]) -> List[Any]:
    """Extract benchmark specs from a loaded optimization config.

    Supports two formats:
    - Multi-benchmark: ``[[optimize.benchmarks]]`` array of tables
    - Single-benchmark fallback: ``optimize.benchmark`` string

    Returns a list of :class:`BenchmarkSpec` (from trial_runner), or an
    empty list if no benchmarks are configured (the caller should fall
    back to CLI --benchmark).
    """
    from openjarvis.learning.optimize.trial_runner import BenchmarkSpec

    section = data.get("optimize", {})

    # Preferred format: [[optimize.benchmarks]] table array. A list of
    # plain strings is NOT this format and falls through below.
    benchmarks = section.get("benchmarks")
    if (
        isinstance(benchmarks, list)
        and benchmarks
        and isinstance(benchmarks[0], dict)
    ):
        return [
            BenchmarkSpec(
                benchmark=item.get("name", item.get("benchmark", "")),
                max_samples=item.get("max_samples", 200),
                weight=item.get("weight", 1.0),
            )
            for item in benchmarks
        ]

    # Legacy format: a single benchmark name string.
    single_name = section.get("benchmark", "")
    if not single_name:
        return []
    return [
        BenchmarkSpec(
            benchmark=single_name,
            max_samples=section.get("max_samples", 50),
        )
    ]

load_objectives

load_objectives(data: Dict[str, Any]) -> List[ObjectiveSpec]

Extract objectives from a loaded optimization config.

Reads ``optimize.objectives`` (a list of tables) and returns a list of ``ObjectiveSpec``. Falls back to ``DEFAULT_OBJECTIVES`` if the key is absent.

Source code in src/openjarvis/learning/optimize/config.py
def load_objectives(data: Dict[str, Any]) -> List[ObjectiveSpec]:
    """Read ``optimize.objectives`` from a parsed config dict.

    Each entry must carry a ``metric`` key; ``direction`` defaults to
    ``"maximize"`` and ``weight`` to ``1.0``.  When the key is missing
    or empty, a fresh copy of :data:`DEFAULT_OBJECTIVES` is returned.
    """
    from openjarvis.learning.optimize.types import DEFAULT_OBJECTIVES

    entries = data.get("optimize", {}).get("objectives")
    if not entries:
        # Copy so callers can mutate without touching the module default.
        return list(DEFAULT_OBJECTIVES)

    return [
        ObjectiveSpec(
            metric=entry["metric"],
            direction=entry.get("direction", "maximize"),
            weight=entry.get("weight", 1.0),
        )
        for entry in entries
    ]

load_optimize_config

load_optimize_config(path: Union[str, Path]) -> Dict[str, Any]

Load an optimization config TOML file.

Returns the raw dict with keys such as optimize.max_trials, optimize.benchmark, optimize.search, optimize.fixed, optimize.constraints, etc.

Raises ``FileNotFoundError`` if *path* does not exist.

Source code in src/openjarvis/learning/optimize/config.py
def load_optimize_config(path: Union[str, Path]) -> Dict[str, Any]:
    """Load an optimization config TOML file.

    Returns the raw dict with keys such as ``optimize.max_trials``,
    ``optimize.benchmark``, ``optimize.search``, ``optimize.fixed``,
    ``optimize.constraints``, etc.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Optimization config not found: {path}")

    with open(path, "rb") as fh:
        data: Dict[str, Any] = tomllib.load(fh)

    return data

compute_pareto_frontier

compute_pareto_frontier(trials: List[TrialResult], objectives: List[ObjectiveSpec]) -> List[TrialResult]

Compute the Pareto frontier: trials not dominated by any other.

A trial A dominates trial B if A is >= B on all objectives and > B on at least one (direction-aware: maximize flips the comparison).

Source code in src/openjarvis/learning/optimize/optimizer.py
def compute_pareto_frontier(
    trials: List[TrialResult],
    objectives: List[ObjectiveSpec],
) -> List[TrialResult]:
    """Return the non-dominated subset of *trials*.

    Trial A dominates trial B when A is at least as good as B on every
    objective and strictly better on at least one.  Objective values
    for "minimize" directions are sign-flipped so that larger is always
    better during comparison.
    """
    # Degenerate inputs: nothing to compare against, so everything survives.
    if not trials or not objectives:
        return list(trials)

    def _score_vector(trial: TrialResult) -> List[float]:
        # Negate minimize-objectives so every coordinate is "higher wins".
        vector: List[float] = []
        for obj in objectives:
            value = _get_objective_value(trial, obj)
            vector.append(-value if obj.direction == "minimize" else value)
        return vector

    scores = [_score_vector(t) for t in trials]

    def _is_dominated(idx: int) -> bool:
        mine = scores[idx]
        for other_idx, theirs in enumerate(scores):
            if other_idx == idx:
                continue
            at_least_as_good = all(a >= b for a, b in zip(theirs, mine))
            strictly_better = any(a > b for a, b in zip(theirs, mine))
            if at_least_as_good and strictly_better:
                return True
        return False

    return [trial for i, trial in enumerate(trials) if not _is_dominated(i)]

build_search_space

build_search_space(config: Dict[str, Any]) -> SearchSpace

Build a SearchSpace from a TOML-style config dict.

Expected format::

{
    "optimize": {
        "search": [
            {
                "name": "agent.type",
                "type": "categorical",
                "values": ["orchestrator", "native_react"],
                "description": "Agent architecture",
            },
            {
                "name": "intelligence.temperature",
                "type": "continuous",
                "low": 0.0,
                "high": 1.0,
                "description": "Generation temperature",
            },
        ],
        "fixed": {"engine": "ollama", "model": "qwen3:8b"},
        "constraints": {
            "rules": ["SimpleAgent should only have max_turns = 1"],
        },
    }
}
Source code in src/openjarvis/learning/optimize/search_space.py
def build_search_space(config: Dict[str, Any]) -> SearchSpace:
    """Construct a SearchSpace from a TOML-style config dict.

    Expected format::

        {
            "optimize": {
                "search": [
                    {
                        "name": "agent.type",
                        "type": "categorical",
                        "values": ["orchestrator", "native_react"],
                        "description": "Agent architecture",
                    },
                    {
                        "name": "intelligence.temperature",
                        "type": "continuous",
                        "low": 0.0,
                        "high": 1.0,
                        "description": "Generation temperature",
                    },
                ],
                "fixed": {"engine": "ollama", "model": "qwen3:8b"},
                "constraints": {
                    "rules": ["SimpleAgent should only have max_turns = 1"],
                },
            }
        }
    """
    section = config.get("optimize", {})
    raw_dims: List[Dict[str, Any]] = section.get("search", [])
    fixed_params: Dict[str, Any] = dict(section.get("fixed", {}))
    rules: List[str] = list(section.get("constraints", {}).get("rules", []))

    dims: List[SearchDimension] = []
    for spec in raw_dims:
        dotted = spec.get("name", "")
        # The owning primitive is the prefix before the first dot, if any.
        primitive = dotted.split(".", 1)[0] if "." in dotted else ""
        dims.append(
            SearchDimension(
                name=dotted,
                dim_type=spec.get("type", "categorical"),
                values=list(spec.get("values", [])),
                low=spec.get("low"),
                high=spec.get("high"),
                description=spec.get("description", ""),
                primitive=primitive,
            )
        )

    return SearchSpace(
        dimensions=dims,
        fixed=fixed_params,
        constraints=rules,
    )