terminalbench_native

terminalbench_native ¶

Native TerminalBench V2.1 backend.

Uses Harness for Docker-based execution and scoring.

Classes¶

TerminalBenchNativeBackend ¶

TerminalBenchNativeBackend(model: str = 'openai/default', api_base: str = 'http://localhost:8000/v1', temperature: float = 0.2, agent_name: str = 'naive', output_dir: str = 'results/terminalbench/', max_samples: Optional[int] = None, dataset_name: str = 'terminal-bench-core', dataset_version: str = '0.1.1', system_prompt: str = '', max_tokens: int = 16384, n_concurrent: int = 4, global_agent_timeout_sec: Optional[float] = 1800.0, global_timeout_multiplier: Optional[float] = None)

Bases: InferenceBackend

Runs terminal-bench tasks natively via Harness with Docker execution.

Uses terminal-bench's own agent + LiteLLM to call the model, Docker containers for task execution, and built-in test scripts for scoring. This gives real agentic evaluation, not text-only.

Args of note:

global_agent_timeout_sec: Hard wall-clock bound for each trial's agent phase. terminal-bench runs installed-agent SETUP inside this same budget with an infinite tmux timeout, so this bounds SETUP+RUN together (a setup-only timeout needs an upstream terminal-bench change). When set, it REPLACES each task's own max_agent_timeout_sec. Set None or 0 to fall back to per-task budgets.

global_timeout_multiplier: Scales per-task budgets when global_agent_timeout_sec is not set. None keeps terminal-bench's default (1.0).

Source code in src/openjarvis/evals/backends/terminalbench_native.py

def __init__(
    self,
    model: str = "openai/default",
    api_base: str = "http://localhost:8000/v1",
    temperature: float = 0.2,
    agent_name: str = "naive",
    output_dir: str = "results/terminalbench/",
    max_samples: Optional[int] = None,
    dataset_name: str = "terminal-bench-core",
    dataset_version: str = "0.1.1",
    system_prompt: str = "",
    max_tokens: int = 16384,
    n_concurrent: int = 4,
    global_agent_timeout_sec: Optional[float] = 1800.0,
    global_timeout_multiplier: Optional[float] = None,
) -> None:
    """Record the configuration for a native terminal-bench run.

    Nothing is executed here: every argument is stored on a private
    attribute of the same name, ``output_dir`` is wrapped in a ``Path``,
    and the cached results slot starts out as ``None``.

    Args of note:

    global_agent_timeout_sec: Hard wall-clock bound for each trial's
        agent phase. terminal-bench runs installed-agent SETUP inside
        this same budget with an infinite tmux timeout, so this bounds
        SETUP+RUN together (a setup-only timeout needs an upstream
        terminal-bench change). When set, it REPLACES each task's own
        ``max_agent_timeout_sec``. Set ``None`` or ``0`` to fall back
        to per-task budgets.
    global_timeout_multiplier: Scales per-task budgets when
        ``global_agent_timeout_sec`` is not set. ``None`` keeps
        terminal-bench's default (1.0).

    Raises:
        ImportError: If the module-level ``_HAS_TB`` flag is false, i.e.
            terminal-bench is not available.
    """
    # Fail fast before storing anything when the dependency is missing.
    if not _HAS_TB:
        raise ImportError("terminal-bench is required: pip install terminal-bench")

    # Model endpoint and sampling settings.
    self._model = model
    self._api_base = api_base
    self._temperature = temperature
    self._max_tokens = max_tokens
    self._system_prompt = system_prompt

    # Agent and dataset selection.
    self._agent_name = agent_name
    self._dataset_name = dataset_name
    self._dataset_version = dataset_version
    self._max_samples = max_samples

    # Execution layout: where results go and how many trials run at once.
    self._output_dir = Path(output_dir)
    self._n_concurrent = n_concurrent

    # Timeout controls (see the docstring for how the two interact).
    self._global_agent_timeout_sec = global_agent_timeout_sec
    self._global_timeout_multiplier = global_timeout_multiplier

    # Populated once the harness has been run.
    self._results: Optional[BenchmarkResults] = None

Functions¶

run_harness ¶

run_harness(run_id: str) -> BenchmarkResults

Run the full terminal-bench harness and return results.

Source code in src/openjarvis/evals/backends/terminalbench_native.py

def run_harness(self, run_id: str) -> BenchmarkResults:
    """Execute the terminal-bench harness for ``run_id`` and return its results.

    Creates ``<output_dir>/<run_id>`` if needed, assembles the keyword
    arguments for ``Harness``, passes them through
    ``_check_timeout_kwargs_supported``, runs the harness, and caches the
    returned results on ``self._results``.

    Args:
        run_id: Identifier for this run; also used as the name of the
            output sub-directory.

    Returns:
        Whatever ``Harness.run()`` returns (also stored on
        ``self._results``).
    """
    run_dir = self._output_dir / run_id
    run_dir.mkdir(parents=True, exist_ok=True)

    from terminal_bench.agents.agent_name import AgentName

    # The agent is always terminus-2: it accepts model_name + api_base as
    # serializable strings, which avoids Pydantic serialization issues
    # with LLM objects in the harness lock file.
    kwargs: Dict[str, Any] = dict(
        output_path=run_dir,
        run_id=run_id,
        dataset_name=self._dataset_name,
        dataset_version=self._dataset_version,
        model_name=self._model,
        n_concurrent_trials=self._n_concurrent,
        cleanup=True,
        agent_name=AgentName("terminus-2"),
        agent_kwargs={
            "model_name": self._model,
            "api_base": self._api_base,
            "temperature": self._temperature,
        },
    )

    sample_cap = self._max_samples
    if sample_cap is not None:
        kwargs["n_tasks"] = sample_cap

    # Cap the agent phase of every trial. Left unbounded, a hang during
    # in-container installed-agent SETUP runs under an infinite tmux
    # timeout and is limited only by the budget the task itself declares.
    # A falsy value (None or 0) leaves the per-task budgets in charge.
    agent_budget = self._global_agent_timeout_sec
    if agent_budget:
        kwargs["global_agent_timeout_sec"] = float(agent_budget)

    budget_scale = self._global_timeout_multiplier
    if budget_scale is not None:
        kwargs["global_timeout_multiplier"] = float(budget_scale)

    self._check_timeout_kwargs_supported(kwargs)

    outcome = Harness(**kwargs).run()
    self._results = outcome
    return outcome

Functions¶

summarize_benchmark_results ¶

summarize_benchmark_results(results: Any, *, model: str, benchmark: str = 'terminalbench-native') -> Tuple[RunSummary, List[Dict[str, str]]]

Convert terminal-bench BenchmarkResults into a RunSummary.

Trials are classified into three buckets:

- resolved: is_resolved is True -> counted correct.
- model miss: unresolved, but the model was actually contacted -> counted in the accuracy denominator.
- harness/infra failure: excluded from the accuracy denominator and reported in RunSummary.errors plus the returned failure list.

Zero-model-contact signal choice: terminal-bench 0.2.18 leaves failure_mode UNSET both on clean success and on genuine unresolved misses, so failure_mode cannot distinguish "the model tried and failed" from "the agent never called the model". Token usage can: this backend always runs terminus-2, which reports real LiteLLM usage, so an unresolved trial with zero/missing input+output tokens means no model request ever completed — an infrastructure failure (in-container setup hang/death, tmux failure), not a model miss. CAVEAT: terminal-bench "installed agents" (openhands, claude-code, ...) hardcode 0 tokens even on success; if this backend ever honors agent_name for installed agents, this heuristic must be gated on the agent type.

Source code in src/openjarvis/evals/backends/terminalbench_native.py

def summarize_benchmark_results(
    results: Any,
    *,
    model: str,
    benchmark: str = "terminalbench-native",
) -> Tuple[RunSummary, List[Dict[str, str]]]:
    """Fold terminal-bench ``BenchmarkResults`` into a ``RunSummary``.

    Every trial lands in exactly one of three buckets:

    - resolved: ``is_resolved`` is True -> scored and counted correct.
    - model miss: not resolved, yet the model was contacted -> scored
      (part of the accuracy denominator) but not correct.
    - harness/infra failure: left out of the accuracy denominator,
      counted in ``RunSummary.errors`` and listed in the returned
      failure records.

    How "the model was never contacted" is detected: terminal-bench 0.2.18
    leaves ``failure_mode`` UNSET on clean successes and on genuine
    unresolved misses alike, so it cannot separate "the model tried and
    failed" from "the agent never called the model". Token usage can. This
    backend always runs terminus-2, which reports real LiteLLM usage, so an
    unresolved trial whose input+output tokens are zero or missing means no
    model request ever completed — an infrastructure failure (in-container
    setup hang/death, tmux failure) rather than a model miss. CAVEAT:
    terminal-bench "installed agents" (openhands, claude-code, ...)
    hardcode 0 tokens even on success; should this backend ever honor
    ``agent_name`` for installed agents, the heuristic must be gated on
    the agent type.

    Args:
        results: Object whose ``results`` attribute holds the trials; a
            missing or empty attribute is treated as zero trials.
        model: Model name recorded on the summary.
        benchmark: Benchmark name recorded on the summary.

    Returns:
        A ``(summary, failures)`` pair. Each failure record is a dict with
        the keys ``task_id``, ``failure_mode`` and ``reason``.
    """
    trial_list = list(getattr(results, "results", None) or [])

    infra_records: List[Dict[str, str]] = []
    n_scored = 0
    n_correct = 0

    for trial in trial_list:
        # Prefer the task id; fall back to the trial name, then a placeholder.
        identifier = getattr(trial, "task_id", None)
        if not identifier:
            identifier = getattr(trial, "trial_name", "unknown")

        # Only a literal True counts as resolved.
        resolved = getattr(trial, "is_resolved", None) is True

        # Normalise the failure mode (enum-like or plain value) to a
        # lowercase string, with "unset" standing in for anything falsy.
        raw_mode = getattr(trial, "failure_mode", None)
        mode_name = str(getattr(raw_mode, "value", raw_mode) or "unset").lower()

        tokens_in = getattr(trial, "total_input_tokens", None) or 0
        tokens_out = getattr(trial, "total_output_tokens", None) or 0
        never_reached_model = (not resolved) and (tokens_in + tokens_out) == 0

        if never_reached_model:
            why = "zero_model_requests"
        elif mode_name in _INFRA_FAILURE_MODES:
            why = mode_name
        else:
            # A real attempt by the model: it enters the denominator.
            n_scored += 1
            if resolved:
                n_correct += 1
            continue

        infra_records.append(
            {
                "task_id": str(identifier),
                "failure_mode": mode_name,
                "reason": why,
            }
        )

    # Guard the division: with nothing scored the accuracy is reported as 0.0.
    accuracy = n_correct / n_scored if n_scored else 0.0

    summary = RunSummary(
        benchmark=benchmark,
        category="agentic",
        backend="terminalbench-native",
        model=model,
        total_samples=len(trial_list),
        scored_samples=n_scored,
        correct=n_correct,
        accuracy=accuracy,
        errors=len(infra_records),
        mean_latency_seconds=0.0,
        total_cost_usd=0.0,
    )
    return summary, infra_records