TerminalBenchNativeBackend(model: str = 'openai/default', api_base: str = 'http://localhost:8000/v1', temperature: float = 0.2, agent_name: str = 'naive', output_dir: str = 'results/terminalbench/', max_samples: Optional[int] = None, dataset_name: str = 'terminal-bench-core', dataset_version: str = '0.1.1', system_prompt: str = '', max_tokens: int = 16384, n_concurrent: int = 4)
Bases: InferenceBackend
Runs terminal-bench tasks natively via the terminal-bench Harness, with Docker execution.
Uses terminal-bench's own agent (calling the model through LiteLLM), Docker containers
to execute each task, and the benchmark's built-in test scripts for scoring. This gives
real agentic evaluation rather than text-only grading.
Source code in src/openjarvis/evals/backends/terminalbench_native.py
def __init__(
    self,
    model: str = "openai/default",
    api_base: str = "http://localhost:8000/v1",
    temperature: float = 0.2,
    agent_name: str = "naive",
    output_dir: str = "results/terminalbench/",
    max_samples: Optional[int] = None,
    dataset_name: str = "terminal-bench-core",
    dataset_version: str = "0.1.1",
    system_prompt: str = "",
    max_tokens: int = 16384,
    n_concurrent: int = 4,
) -> None:
    # Fail fast if the optional terminal-bench dependency is missing.
    if not _HAS_TB:
        raise ImportError("terminal-bench is required: pip install terminal-bench")
    self._model = model
    self._api_base = api_base
    self._temperature = temperature
    self._agent_name = agent_name
    self._output_dir = Path(output_dir)
    self._max_samples = max_samples
    self._dataset_name = dataset_name
    self._dataset_version = dataset_version
    self._system_prompt = system_prompt
    self._max_tokens = max_tokens
    self._n_concurrent = n_concurrent
    # Populated by run_harness().
    self._results: Optional[BenchmarkResults] = None
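A minimal sketch of constructing the backend, assuming an OpenAI-compatible server (e.g. vLLM) is already serving the model at the default api_base; the max_samples and n_concurrent values below are illustrative, not recommendations:

from openjarvis.evals.backends.terminalbench_native import TerminalBenchNativeBackend

backend = TerminalBenchNativeBackend(
    model="openai/default",               # LiteLLM model identifier
    api_base="http://localhost:8000/v1",  # OpenAI-compatible endpoint
    agent_name="naive",                   # terminal-bench's built-in agent
    max_samples=5,                        # illustrative: smoke-test on 5 tasks
    n_concurrent=2,                       # illustrative: parallel Docker trials
)

Docker must be available on the host, since each task runs in its own container.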
Functions
run_harness
run_harness(run_id: str) -> BenchmarkResults
Run the full terminal-bench harness and return results.
Source code in src/openjarvis/evals/backends/terminalbench_native.py
def run_harness(self, run_id: str) -> BenchmarkResults:
    """Run the full terminal-bench harness and return results."""
    # Each run writes its artifacts under <output_dir>/<run_id>/.
    output_path = self._output_dir / run_id
    output_path.mkdir(parents=True, exist_ok=True)
    harness_kwargs: Dict[str, Any] = {
        "output_path": output_path,
        "run_id": run_id,
        "dataset_name": self._dataset_name,
        "dataset_version": self._dataset_version,
        "model_name": self._model,
        "n_concurrent_trials": self._n_concurrent,
        "cleanup": True,
    }
    # Use a built-in terminal-bench agent (the "naive" agent calls the model via LiteLLM).
    from terminal_bench.agents.agent_name import AgentName

    harness_kwargs["agent_name"] = AgentName(self._agent_name)
    harness_kwargs["agent_kwargs"] = {
        "llm": LiteLLM(
            model_name=self._model,
            temperature=self._temperature,
            api_base=self._api_base,
        ),
    }
    # Optionally evaluate only a subset of the tasks.
    if self._max_samples is not None:
        harness_kwargs["n_tasks"] = self._max_samples
    harness = Harness(**harness_kwargs)
    self._results = harness.run()
    return self._results
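With a backend in hand, a single call runs the whole benchmark. A short usage sketch; the run_id here is hypothetical and simply names the output subdirectory:

results = backend.run_harness(run_id="smoke-test-01")  # hypothetical run id
# Logs and per-task artifacts land under results/terminalbench/smoke-test-01/;
# `results` is terminal-bench's own BenchmarkResults object (see that library's
# documentation for its per-task fields).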