Skip to content

terminalbench_env

terminalbench_env

TerminalBench task environment — per-task Docker lifecycle + test execution.

Classes

TerminalBenchTaskEnv

TerminalBenchTaskEnv(metadata: MutableMapping[str, Any])

Per-task Docker environment for TerminalBench.

Context manager that spins up a Docker container, creates a tmux session, and runs test scripts after the agent finishes.

Source code in src/openjarvis/evals/execution/terminalbench_env.py
def __init__(self, metadata: MutableMapping[str, Any]) -> None:
    """Store the task metadata and start with every runtime handle unset.

    Args:
        metadata: Mutable per-task mapping. It is kept by reference, not
            copied.
    """
    # Runtime handles: all begin as None.
    self._logs_tmpdir: Any = None
    self._terminal_cm: Any = None
    self._terminal: Any = None
    # Same mapping object the caller passed in.
    self._metadata = metadata
Functions
run_tests
run_tests() -> tuple[bool, dict[str, Any]]

Copy test scripts into container, execute, parse results.

Source code in src/openjarvis/evals/execution/terminalbench_env.py
def run_tests(self) -> tuple[bool, dict[str, Any]]:
    """Copy test scripts into container, execute, parse results.

    On every return path the outcome is also written to the task metadata
    under ``"is_resolved"`` and ``"test_results"``.

    Returns:
        ``(is_resolved, results)``. ``results`` contains ``"error"`` when
        the terminal is not running (``"terminal_not_running"``), the test
        command times out (``"test_timeout"``), or an unexpected exception
        is raised during execution (its ``str()``). Otherwise it contains
        ``"test_output"`` (the first 10000 characters of the captured
        pane), ``"is_resolved"``, and either ``"parser_results"`` or
        ``"parse_error"``.

    Raises:
        KeyError: If the terminal is running but the metadata has no
            ``"task"`` or ``"task_paths"`` entry (these lookups happen
            outside the ``try`` block).
    """
    results: dict[str, Any] = {}

    def _record(resolved: bool) -> tuple[bool, dict[str, Any]]:
        # Mirror the outcome onto the task metadata, then hand it back.
        self._metadata["is_resolved"] = resolved
        self._metadata["test_results"] = results
        return resolved, results

    # Check for a missing terminal before touching anything else, so this
    # path cannot fail on the lazy terminal_bench imports or on metadata
    # keys that may be absent when the container was never started.
    terminal = self._terminal
    if terminal is None:
        results["error"] = "terminal_not_running"
        return _record(False)

    from terminal_bench.parsers.base_parser import UnitTestStatus
    from terminal_bench.parsers.parser_factory import ParserFactory
    from terminal_bench.terminal.docker_compose_manager import (
        DockerComposeManager,
    )

    task = self._metadata["task"]
    task_paths = self._metadata["task_paths"]

    try:
        # The run script is always copied; the test directory only if present.
        paths_to_copy = [task_paths.run_tests_path]
        if task_paths.test_dir.exists():
            paths_to_copy.append(task_paths.test_dir)

        terminal.copy_to_container(
            paths=paths_to_copy,
            container_dir=str(DockerComposeManager.CONTAINER_TEST_DIR),
        )

        if task.run_tests_in_same_shell:
            test_session = terminal.create_session(
                "agent-tests",
                is_active_stream=False,
                as_configured_user=True,
            )
        else:
            test_session = terminal.create_session(
                "tests", is_active_stream=False, as_configured_user=False
            )

        test_timeout = task.max_test_timeout_sec
        test_script_path = (
            DockerComposeManager.CONTAINER_TEST_DIR
            / task_paths.run_tests_path.name
        )

        try:
            test_session.send_keys(
                ["bash ", str(test_script_path), "Enter"],
                block=True,
                max_timeout_sec=test_timeout,
            )
        except TimeoutError:
            LOGGER.warning(
                "Test command timed out after %.0fs", test_timeout
            )
            results["error"] = "test_timeout"
            return _record(False)

        post_test_pane = test_session.capture_pane(capture_entire=True)
        # Only a bounded prefix is stored; the parser still sees the full pane.
        results["test_output"] = post_test_pane[:10000]

        parser = ParserFactory.get_parser(task.parser_name)
        try:
            parser_results = parser.parse(post_test_pane)
            results["parser_results"] = {
                name: status.value
                for name, status in parser_results.items()
            }
            # NOTE(review): all() over an empty mapping is True, so a parse
            # that yields zero tests counts as resolved - confirm intended.
            is_resolved = all(
                status == UnitTestStatus.PASSED
                for status in parser_results.values()
            )
        except Exception as exc:
            # A parser failure is reported as unresolved rather than raised.
            LOGGER.warning("Parser failed: %s", exc)
            results["parse_error"] = str(exc)
            is_resolved = False

        results["is_resolved"] = is_resolved
        return _record(is_resolved)

    except Exception as exc:
        # Boundary handler: any failure while driving the container is
        # logged with its traceback and reported as an unresolved task.
        LOGGER.exception("Test execution failed")
        results["error"] = str(exc)
        return _record(False)