def run_tests(self) -> tuple[bool, dict[str, Any]]:
    """Copy test scripts into the container, execute them, and parse results.

    The task's run-tests script (and the task's test directory, when it
    exists) is copied into ``DockerComposeManager.CONTAINER_TEST_DIR``. The
    script is then run with ``bash`` in a newly created terminal session,
    and the captured pane output is handed to the parser selected by
    ``task.parser_name``.

    Returns:
        A ``(is_resolved, results)`` tuple. ``results`` may contain:

        * ``"error"`` -- ``"terminal_not_running"``, ``"test_timeout"``, or
          the string form of an unexpected exception.
        * ``"test_output"`` -- the first 10000 characters of the pane.
        * ``"parser_results"`` -- test name mapped to ``status.value``.
        * ``"parse_error"`` -- string form of an exception raised while
          parsing.
        * ``"is_resolved"`` -- only set when the test command completed
          (with or without a parse error).

    Side effects:
        ``self._metadata["is_resolved"]`` and
        ``self._metadata["test_results"]`` are overwritten on every exit
        path. Exceptions are logged and reported via ``results`` rather
        than propagated.
    """
    from terminal_bench.parsers.base_parser import UnitTestStatus
    from terminal_bench.parsers.parser_factory import ParserFactory
    from terminal_bench.terminal.docker_compose_manager import (
        DockerComposeManager,
    )

    task = self._metadata["task"]
    task_paths = self._metadata["task_paths"]
    terminal = self._terminal
    results: dict[str, Any] = {}

    def finish(resolved: bool) -> tuple[bool, dict[str, Any]]:
        """Record the outcome in the metadata and build the return value."""
        self._metadata["is_resolved"] = resolved
        self._metadata["test_results"] = results
        return resolved, results

    if terminal is None:
        results["error"] = "terminal_not_running"
        return finish(False)

    try:
        paths_to_copy = [task_paths.run_tests_path]
        if task_paths.test_dir.exists():
            paths_to_copy.append(task_paths.test_dir)
        terminal.copy_to_container(
            paths=paths_to_copy,
            container_dir=str(DockerComposeManager.CONTAINER_TEST_DIR),
        )

        # The session name and user depend on whether the task asks for
        # its tests to run "in the same shell".
        if task.run_tests_in_same_shell:
            test_session = terminal.create_session(
                "agent-tests",
                is_active_stream=False,
                as_configured_user=True,
            )
        else:
            test_session = terminal.create_session(
                "tests", is_active_stream=False, as_configured_user=False
            )

        test_timeout = task.max_test_timeout_sec
        test_script_path = (
            DockerComposeManager.CONTAINER_TEST_DIR
            / task_paths.run_tests_path.name
        )

        try:
            test_session.send_keys(
                ["bash ", str(test_script_path), "Enter"],
                block=True,
                max_timeout_sec=test_timeout,
            )
        except TimeoutError:
            LOGGER.warning(
                "Test command timed out after %.0fs", test_timeout
            )
            results["error"] = "test_timeout"
            return finish(False)

        post_test_pane = test_session.capture_pane(capture_entire=True)
        results["test_output"] = post_test_pane[:10000]

        parser = ParserFactory.get_parser(task.parser_name)
        try:
            parser_results = parser.parse(post_test_pane)
            results["parser_results"] = {
                name: status.value
                for name, status in parser_results.items()
            }
            if not parser_results:
                # all() over an empty mapping is vacuously True, which
                # would mark a run with zero parsed tests as resolved.
                LOGGER.warning(
                    "Parser returned no test results; "
                    "treating task as unresolved"
                )
            is_resolved = bool(parser_results) and all(
                status == UnitTestStatus.PASSED
                for status in parser_results.values()
            )
        except Exception as exc:
            # A parser failure is reported, not raised: the raw output is
            # already stored under "test_output" for inspection.
            LOGGER.warning("Parser failed: %s", exc)
            results["parse_error"] = str(exc)
            is_resolved = False

        results["is_resolved"] = is_resolved
        return finish(is_resolved)
    except Exception as exc:
        # Boundary handler: any failure while copying, creating the
        # session, or capturing output marks the task as unresolved.
        LOGGER.exception("Test execution failed")
        results["error"] = str(exc)
        return finish(False)