Per-task workspace environment for PinchBench.
Context manager that creates an isolated workspace directory,
populates fixture files from the task definition, and runs
grading after agent execution via run_tests().
Source code in src/openjarvis/evals/execution/pinchbench_env.py
def __init__(
    self,
    record: EvalRecord,
    judge_backend: Any = None,
    judge_model: str = "anthropic/claude-opus-4-5",
) -> None:
    """Store the task record and judge configuration for one PinchBench task.

    Args:
        record: The eval record for the task. ``run_tests`` reads
            ``workspace_path`` from its ``metadata`` and writes the
            grading outcome back into the same mapping.
        judge_backend: Forwarded unchanged to the grader as
            ``judge_backend``; ``None`` by default.
        judge_model: Model identifier forwarded to the grader as
            ``judge_model``.
    """
    self._record = record
    self._judge_backend = judge_backend
    self._judge_model = judge_model

    # Workspace state starts unset here.
    # NOTE(review): presumably populated when the context manager is
    # entered; that code is not visible in this section -- confirm.
    self._workspace: Optional[Path] = None
    # NOTE(review): presumably marks whether this env created the workspace
    # (and so is responsible for removing it) -- confirm.
    self._owns_workspace: bool = True
    # NOTE(review): presumably the working directory to restore on exit --
    # confirm.
    self._original_cwd: Optional[str] = None

    # Injected later through set_event_recorder(); run_tests() treats a
    # missing recorder as "no events".
    self._event_recorder: Any = None
Functions
set_event_recorder
set_event_recorder(recorder: Any) -> None
Receive the EventRecorder from AgenticRunner for transcript building.
Source code in src/openjarvis/evals/execution/pinchbench_env.py
def set_event_recorder(self, recorder: Any) -> None:
    """Receive the EventRecorder from AgenticRunner for transcript building.

    The recorder is only stored here; ``run_tests`` later calls its
    ``get_events()`` method to obtain the raw events.

    Args:
        recorder: Object exposing ``get_events()``. A falsy value makes
            ``run_tests`` grade against an empty event list.
    """
    self._event_recorder = recorder
run_tests
run_tests() -> None
Grade the agent's work using PinchBench grading logic.
Called by AgenticRunner after agent execution, before QueryTrace
is constructed. Builds transcript from raw EventRecorder events.
Source code in src/openjarvis/evals/execution/pinchbench_env.py
def run_tests(self) -> None:
    """Grade the agent's work using PinchBench grading logic.

    Called by AgenticRunner after agent execution, before QueryTrace
    is constructed. Builds the transcript from raw EventRecorder events,
    runs the grader, and writes the outcome into ``self._record.metadata``
    under the keys ``is_resolved``, ``reward``, ``pinchbench_score``,
    ``pinchbench_breakdown`` and ``pinchbench_notes``.

    Grading errors are not propagated: any exception raised by the grader
    is logged with its traceback and recorded as a score of 0.0 with a
    "Grading error" note.
    """
    # Function-scope import, kept as in the original.
    # NOTE(review): presumably deferred to avoid an import cycle or a
    # heavy import at module load -- confirm.
    from openjarvis.evals.scorers.pinchbench import (
        events_to_transcript,
        grade_pinchbench_task,
    )

    metadata = self._record.metadata
    workspace_path = metadata.get("workspace_path", "")

    # No recorder attached means there is nothing to replay.
    recorder = self._event_recorder
    events = recorder.get_events() if recorder else []
    transcript = events_to_transcript(events)

    try:
        result = grade_pinchbench_task(
            record=self._record,
            transcript=transcript,
            workspace_path=workspace_path,
            judge_backend=self._judge_backend,
            judge_model=self._judge_model,
        )
    except Exception as exc:
        # Deliberate catch-all boundary: a grader failure must yield a
        # zero score rather than abort the run. LOGGER.exception keeps the
        # traceback so the failure remains diagnosable.
        LOGGER.exception(
            "Grading failed for %s: %s",
            self._record.record_id,
            exc,
        )
        result = {"score": 0.0, "breakdown": {}, "notes": f"Grading error: {exc}"}

    score = result["score"]
    # A task counts as resolved at a score of 0.5 or above.
    resolved = score >= 0.5

    metadata["is_resolved"] = resolved
    metadata["reward"] = score
    metadata["pinchbench_score"] = score
    # Optional fields are read leniently so a sparse grader result cannot
    # raise after grading itself has succeeded.
    metadata["pinchbench_breakdown"] = result.get("breakdown", {})
    metadata["pinchbench_notes"] = result.get("notes", "")

    LOGGER.info(
        "PinchBench grading for %s: score=%.2f resolved=%s",
        self._record.record_id,
        score,
        resolved,
    )