Skip to content

pinchbench

pinchbench

PinchBench grading helpers and scorer.

Provides transcript translation (OpenJarvis events → PinchBench format), automated grading (exec of embedded Python), LLM judge grading, and hybrid combination. Used by PinchBenchTaskEnv.run_tests() and the standalone PinchBenchScorer.

Reference: https://github.com/pinchbench/skill

Classes

PinchBenchScorer

PinchBenchScorer(judge_backend: InferenceBackend, judge_model: str)

Bases: LLMJudgeScorer

PinchBench scorer for the non-agentic EvalRunner path.

Source code in src/openjarvis/evals/core/scorer.py
def __init__(self, judge_backend: InferenceBackend, judge_model: str) -> None:
    """Store the backend and model name used for LLM-judge grading."""
    self._judge_model = judge_model
    self._judge_backend = judge_backend

Functions

events_to_transcript

events_to_transcript(events: List[Any]) -> List[Dict[str, Any]]

Build PinchBench-format transcript from raw EventRecorder events.

Pairs TOOL_CALL_START/END events to extract tool name, arguments, and results. Called by run_tests() before QueryTrace exists.

Source code in src/openjarvis/evals/scorers/pinchbench.py
def events_to_transcript(events: List[Any]) -> List[Dict[str, Any]]:
    """Build PinchBench-format transcript from raw EventRecorder events.

    Pairs TOOL_CALL_START/END events to extract tool name, arguments, and
    results. Called by run_tests() before QueryTrace exists.

    Args:
        events: EventRecorder events; each carries ``event_type`` (an
            EventType member or its raw string value) and a ``metadata`` dict.

    Returns:
        PinchBench ``{"type": "message", ...}`` transcript entries, in event
        order. Events that are not tool-call start/end markers are skipped.
    """
    transcript: List[Dict[str, Any]] = []

    # Events may carry either the EventType enum member or its string value;
    # accept both by matching against (member, member.value). This replaces a
    # dead `if isinstance(etype, str): pass` stub that normalized nothing.
    start_types = (EventType.TOOL_CALL_START, EventType.TOOL_CALL_START.value)
    end_types = (EventType.TOOL_CALL_END, EventType.TOOL_CALL_END.value)

    for event in events:
        etype = event.event_type
        if etype in start_types:
            tool_name = event.metadata.get("tool", "unknown")
            # Translate OpenJarvis tool names to their PinchBench
            # equivalents; unmapped names pass through unchanged.
            mapped = _TOOL_NAME_MAP.get(tool_name, tool_name)
            arguments = event.metadata.get("arguments") or {}
            transcript.append(
                {
                    "type": "message",
                    "message": {
                        "role": "assistant",
                        "content": [
                            {"type": "toolCall", "name": mapped, "params": arguments}
                        ],
                    },
                }
            )
        elif etype in end_types:
            result_text = str(event.metadata.get("result", ""))
            transcript.append(
                {
                    "type": "message",
                    "message": {
                        "role": "toolResult",
                        "content": [{"text": result_text}],
                    },
                }
            )

    return transcript

grade_pinchbench_task

grade_pinchbench_task(*, record: EvalRecord, transcript: List[Dict[str, Any]], workspace_path: str, judge_backend: Any = None, judge_model: str = 'anthropic/claude-opus-4-5') -> Dict[str, Any]

Top-level grading entry point. Routes by grading_type.

Returns {"score": float, "breakdown": dict, "notes": str}.

Source code in src/openjarvis/evals/scorers/pinchbench.py
def grade_pinchbench_task(
    *,
    record: EvalRecord,
    transcript: List[Dict[str, Any]],
    workspace_path: str,
    judge_backend: Any = None,
    judge_model: str = "anthropic/claude-opus-4-5",
) -> Dict[str, Any]:
    """Top-level grading entry point. Routes by grading_type.

    Dispatches on ``record.metadata["grading_type"]`` (defaulting to
    ``"automated"``) to the matching private grader.

    Returns {"score": float, "breakdown": dict, "notes": str}.
    """
    mode = record.metadata.get("grading_type", "automated")

    # Guard-style dispatch: each known grading mode returns immediately.
    if mode == "automated":
        return _grade_automated(record, transcript, workspace_path)
    if mode == "llm_judge":
        return _grade_llm_judge(
            record, transcript, workspace_path, judge_backend, judge_model
        )
    if mode == "hybrid":
        return _grade_hybrid(
            record, transcript, workspace_path, judge_backend, judge_model
        )

    # Unrecognized mode: zero score with a diagnostic note.
    return {
        "score": 0.0,
        "breakdown": {},
        "notes": f"Unknown grading type: {mode}",
    }