PinchBench grading helpers and scorer.
Provides transcript translation (OpenJarvis events → PinchBench format),
automated grading (exec of embedded Python), LLM judge grading, and
hybrid combination. Used by PinchBenchTaskEnv.run_tests() and the
standalone PinchBenchScorer.
Reference: https://github.com/pinchbench/skill
Classes
PinchBenchScorer
Bases: LLMJudgeScorer
PinchBench scorer for the non-agentic EvalRunner path.
Source code in src/openjarvis/evals/core/scorer.py
def __init__(self, judge_backend: InferenceBackend, judge_model: str) -> None:
    """Initialize the scorer with its LLM-judge configuration.

    Args:
        judge_backend: Backend used to invoke the judge model
            (presumably consumed by the inherited LLMJudgeScorer
            grading path — confirm against the base class).
        judge_model: Identifier of the model to use as judge.
    """
    # No validation here; the values are stored as-is for later grading calls.
    self._judge_backend = judge_backend
    self._judge_model = judge_model
Functions
events_to_transcript
events_to_transcript(events: List[Any]) -> List[Dict[str, Any]]
Build PinchBench-format transcript from raw EventRecorder events.
Pairs TOOL_CALL_START/END events to extract tool name, arguments, and
results. Called by run_tests() before QueryTrace exists.
Source code in src/openjarvis/evals/scorers/pinchbench.py
def events_to_transcript(events: List[Any]) -> List[Dict[str, Any]]:
    """Build a PinchBench-format transcript from raw EventRecorder events.

    Pairs TOOL_CALL_START/END events to extract the tool name, arguments,
    and result text. Called by run_tests() before a QueryTrace exists.

    Args:
        events: Recorded events; each must expose ``event_type`` and
            ``metadata`` attributes.

    Returns:
        Transcript entries in PinchBench message format: one assistant
        ``toolCall`` message per TOOL_CALL_START and one ``toolResult``
        message per TOOL_CALL_END. All other event types are ignored.
    """
    transcript: List[Dict[str, Any]] = []
    for event in events:
        etype = event.event_type
        # event_type may arrive as the EventType enum member or as its raw
        # string value, so compare against both forms. (The original code
        # carried a dead `if isinstance(etype, str): pass` branch here whose
        # comment promised normalization but did nothing — removed.)
        if etype in (EventType.TOOL_CALL_START, EventType.TOOL_CALL_START.value):
            tool_name = event.metadata.get("tool", "unknown")
            # Translate OpenJarvis tool names to PinchBench equivalents;
            # unmapped tools pass through unchanged.
            mapped = _TOOL_NAME_MAP.get(tool_name, tool_name)
            arguments = event.metadata.get("arguments") or {}
            transcript.append(
                {
                    "type": "message",
                    "message": {
                        "role": "assistant",
                        "content": [
                            {"type": "toolCall", "name": mapped, "params": arguments}
                        ],
                    },
                }
            )
        elif etype in (EventType.TOOL_CALL_END, EventType.TOOL_CALL_END.value):
            # Results are stringified; a missing result becomes "".
            result_text = str(event.metadata.get("result", ""))
            transcript.append(
                {
                    "type": "message",
                    "message": {
                        "role": "toolResult",
                        "content": [{"text": result_text}],
                    },
                }
            )
    return transcript
grade_pinchbench_task
grade_pinchbench_task(*, record: EvalRecord, transcript: List[Dict[str, Any]], workspace_path: str, judge_backend: Any = None, judge_model: str = 'anthropic/claude-opus-4-5') -> Dict[str, Any]
Top-level grading entry point. Routes by grading_type.
Returns {"score": float, "breakdown": dict, "notes": str}.
Source code in src/openjarvis/evals/scorers/pinchbench.py
def grade_pinchbench_task(
    *,
    record: EvalRecord,
    transcript: List[Dict[str, Any]],
    workspace_path: str,
    judge_backend: Any = None,
    judge_model: str = "anthropic/claude-opus-4-5",
) -> Dict[str, Any]:
    """Top-level grading entry point that dispatches on the task's grading type.

    The grading type is read from ``record.metadata`` and defaults to
    ``"automated"`` when absent.

    Returns:
        A dict of the shape {"score": float, "breakdown": dict, "notes": str}.
    """
    grading_type = record.metadata.get("grading_type", "automated")

    # Automated grading needs no judge model — dispatch early.
    if grading_type == "automated":
        return _grade_automated(record, transcript, workspace_path)

    # Both judge-backed modes share the same call signature; pick the grader.
    if grading_type in ("llm_judge", "hybrid"):
        grader = _grade_llm_judge if grading_type == "llm_judge" else _grade_hybrid
        return grader(record, transcript, workspace_path, judge_backend, judge_model)

    # Unrecognized grading type: fail gracefully with a zero score.
    return {
        "score": 0.0,
        "breakdown": {},
        "notes": f"Unknown grading type: {grading_type}",
    }