SWE-bench dataset (princeton-nlp/SWE-bench_Verified).
Agentic coding benchmark — patches for real-world GitHub issues.
Classes
SWEBenchDataset
SWEBenchDataset(variant: str = 'verified_mini')
Bases: DatasetProvider
SWE-bench agentic coding benchmark.
Source code in src/openjarvis/evals/datasets/swebench.py
def __init__(self, variant: str = "verified_mini") -> None:
    """Select the SWE-bench variant this provider will serve.

    Args:
        variant: Key into ``_HF_PATHS`` naming the dataset variant.
            Defaults to ``"verified_mini"``.

    Raises:
        ValueError: If ``variant`` is not a key of ``_HF_PATHS``.
    """
    # NOTE: the default ("verified_mini") is the 50-task mini variant; the
    # full 500-task set is "verified" (princeton-nlp/SWE-bench_Verified).
    # A subset JSON that references task_ids from the full set (e.g.
    # subsets/swebench_*_n100*) needs variant="verified" passed explicitly,
    # otherwise the 450 tasks absent from the mini set are silently dropped.
    # See agents/hybrid/runner.py:_load_swebench_tasks.
    if variant not in _HF_PATHS:
        # List the accepted keys so a typo is easy to spot and correct.
        message = (
            f"Unknown SWE-bench variant {variant!r}; "
            f"choose from {sorted(_HF_PATHS)}"
        )
        raise ValueError(message)

    self._variant = variant
    self._hf_path = _HF_PATHS[variant]
    # Starts empty; nothing is loaded at construction time.
    self._records: List[EvalRecord] = []
|