GAIA scorer — normalized exact match with LLM fallback.
Adapted from IPW's gaia.py evaluation handler.
Classes
GAIAScorer
Bases: LLMJudgeScorer
GAIA evaluation: exact match with normalization + LLM fallback.
Source code in src/openjarvis/evals/core/scorer.py
def __init__(self, judge_backend: InferenceBackend, judge_model: str) -> None:
    """Keep the judge configuration on the instance.

    Args:
        judge_backend: Backend object stored as ``self._judge_backend``.
        judge_model: Model identifier stored as ``self._judge_model``.
    """
    # The two assignments are independent; nothing else is initialised here.
    self._judge_model = judge_model
    self._judge_backend = judge_backend
|
Functions
exact_match
exact_match(model_answer: str, ground_truth: str) -> bool
GAIA exact-match scorer with normalization for numbers, lists, and strings.
Source code in src/openjarvis/evals/scorers/gaia_exact.py
def exact_match(model_answer: str, ground_truth: str) -> bool:
    """Decide whether ``model_answer`` matches ``ground_truth`` GAIA-style.

    The comparison mode is picked from the shape of ``ground_truth``:

    * If ``_is_float(ground_truth)`` holds, the answer is passed through
      ``_normalize_number_str`` and compared with ``float(ground_truth)``.
    * If ``ground_truth`` contains ``,`` or ``;``, both sides are split with
      ``_split_string`` and compared element by element; a differing number
      of elements is an immediate mismatch.
    * Otherwise both sides go through ``_normalize_str`` and are compared.

    A ``None`` answer is treated as the literal string ``"None"``.

    Args:
        model_answer: Answer produced by the model (``None`` is tolerated).
        ground_truth: Reference answer.

    Returns:
        ``True`` when the answer matches the reference under the rules above.
    """
    answer = "None" if model_answer is None else model_answer

    # Numeric reference: compare as numbers.
    if _is_float(ground_truth):
        return _normalize_number_str(answer) == float(ground_truth)

    # Plain string reference: no list separators present.
    has_separator = "," in ground_truth or ";" in ground_truth
    if not has_separator:
        return _normalize_str(answer) == _normalize_str(ground_truth)

    # List reference: split both sides and compare position by position.
    expected_parts = _split_string(ground_truth)
    answer_parts = _split_string(answer)
    if len(expected_parts) != len(answer_parts):
        return False

    def _same(got, want):
        """Compare one answer element with its reference element."""
        if _is_float(want):
            return _normalize_number_str(got) == float(want)
        # Punctuation is kept for list elements.
        return _normalize_str(got, remove_punct=False) == _normalize_str(
            want, remove_punct=False
        )

    # Build the full list before reducing so every pair is evaluated,
    # rather than stopping at the first mismatch.
    outcomes = [_same(got, want) for got, want in zip(answer_parts, expected_parts)]
    return all(outcomes)
|