gaia_exact

gaia_exact ¶

GAIA scorer — normalized exact match with LLM fallback.

Adapted from IPW's gaia.py evaluation handler.

Classes¶

GAIAScorer ¶

GAIAScorer(judge_backend: InferenceBackend, judge_model: str)

Bases: LLMJudgeScorer

GAIA evaluation: exact match with normalization + LLM fallback.

Source code in src/openjarvis/evals/core/scorer.py

def __init__(self, judge_backend: InferenceBackend, judge_model: str) -> None:
    """Remember which judge backend and judge model this scorer was given.

    Args:
        judge_backend: Backend object stored as ``self._judge_backend``.
        judge_model: Model identifier stored as ``self._judge_model``.
    """
    # The two attributes are independent; nothing else happens at construction.
    self._judge_model = judge_model
    self._judge_backend = judge_backend

Functions¶

exact_match ¶

exact_match(model_answer: str, ground_truth: str) -> bool

GAIA exact-match scorer with normalization for numbers, lists, and strings.

Source code in src/openjarvis/evals/scorers/gaia_exact.py

def exact_match(model_answer: str, ground_truth: str) -> bool:
    """Decide whether a model answer matches the GAIA ground truth.

    The comparison strategy is chosen from the shape of ``ground_truth``:

    * If it parses as a float, the answer is passed through
      ``_normalize_number_str`` and compared with ``float(ground_truth)``.
    * If it contains ``","`` or ``";"``, both sides are split with
      ``_split_string`` and compared element by element (numeric elements
      numerically, the rest via ``_normalize_str`` with punctuation kept).
      A differing element count is an immediate mismatch.
    * Otherwise both sides are passed through ``_normalize_str`` and compared.

    Args:
        model_answer: Answer produced by the model; ``None`` is treated as
            the literal string ``"None"``.
        ground_truth: Reference answer.

    Returns:
        True when the normalized answer equals the normalized ground truth.
    """
    answer = "None" if model_answer is None else model_answer

    # Numeric ground truth: compare as numbers.
    if _is_float(ground_truth):
        answer_value = _normalize_number_str(answer)
        return answer_value == float(ground_truth)

    # No list separator present: plain normalized string comparison.
    if "," not in ground_truth and ";" not in ground_truth:
        return _normalize_str(answer) == _normalize_str(ground_truth)

    # List-valued ground truth: element counts must agree before comparing.
    expected_items = _split_string(ground_truth)
    answer_items = _split_string(answer)
    if len(answer_items) != len(expected_items):
        return False

    def _same(got: str, want: str) -> bool:
        # Each element picks numeric vs. string comparison on its own.
        if _is_float(want):
            return _normalize_number_str(got) == float(want)
        return _normalize_str(got, remove_punct=False) == _normalize_str(
            want, remove_punct=False
        )

    # Build the full list first so every pair is evaluated (no short-circuit).
    verdicts = [_same(got, want) for got, want in zip(answer_items, expected_items)]
    return all(verdicts)