table_gen

LaTeX table generator for the framework-comparison harness.

Reads the summary.json files produced by EvalRunner, builds a long-format polars DataFrame from them, and renders seven tables (T1–T7), each as both a tabular fragment (ready to paste into a paper) and a \documentclass{standalone} preview (renderable with latexmk).

Classes

TableGenError

Bases: Exception

Base for table-generation problems.

MixedCommitError

Bases: TableGenError

Raised when one (framework, model, benchmark) cell has multiple commits.

NoResultsError

Bases: TableGenError

Raised when no valid summary files were loaded.

ResultsFrame dataclass

ResultsFrame(df: DataFrame, unloadable_count: int = 0)

Long-format DataFrame of all loaded summary.json results.
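
"Long-format" here means each metric in a summary.json becomes its own row. A minimal sketch, with field names taken from the loader code below; the exact _SummarySchema definition is not shown on this page, so treat the payload shape as an assumption:

```python
# Hypothetical summary.json payload (field names inferred from load_results below;
# all values are illustrative).
payload = {
    "framework": "langgraph",
    "framework_commit": "abc1234",
    "model": "gpt-4o",
    "benchmark": "hotpotqa",
    "metrics": {
        "accuracy": {"mean": 0.81, "std": 0.02, "n": 5},
        "latency_s": {"mean": 3.40, "std": 0.60, "n": 5},
    },
}
# -> two rows in ResultsFrame.df, one per metric, each carrying the shared
#    framework / framework_commit / model / benchmark columns plus
#    metric_name, mean, std, n, and source_path.
```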

Functions

load_results

load_results(glob_pattern: str) -> ResultsFrame

Glob summary.json files, validate, return long-format ResultsFrame.

Source code in src/openjarvis/evals/comparison/table_gen.py
def load_results(glob_pattern: str) -> ResultsFrame:
    """Glob summary.json files, validate, return long-format ResultsFrame."""
    paths = glob.glob(glob_pattern, recursive=True)
    rows: List[Dict[str, object]] = []
    unloadable = 0

    for p in paths:
        try:
            raw = json.loads(Path(p).read_text())
            schema = _SummarySchema.model_validate(raw)
        except (json.JSONDecodeError, ValidationError) as e:
            LOGGER.warning("skipping unloadable summary at %s: %s", p, e)
            unloadable += 1
            continue
        for metric_name, stats in schema.metrics.items():
            rows.append(
                {
                    "framework": schema.framework,
                    "framework_commit": schema.framework_commit,
                    "model": schema.model,
                    "benchmark": schema.benchmark,
                    "metric_name": metric_name,
                    "mean": stats.mean,
                    "std": stats.std,
                    "n": stats.n,
                    "source_path": p,
                }
            )

    if rows:
        df = pl.DataFrame(rows)
    else:
        df = pl.DataFrame(
            schema={
                "framework": pl.Utf8,
                "framework_commit": pl.Utf8,
                "model": pl.Utf8,
                "benchmark": pl.Utf8,
                "metric_name": pl.Utf8,
                "mean": pl.Float64,
                "std": pl.Float64,
                "n": pl.Int64,
                "source_path": pl.Utf8,
            }
        )

    # Validate: each (framework, model, benchmark) cell must have one commit.
    if not df.is_empty():
        commit_groups = df.group_by(["framework", "model", "benchmark"]).agg(
            pl.col("framework_commit").unique().alias("commits")
        )
        for row in commit_groups.iter_rows(named=True):
            if len(row["commits"]) > 1:
                # Sort commits for deterministic error message; polars'
                # unique() does not guarantee insertion order across runs.
                commits = sorted(row["commits"])
                raise MixedCommitError(
                    f"{row['framework']}/{row['model']}/{row['benchmark']}: "
                    f"multiple commits {commits}"
                )

    return ResultsFrame(df=df, unloadable_count=unloadable)
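
A minimal usage sketch, assuming the package is importable; the directory layout and payload values are illustrative:

```python
import json
from pathlib import Path
from tempfile import TemporaryDirectory

from openjarvis.evals.comparison.table_gen import load_results

payload = {
    "framework": "langgraph",
    "framework_commit": "abc1234",
    "model": "gpt-4o",
    "benchmark": "hotpotqa",
    "metrics": {"accuracy": {"mean": 0.81, "std": 0.02, "n": 5}},
}

with TemporaryDirectory() as tmp:
    run_dir = Path(tmp) / "results" / "comparison" / "run_001"
    run_dir.mkdir(parents=True)
    (run_dir / "summary.json").write_text(json.dumps(payload))

    frame = load_results(f"{tmp}/results/comparison/**/summary.json")
    print(frame.df.shape)          # (1, 9): one metric row, nine columns
    print(frame.unloadable_count)  # 0
```

Writing a second summary with a different framework_commit for the same (framework, model, benchmark) triple into the same tree would make the call raise MixedCommitError instead of returning.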

main

main(results_glob: str, tables: str, output_dir: Optional[Path]) -> None

Build LaTeX tables from framework-comparison evaluation results.

Source code in src/openjarvis/evals/comparison/table_gen.py
@click.command()
@click.option(
    "--results-glob",
    required=True,
    help='Glob, e.g. "results/comparison/**/summary.json"',
)
@click.option(
    "--tables",
    default="T1,T2,T3,T4,T5,T6,T7",
    help="Comma-separated list of table names to build",
)
@click.option(
    "--output-dir",
    type=click.Path(file_okay=False, path_type=Path),
    default=None,
    help="Output directory (default: experiments/framework_comparison/tables/)",
)
def main(results_glob: str, tables: str, output_dir: Optional[Path]) -> None:
    """Build LaTeX tables from framework-comparison evaluation results."""
    out = output_dir or _table_gen_default_output_dir()
    out.mkdir(parents=True, exist_ok=True)
    (out / "preview").mkdir(parents=True, exist_ok=True)

    frame = load_results(results_glob)
    click.echo(
        f"Loaded {len(frame.df)} metric rows; {frame.unloadable_count} files skipped."
    )
    if frame.df.is_empty():
        raise click.ClickException(
            "No valid summary files matched --results-glob; refusing to emit empty "
            "tables."
        )

    requested = [t.strip() for t in tables.split(",") if t.strip()]
    for name in requested:
        if name not in _TABLE_BUILDERS:
            click.echo(f"  ! unknown table {name}; skipping")
            continue
        try:
            fragment, standalone = _TABLE_BUILDERS[name](frame)
        except Exception as e:
            click.echo(f"  ! {name} build failed: {e}")
            continue
        (out / f"{name}.tex").write_text(fragment + "\n")
        (out / "preview" / f"{name}_preview.tex").write_text(standalone)
        click.echo(f"  ✓ {name}{out}/{name}.tex (+ preview)")