TauBench V2 dataset provider — multi-turn customer service benchmark.
Wraps the tau2-bench framework for evaluation within OpenJarvis.
Supports airline, retail, and telecom domains.
Reference: https://github.com/sierra-research/tau2-bench
Classes
TauBenchDataset
TauBenchDataset(domains: Optional[List[str]] = None)
Bases: DatasetProvider
TauBench V2 multi-turn customer service benchmark.
Wraps tau2-bench's task loading and evaluation infrastructure.
Each EvalRecord represents a single customer service scenario.
Source code in src/openjarvis/evals/datasets/taubench.py
def __init__(
    self,
    domains: Optional[List[str]] = None,
) -> None:
    """Prepare the benchmark for *domains* (every known domain by default)."""
    # An empty/None domain selection means "evaluate all supported domains".
    self._domains = list(DOMAINS) if not domains else domains
    # Scenario records; populated by the loading machinery, not here.
    self._records: "List[EvalRecord]" = []

    # Engine settings — placeholders until set_engine_config() injects them.
    self._engine_key: Optional[str] = None
    self._model: Optional[str] = None
    self._temperature: float = 0.7
    self._max_tokens: int = 4096
    self._user_model: Optional[str] = None

    # pass^k: best of k trials per task. Default 3, override via env var
    # OPENJARVIS_TAUBENCH_TRIALS for faster runs (e.g. =1 for 3x speedup).
    raw_trials = os.environ.get("OPENJARVIS_TAUBENCH_TRIALS", "3")
    self._num_trials: int = int(raw_trials)

    self._telemetry: bool = False
    self._gpu_metrics: bool = False
Functions
set_engine_config
set_engine_config(engine_key: Optional[str] = None, model: Optional[str] = None, temperature: float = 0.7, max_tokens: int = 4096, user_model: Optional[str] = None, num_trials: Optional[int] = None, telemetry: bool = False, gpu_metrics: bool = False) -> None
Inject engine configuration for the agent. Called by CLI.
Source code in src/openjarvis/evals/datasets/taubench.py
def set_engine_config(
    self,
    engine_key: Optional[str] = None,
    model: Optional[str] = None,
    temperature: float = 0.7,
    max_tokens: int = 4096,
    user_model: Optional[str] = None,
    num_trials: Optional[int] = None,
    telemetry: bool = False,
    gpu_metrics: bool = False,
) -> None:
    """Inject engine configuration for the agent. Called by CLI."""
    # These settings only overwrite existing state when explicitly given;
    # passing None leaves the current value untouched.
    optional_settings = (
        ("_engine_key", engine_key),
        ("_model", model),
        ("_user_model", user_model),
        ("_num_trials", num_trials),
    )
    for attr, value in optional_settings:
        if value is not None:
            setattr(self, attr, value)
    # The remaining settings always take the caller's (or default) value.
    self._temperature = temperature
    self._max_tokens = max_tokens
    self._telemetry = telemetry
    self._gpu_metrics = gpu_metrics
create_task_env
Create a TauBench task environment for evaluation.
Source code in src/openjarvis/evals/datasets/taubench.py
def create_task_env(self, record: EvalRecord):
    """Build a TauBenchTaskEnv for *record* from the stored engine settings."""
    # NOTE(review): imported locally rather than at module top — presumably
    # to defer/avoid a module-level dependency; confirm before hoisting.
    from openjarvis.evals.execution.taubench_env import TauBenchTaskEnv

    # Forward every piece of engine configuration gathered so far.
    env_kwargs = dict(
        engine_key=self._engine_key,
        model=self._model,
        temperature=self._temperature,
        max_tokens=self._max_tokens,
        user_model=self._user_model,
        num_trials=self._num_trials,
        telemetry=self._telemetry,
        gpu_metrics=self._gpu_metrics,
    )
    return TauBenchTaskEnv(record, **env_kwargs)