Skip to content

workarena_env

workarena_env

WorkArena task environment — per-task BrowserGym lifecycle + validation.

Wraps BrowserGym's BrowserEnv to provide per-task browser/ServiceNow setup, observation access, action stepping, and native validate() scoring against the live ServiceNow instance.

Classes

WorkArenaTaskEnv

WorkArenaTaskEnv(metadata: MutableMapping[str, Any])

Per-task BrowserGym environment for WorkArena.

Context manager that creates a BrowserEnv, resets the task against the ServiceNow instance, and exposes observation/action/validate methods.

After the agent finishes, run_tests() calls the task's native validate() to determine pass/fail from the actual ServiceNow state.

Source code in src/openjarvis/evals/execution/workarena_env.py
def __init__(self, metadata: MutableMapping[str, Any]) -> None:
    """Initialize per-task bookkeeping; the BrowserEnv itself is created later.

    Args:
        metadata: Mutable task metadata shared with the caller. Results
            (latest observation text, reward, test outcomes) are written
            back into this mapping by ``step()`` and ``run_tests()``.
    """
    self._metadata = metadata
    # Underlying BrowserGym environment; None until set up (context manager).
    self._env: Any = None
    # Most recent raw observation dict; None before the first reset/step.
    self._obs: Optional[Dict[str, Any]] = None
    self._goal: str = ""
    # Chat transcript mirrored from the last observation's "chat_messages".
    self._chat_messages: list = []
    # Episode-progress flags updated on every step().
    self._done: bool = False
    self._last_reward: float = 0.0
    self._step_count: int = 0
    # Raw LLM responses and per-turn wall-clock seconds, filled by run_agent_loop().
    self.all_responses: list[str] = []
    self.turn_wall_clocks: list[float] = []
Functions
get_observation_text
get_observation_text() -> str

Return the current observation formatted as text for the agent.

Source code in src/openjarvis/evals/execution/workarena_env.py
def get_observation_text(self) -> str:
    """Render the most recent observation as agent-facing text ("" if none yet)."""
    current = self._obs
    return "" if current is None else self._format_observation(current)
step
step(action: str) -> Tuple[str, float, bool, Dict[str, Any]]

Execute a BrowserGym action and return (obs_text, reward, done, info).

Actions use BrowserGym's high-level action format, e.g. `click("bid_123")`, `fill("bid_456", "hello world")`, `scroll(0, 300)`, or `send_msg_to_user("The answer is 42")`.

Source code in src/openjarvis/evals/execution/workarena_env.py
def step(self, action: str) -> Tuple[str, float, bool, Dict[str, Any]]:
    """Execute one BrowserGym action; return (obs_text, reward, done, info).

    Actions use BrowserGym's high-level action format, for example
    ``click("bid_123")``, ``fill("bid_456", "hello world")``,
    ``scroll(0, 300)``, or ``send_msg_to_user("The answer is 42")``.
    """
    if self._env is None:
        raise RuntimeError("WorkArena environment not initialized")
    if self._done:
        # Finished episodes short-circuit without touching the browser.
        return "", 0.0, True, {"message": "Episode already finished"}

    observation, reward, terminated, truncated, info = self._env.step(action)

    # Update per-step bookkeeping before rendering the observation.
    self._obs = observation
    self._step_count += 1
    self._last_reward = reward
    self._chat_messages = list(observation.get("chat_messages", []))
    self._done = terminated or truncated

    rendered = self._format_observation(observation)
    # Expose the latest observation text to the surrounding harness.
    self._metadata["workarena_obs"] = rendered

    return rendered, reward, self._done, info
send_chat_message
send_chat_message(message: str) -> None

Send a message from the assistant to the chat.

Source code in src/openjarvis/evals/execution/workarena_env.py
def send_chat_message(self, message: str) -> None:
    """Post *message* to the chat as the assistant (no-op when no chat exists)."""
    env = self._env
    if env is None or not hasattr(env, "chat"):
        return
    env.chat.add_message(role="assistant", msg=message)
run_tests
run_tests() -> Tuple[bool, Dict[str, Any]]

Validate the task using the native WorkArena validate() method.

This calls task.validate(page, chat_messages) which checks the actual state of the ServiceNow instance — the canonical evaluation method from the original benchmark.

Source code in src/openjarvis/evals/execution/workarena_env.py
def run_tests(self) -> Tuple[bool, Dict[str, Any]]:
    """Validate the task using the native WorkArena validate() method.

    Calls ``task.validate(page, chat_messages)``, which inspects the actual
    state of the ServiceNow instance — the canonical evaluation method from
    the original benchmark. The outcome is mirrored into ``self._metadata``.
    """
    results: Dict[str, Any] = {"steps_taken": self._step_count}

    def _record(resolved: bool, reward: float) -> None:
        # Mirror the outcome into shared metadata for downstream reporting.
        self._metadata["is_resolved"] = resolved
        self._metadata["reward"] = reward
        self._metadata["test_results"] = results

    if self._env is None or self._env.task is None:
        results["error"] = "environment_not_initialized"
        _record(False, 0.0)
        return False, results

    try:
        chat_history = (
            self._env.chat.messages if self._env.chat is not None else []
        )

        reward, _done, message, info = self._env.task.validate(
            self._env.page, chat_history,
        )

        passed = reward == 1.0
        results["reward"] = reward
        results["is_resolved"] = passed
        results["validate_message"] = message
        results["validate_info"] = _safe_serialize(info)
        results["chat_message_count"] = len(chat_history)

        _record(passed, reward)

        LOGGER.info(
            "WorkArena validate: reward=%.1f resolved=%s msg=%s",
            reward, passed, message,
        )
        return passed, results

    except Exception as exc:
        # Validation touching a live instance can fail in many ways; record
        # the failure rather than crash the surrounding run.
        LOGGER.exception("WorkArena validation failed")
        results["error"] = str(exc)
        _record(False, 0.0)
        return False, results
run_agent_loop
run_agent_loop(generate_fn: Callable[[str], str], max_steps: Optional[int] = None) -> str

Drive the BrowserGym env in a step loop using generate_fn for LLM calls.

generate_fn(prompt) -> response is called once per step. The loop feeds observations to the LLM, parses a BrowserGym action from its response, and steps the environment until the task is done or max_steps is reached.

Validation (run_tests) is not called here — the caller (e.g. AgenticRunner) is responsible for that.

Source code in src/openjarvis/evals/execution/workarena_env.py
def run_agent_loop(
    self,
    generate_fn: Callable[[str], str],
    max_steps: Optional[int] = None,
) -> str:
    """Drive the BrowserGym env step-by-step, using *generate_fn* for LLM calls.

    ``generate_fn(prompt) -> response`` is invoked once per step. Each
    response is parsed into a BrowserGym action and executed until the
    task finishes or *max_steps* is exhausted.

    Validation (``run_tests``) is **not** called here — the caller
    (e.g. ``AgenticRunner``) is responsible for that.
    """
    if self._env is None:
        raise RuntimeError(
            "WorkArena environment not initialised — use as context manager"
        )

    if max_steps is None:
        # L1 tasks are short; other levels get a larger step budget.
        max_steps = 15 if self._metadata.get("level", "l1") == "l1" else 50

    import time as _time

    self.all_responses = []
    self.turn_wall_clocks = []
    # Parse-failure feedback carried into the next step's prompt.
    feedback: Optional[str] = None

    for turn in range(max_steps):
        if self._done:
            break

        prompt = self._build_step_prompt(turn, max_steps, feedback)
        started = _time.monotonic()
        response = generate_fn(prompt)
        self.turn_wall_clocks.append(_time.monotonic() - started)
        self.all_responses.append(response)

        action = self._parse_action(response)
        if action is not None:
            feedback = None
        else:
            # Keep the episode moving with a noop and tell the model why.
            feedback = (
                "Could not parse a valid action from your previous "
                "response. Please respond with exactly one action "
                "call, e.g. click(\"bid_42\")."
            )
            LOGGER.warning(
                "Step %d: unparseable action, issuing noop", turn,
            )
            action = "noop()"

        LOGGER.info("Step %d/%d action: %s", turn + 1, max_steps, action[:200])

        _text, _reward, finished, _info = self.step(action)
        if finished:
            break

    return "\n---\n".join(self.all_responses)