Skip to content

Index

feedback

Feedback subsystem: LLM-as-judge scoring and signal aggregation.

Classes

FeedbackCollector

FeedbackCollector()

Collects feedback signals: explicit user scores + LLM judge evaluations.

Signals are stored in-memory as dicts with at least trace_id, score, source, and timestamp keys.

Source code in src/openjarvis/learning/optimize/feedback/collector.py
def __init__(self) -> None:
    """Initialize an empty in-memory feedback store."""
    # Each record is a plain dict; see the record_* methods for the keys used.
    self._records: List[Dict[str, Any]] = list()
Functions
record_explicit
record_explicit(trace_id: str, score: float, source: str = 'api') -> None

Record an explicit numeric score (0-1) for a trace.

Source code in src/openjarvis/learning/optimize/feedback/collector.py
def record_explicit(
    self,
    trace_id: str,
    score: float,
    source: str = "api",
) -> None:
    """Record an explicit numeric score (0-1) for a trace."""
    # Out-of-range scores are clamped into [0, 1] rather than rejected.
    clamped = max(0.0, min(1.0, score))
    entry: Dict[str, Any] = {
        "trace_id": trace_id,
        "score": clamped,
        "source": source,
        "timestamp": time.time(),
    }
    self._records.append(entry)
record_thumbs
record_thumbs(trace_id: str, thumbs_up: bool) -> None

Record a thumbs-up / thumbs-down signal (converted to 1.0/0.0).

Source code in src/openjarvis/learning/optimize/feedback/collector.py
def record_thumbs(self, trace_id: str, thumbs_up: bool) -> None:
    """Record a thumbs-up / thumbs-down signal (converted to 1.0/0.0)."""
    # A bool converts directly onto the 0-1 score scale used everywhere else.
    entry = {
        "trace_id": trace_id,
        "score": float(thumbs_up),
        "source": "thumbs",
        "timestamp": time.time(),
    }
    self._records.append(entry)
evaluate_traces
evaluate_traces(traces: List[Trace], judge: TraceJudge) -> List[Dict[str, Any]]

Score traces via the LLM judge and record the results.

Returns the list of newly created feedback records.

Source code in src/openjarvis/learning/optimize/feedback/collector.py
def evaluate_traces(
    self,
    traces: List[Trace],
    judge: TraceJudge,
) -> List[Dict[str, Any]]:
    """Score *traces* via the LLM judge and record the results.

    Returns the list of newly created feedback records.
    """
    fresh: List[Dict[str, Any]] = []
    for item in traces:
        verdict, commentary = judge.score_trace(item)
        # Judge records carry an extra "feedback" key with the textual rationale.
        entry: Dict[str, Any] = {
            "trace_id": item.trace_id,
            "score": verdict,
            "source": "judge",
            "feedback": commentary,
            "timestamp": time.time(),
        }
        # Append per-iteration so earlier results survive a mid-batch failure.
        self._records.append(entry)
        fresh.append(entry)
    return fresh
get_records
get_records(trace_id: Optional[str] = None) -> List[Dict[str, Any]]

Return stored records, optionally filtered by trace_id.

Source code in src/openjarvis/learning/optimize/feedback/collector.py
def get_records(
    self, trace_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Return stored records, optionally filtered by *trace_id*."""
    if trace_id is not None:
        return [rec for rec in self._records if rec["trace_id"] == trace_id]
    # No filter: hand back a shallow copy so callers cannot mutate the store.
    return list(self._records)
stats
stats() -> Dict[str, Any]

Return aggregate statistics over all recorded feedback.

Returns a dict with count, mean_score, and a simple distribution bucket (low / medium / high).

Source code in src/openjarvis/learning/optimize/feedback/collector.py
def stats(self) -> Dict[str, Any]:
    """Return aggregate statistics over all recorded feedback.

    Returns a dict with ``count``, ``mean_score``, and a simple
    ``distribution`` bucket (low / medium / high).
    """
    total = len(self._records)
    buckets = {"low": 0, "medium": 0, "high": 0}
    if not total:
        return {"count": 0, "mean_score": 0.0, "distribution": buckets}

    # Single pass: accumulate the sum and bucket each score as we go.
    running = 0.0
    for rec in self._records:
        value = rec["score"]
        running += value
        if value >= 0.7:
            buckets["high"] += 1
        elif value >= 0.4:
            buckets["medium"] += 1
        else:
            buckets["low"] += 1

    return {
        "count": total,
        "mean_score": running / total,
        "distribution": buckets,
    }

TraceJudge

TraceJudge(backend: InferenceBackend, model: str)

LLM-as-judge for scoring traces when no ground truth exists.

Given a Trace, the judge constructs a prompt showing the query, agent steps, and final result, then asks an LLM to rate the quality on a 0-1 scale.

Source code in src/openjarvis/learning/optimize/feedback/judge.py
def __init__(self, backend: InferenceBackend, model: str) -> None:
    """Store the inference backend and model name used for judging."""
    # Both are held privately; score_trace uses them on every generate() call.
    self._backend = backend
    self._model = model
Functions
score_trace
score_trace(trace: Trace) -> Tuple[float, str]

Score a single trace.

Returns: (score, feedback) where score is in [0, 1] and feedback is the judge's textual reasoning.

Source code in src/openjarvis/learning/optimize/feedback/judge.py
def score_trace(self, trace: Trace) -> Tuple[float, str]:
    """Score a single trace.

    Returns:
        ``(score, feedback)`` where *score* is in [0, 1] and
        *feedback* is the judge's textual reasoning.
    """
    # Render the trace into the judge prompt, then ask the backend to rate it.
    judge_input = _format_trace(trace)
    raw = self._backend.generate(
        judge_input,
        model=self._model,
        system=_SYSTEM_PROMPT,
        # Temperature pinned to 0 so judging is as deterministic as possible.
        temperature=0.0,
        max_tokens=1024,
    )
    # The full response doubles as the textual feedback.
    return _parse_score(raw), raw
batch_evaluate
batch_evaluate(traces: List[Trace]) -> List[Tuple[float, str]]

Evaluate multiple traces sequentially.

Returns a list of (score, feedback) tuples, one per trace.

Source code in src/openjarvis/learning/optimize/feedback/judge.py
def batch_evaluate(
    self, traces: List[Trace],
) -> List[Tuple[float, str]]:
    """Evaluate multiple traces sequentially.

    Returns a list of ``(score, feedback)`` tuples, one per trace.
    """
    # One judge call per trace, preserving input order.
    return [self.score_trace(one) for one in traces]