gpu_monitor

gpu_monitor ¶

GPU monitoring via pynvml — background poller for GPU metrics.

Classes¶

GpuHardwareSpec `dataclass` ¶

GpuHardwareSpec(tflops_fp16: float, bandwidth_gb_s: float, tdp_watts: float)

Peak theoretical capabilities for a known GPU model.

GpuSnapshot `dataclass` ¶

GpuSnapshot(power_watts: float, utilization_pct: float, memory_used_gb: float, temperature_c: float, device_id: int = 0)

A single point-in-time reading from one GPU device.

GpuSample `dataclass` ¶

GpuSample(energy_joules: float = 0.0, mean_power_watts: float = 0.0, peak_power_watts: float = 0.0, mean_utilization_pct: float = 0.0, peak_utilization_pct: float = 0.0, mean_memory_used_gb: float = 0.0, peak_memory_used_gb: float = 0.0, mean_temperature_c: float = 0.0, peak_temperature_c: float = 0.0, duration_seconds: float = 0.0, num_snapshots: int = 0)

Aggregated GPU metrics over an inference bracket.

GpuMonitor ¶

GpuMonitor(poll_interval_ms: int = 50)

Background GPU poller using pynvml.

Usage::

mon = GpuMonitor(poll_interval_ms=50)
with mon.sample() as result:
    # ... run inference ...
    pass
print(result.energy_joules)
mon.close()

Source code in src/openjarvis/telemetry/gpu_monitor.py

def __init__(self, poll_interval_ms: int = 50) -> None:
    """Create the monitor and try to initialize NVML.

    Args:
        poll_interval_ms: Polling interval in milliseconds. It is stored
            internally in seconds.

    Initialization is best-effort: any failure is logged at debug level and
    leaves the monitor uninitialized (``_initialized`` is ``False``, no
    handles, device count 0). It never raises.
    """
    self._poll_interval_s = poll_interval_ms / 1000.0
    self._handles: List = []
    self._device_count = 0
    self._initialized = False

    if not _PYNVML_AVAILABLE:
        return

    try:
        pynvml.nvmlInit()
    except Exception as exc:
        logger.debug("GPU monitor initialization failed: %s", exc)
        return

    # NVML is now initialized. If device enumeration fails we must balance
    # the nvmlInit() above, because close() only shuts down when
    # _initialized is True and would otherwise never release it.
    try:
        device_count = pynvml.nvmlDeviceGetCount()
        handles = [
            pynvml.nvmlDeviceGetHandleByIndex(i)
            for i in range(device_count)
        ]
    except Exception as exc:
        logger.debug("GPU monitor initialization failed: %s", exc)
        try:
            pynvml.nvmlShutdown()
        except Exception as shutdown_exc:
            logger.debug("Failed to shut down GPU monitor: %s", shutdown_exc)
        return

    # Commit state only after everything succeeded, so a partial failure
    # cannot leave a nonzero device count without handles.
    self._device_count = device_count
    self._handles = handles
    self._initialized = True

Functions¶

available `staticmethod` ¶

available() -> bool

Return True if pynvml is importable and can be initialized.

Source code in src/openjarvis/telemetry/gpu_monitor.py

@staticmethod
def available() -> bool:
    """Report whether pynvml can be imported and initialized.

    Runs an ``nvmlInit()`` / ``nvmlShutdown()`` round trip as a probe.

    Returns:
        ``True`` if both calls complete without raising. ``False`` if
        pynvml is not available or either call raises; the failure is
        logged at debug level.
    """
    if _PYNVML_AVAILABLE:
        try:
            pynvml.nvmlInit()
            pynvml.nvmlShutdown()
        except Exception as exc:
            logger.debug("GPU monitor availability check failed: %s", exc)
        else:
            return True
    return False

sample ¶

sample() -> Generator[GpuSample, None, None]

Context manager that polls GPUs during the block, then populates the sample.

If pynvml is unavailable or no devices are found, yields an empty `GpuSample` without starting a background thread.

Source code in src/openjarvis/telemetry/gpu_monitor.py

@contextmanager
def sample(self) -> Generator[GpuSample, None, None]:
    """Context manager that polls GPUs during the block, then populates the sample.

    The yielded :class:`GpuSample` is filled in only when the ``with`` block
    exits, so read its fields after the block, not inside it.

    If the monitor is not initialized or no devices were found, yields an
    empty :class:`GpuSample` without starting a background thread. Only
    ``duration_seconds`` is populated in that case.

    Yields:
        The :class:`GpuSample` that receives the aggregated metrics on exit.
    """
    result = GpuSample()

    if not self._initialized or self._device_count == 0:
        t_start = time.monotonic()
        # try/finally so the duration is recorded even when the caller's
        # block raises, matching the GPU-backed path below.
        try:
            yield result
        finally:
            result.duration_seconds = time.monotonic() - t_start
        return

    # Buffers shared with the polling thread; `lock` guards both lists.
    snapshots: List[List[GpuSnapshot]] = []
    timestamps: List[float] = []
    lock = threading.Lock()
    stop_event = threading.Event()

    # Daemon thread, so a stuck poller cannot keep the interpreter alive.
    thread = threading.Thread(
        target=self._polling_loop,
        args=(snapshots, timestamps, lock, stop_event),
        daemon=True,
    )

    t_start = time.monotonic()
    thread.start()
    try:
        yield result
    finally:
        # Signal the poller to stop and wait a bounded time for it.
        stop_event.set()
        thread.join(timeout=2.0)
        wall = time.monotonic() - t_start

        # Copy under the lock: if the join timed out, the poller may still
        # be appending to the shared lists.
        with lock:
            snap_copy = list(snapshots)
            ts_copy = list(timestamps)

        aggregated = self._aggregate(snap_copy, ts_copy, wall)

        # Copy aggregated values into the yielded result object; the caller
        # holds a reference to `result`, so it must be mutated in place.
        result.energy_joules = aggregated.energy_joules
        result.mean_power_watts = aggregated.mean_power_watts
        result.peak_power_watts = aggregated.peak_power_watts
        result.mean_utilization_pct = aggregated.mean_utilization_pct
        result.peak_utilization_pct = aggregated.peak_utilization_pct
        result.mean_memory_used_gb = aggregated.mean_memory_used_gb
        result.peak_memory_used_gb = aggregated.peak_memory_used_gb
        result.mean_temperature_c = aggregated.mean_temperature_c
        result.peak_temperature_c = aggregated.peak_temperature_c
        result.duration_seconds = aggregated.duration_seconds
        result.num_snapshots = aggregated.num_snapshots

close ¶

close() -> None

Shut down pynvml if it was initialized.

Source code in src/openjarvis/telemetry/gpu_monitor.py

def close(self) -> None:
    """Release NVML if this monitor initialized it.

    Does nothing when the monitor is not initialized. A failure in
    ``nvmlShutdown()`` is logged at debug level and not re-raised; the
    monitor is marked uninitialized either way.
    """
    if not self._initialized:
        return

    try:
        pynvml.nvmlShutdown()
    except Exception as exc:
        logger.debug("Failed to shut down GPU monitor: %s", exc)

    self._initialized = False

Functions¶

lookup_gpu_spec ¶

lookup_gpu_spec(name: str) -> Optional[GpuHardwareSpec]

Return the `GpuHardwareSpec` for `name`, or `None` if unknown.

Matching is a case-insensitive substring lookup: the first key in `GPU_SPECS` that occurs within `name` wins.

Source code in src/openjarvis/telemetry/gpu_monitor.py

def lookup_gpu_spec(name: str) -> Optional[GpuHardwareSpec]:
    """Find the hardware spec whose key appears in *name*.

    The comparison is case-insensitive. Keys of :data:`GPU_SPECS` are tried
    in the mapping's iteration order, and the first key that is a substring
    of *name* wins.

    Args:
        name: GPU model name to look up.

    Returns:
        The matching :class:`GpuHardwareSpec`, or ``None`` if no key is
        contained in *name*.
    """
    normalized = name.upper()
    matches = (
        spec for key, spec in GPU_SPECS.items() if key.upper() in normalized
    )
    # next() stops at the first hit, preserving first-match semantics.
    return next(matches, None)

gpu_monitor

gpu_monitor ¶

Classes¶

GpuHardwareSpec dataclass ¶

GpuSnapshot dataclass ¶

GpuSample dataclass ¶

GpuMonitor ¶

Functions¶

available staticmethod ¶

sample ¶

close ¶

Functions¶

lookup_gpu_spec ¶

GpuHardwareSpec `dataclass` ¶

GpuSnapshot `dataclass` ¶

GpuSample `dataclass` ¶

available `staticmethod` ¶