|
| 1 | +import logging |
| 2 | +import os |
| 3 | +from typing import Dict, Union, Tuple |
| 4 | + |
| 5 | +import psutil |
| 6 | +from statistics import mean |
| 7 | +from threading import Event, Thread |
| 8 | +from funcy import merge_with |
| 9 | + |
| 10 | +try: |
| 11 | + from pynvml import ( |
| 12 | + nvmlInit, |
| 13 | + nvmlDeviceGetCount, |
| 14 | + nvmlDeviceGetHandleByIndex, |
| 15 | + nvmlDeviceGetMemoryInfo, |
| 16 | + nvmlDeviceGetUtilizationRates, |
| 17 | + nvmlShutdown, |
| 18 | + NVMLError, |
| 19 | + ) |
| 20 | + |
| 21 | + GPU_AVAILABLE = True |
| 22 | +except ImportError: |
| 23 | + GPU_AVAILABLE = False |
| 24 | + |
| 25 | +logger = logging.getLogger("dvclive") |
| 26 | +GIGABYTES_DIVIDER = 1024.0**3 |
| 27 | + |
| 28 | +MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20 |
| 29 | + |
| 30 | +METRIC_CPU_COUNT = "system/cpu/count" |
| 31 | +METRIC_CPU_USAGE_PERCENT = "system/cpu/usage (%)" |
| 32 | +METRIC_CPU_PARALLELIZATION_PERCENT = "system/cpu/parallelization (%)" |
| 33 | + |
| 34 | +METRIC_RAM_USAGE_PERCENT = "system/ram/usage (%)" |
| 35 | +METRIC_RAM_USAGE_GB = "system/ram/usage (GB)" |
| 36 | +METRIC_RAM_TOTAL_GB = "system/ram/total (GB)" |
| 37 | + |
| 38 | +METRIC_DISK_USAGE_PERCENT = "system/disk/usage (%)" |
| 39 | +METRIC_DISK_USAGE_GB = "system/disk/usage (GB)" |
| 40 | +METRIC_DISK_TOTAL_GB = "system/disk/total (GB)" |
| 41 | + |
| 42 | +METRIC_GPU_COUNT = "system/gpu/count" |
| 43 | +METRIC_GPU_USAGE_PERCENT = "system/gpu/usage (%)" |
| 44 | +METRIC_VRAM_USAGE_PERCENT = "system/vram/usage (%)" |
| 45 | +METRIC_VRAM_USAGE_GB = "system/vram/usage (GB)" |
| 46 | +METRIC_VRAM_TOTAL_GB = "system/vram/total (GB)" |
| 47 | + |
| 48 | + |
class _SystemMonitor:
    """Background sampler that logs CPU/RAM/disk/GPU metrics via a `Live` object.

    A daemon thread takes `num_samples` samples spaced `interval` seconds
    apart, averages each metric over the samples, and logs the averages with
    `live.log_metric`. Call `end()` to stop the thread.
    """

    # Metrics that stay constant over a run (counts and totals) are logged
    # but excluded from plotting.
    _plot_blacklist_prefix: Tuple = (
        METRIC_CPU_COUNT,
        METRIC_RAM_TOTAL_GB,
        METRIC_DISK_TOTAL_GB,
        METRIC_GPU_COUNT,
        METRIC_VRAM_TOTAL_GB,
    )

    def __init__(
        self,
        live,
        interval: float,  # seconds between two consecutive samples
        num_samples: int,
        directories_to_monitor: Dict[str, str],
    ):
        """Validate the configuration and start the sampling thread.

        Args:
            live: object exposing `log_metric(name, value, timestamp, plot)`.
            interval: seconds between samples; clamped to at most 0.1.
            num_samples: samples per averaging window; clamped into [1, 30].
            directories_to_monitor: metric-name -> filesystem-path mapping for
                disk-usage metrics.

        Raises:
            ValueError: if a `directories_to_monitor` key is not a clean name.
        """
        self._live = live
        self._interval = self._check_interval(interval, max_interval=0.1)
        self._num_samples = self._check_num_samples(
            num_samples, min_num_samples=1, max_num_samples=30
        )
        self._disks_to_monitor = self._check_directories_to_monitor(
            directories_to_monitor
        )
        # Each failure class warns once, then goes quiet to avoid log spam.
        self._warn_cpu_problem = True
        self._warn_gpu_problem = True
        self._warn_disk_doesnt_exist: Dict[str, bool] = {}

        self._shutdown_event = Event()
        # daemon=True: a forgotten `end()` must not block interpreter exit.
        Thread(
            target=self._monitoring_loop,
            daemon=True,
        ).start()

    def _check_interval(self, interval: float, max_interval: float) -> float:
        """Return `interval` clamped to `max_interval`, warning when clamped."""
        if interval > max_interval:
            logger.warning(
                f"System monitoring `interval` should be less than {max_interval} "
                f"seconds. Setting `interval` to {max_interval} seconds."
            )
            return max_interval
        return interval

    def _check_num_samples(
        self, num_samples: int, min_num_samples: int, max_num_samples: int
    ) -> int:
        """Return `num_samples` clamped into [min_num_samples, max_num_samples].

        Warns only when the value is actually out of range; the bounds come
        from the parameters (the original shadowed them with hard-coded
        locals and warned spuriously on the valid endpoints).
        """
        if not min_num_samples <= num_samples <= max_num_samples:
            num_samples = max(min(num_samples, max_num_samples), min_num_samples)
            logger.warning(
                f"System monitoring `num_samples` should be between {min_num_samples} "
                f"and {max_num_samples}. Setting `num_samples` to {num_samples}."
            )
        return num_samples

    def _check_directories_to_monitor(
        self, directories_to_monitor: Dict[str, str]
    ) -> Dict[str, str]:
        """Validate the name -> path mapping used for disk metrics.

        Keys become metric-path components, so any key that `os.path.normpath`
        would rewrite (e.g. "a/../b" or a trailing slash) is rejected.
        """
        disks_to_monitor = {}
        for disk_name, disk_path in directories_to_monitor.items():
            if disk_name != os.path.normpath(disk_name):
                raise ValueError(  # noqa: TRY003
                    "Keys for `directories_to_monitor` should be a valid name"
                    f", but got '{disk_name}'."
                )
            disks_to_monitor[disk_name] = disk_path
        return disks_to_monitor

    def _monitoring_loop(self):
        """Sample, average, and log metrics until the shutdown event is set."""
        while not self._shutdown_event.is_set():
            self._metrics = {}
            for _ in range(self._num_samples):
                # A failed sample contributes an empty dict instead of
                # re-adding the previous sample (which skewed the average)
                # or raising NameError on a first-sample failure.
                sample: Dict[str, Union[float, int]] = {}
                try:
                    sample = self._get_metrics()
                except psutil.Error:
                    if self._warn_cpu_problem:
                        logger.exception("Failed to monitor CPU metrics")
                        self._warn_cpu_problem = False
                except Exception as exc:
                    # `NVMLError` only exists when the pynvml import succeeded;
                    # naming it in an `except` clause would itself raise
                    # NameError otherwise. Check dynamically and re-raise
                    # anything that is not a GPU-monitoring failure.
                    if GPU_AVAILABLE and isinstance(exc, NVMLError):
                        if self._warn_gpu_problem:
                            logger.exception("Failed to monitor GPU metrics")
                            self._warn_gpu_problem = False
                    else:
                        raise

                # Accumulate per-metric sums; divided by num_samples below.
                self._metrics = merge_with(sum, self._metrics, sample)
                self._shutdown_event.wait(self._interval)
                if self._shutdown_event.is_set():
                    break
            for name, values in self._metrics.items():
                blacklisted = any(
                    name.startswith(prefix) for prefix in self._plot_blacklist_prefix
                )
                self._live.log_metric(
                    name,
                    values / self._num_samples,
                    timestamp=True,
                    # NOTE(review): blacklisted metrics pass plot=None (defer to
                    # Live's default) rather than False — confirm intended.
                    plot=None if blacklisted else True,
                )

    def _get_metrics(self) -> Dict[str, Union[float, int]]:
        """Collect one sample across all metric groups."""
        return {
            **self._get_gpu_info(),
            **self._get_cpu_info(),
            **self._get_ram_info(),
            **self._get_disk_info(),
        }

    def _get_ram_info(self) -> Dict[str, Union[float, int]]:
        """Sample system RAM usage via psutil."""
        ram_info = psutil.virtual_memory()
        return {
            METRIC_RAM_USAGE_PERCENT: ram_info.percent,
            METRIC_RAM_USAGE_GB: ram_info.used / GIGABYTES_DIVIDER,
            METRIC_RAM_TOTAL_GB: ram_info.total / GIGABYTES_DIVIDER,
        }

    def _get_cpu_info(self) -> Dict[str, Union[float, int]]:
        """Sample CPU usage; parallelization is the share of busy cores (%)."""
        num_cpus = psutil.cpu_count()
        cpus_percent = psutil.cpu_percent(percpu=True)
        num_active_cpus = sum(
            1
            for percent in cpus_percent
            if percent >= MINIMUM_CPU_USAGE_TO_BE_ACTIVE
        )
        return {
            METRIC_CPU_COUNT: num_cpus,
            METRIC_CPU_USAGE_PERCENT: mean(cpus_percent),
            METRIC_CPU_PARALLELIZATION_PERCENT: num_active_cpus * 100 / num_cpus,
        }

    def _get_disk_info(self) -> Dict[str, Union[float, int]]:
        """Sample usage of each monitored directory; missing paths warn once."""
        result = {}
        for disk_name, disk_path in self._disks_to_monitor.items():
            try:
                disk_info = psutil.disk_usage(disk_path)
            except OSError:
                if self._warn_disk_doesnt_exist.get(disk_name, True):
                    logger.warning(
                        f"Couldn't find directory '{disk_path}', ignoring it."
                    )
                    self._warn_disk_doesnt_exist[disk_name] = False
                continue
            # An empty disk_name (the default entry) gets no "/name" suffix;
            # this replaces the original's post-hoc rstrip("/") fix-up.
            suffix = f"/{disk_name}" if disk_name else ""
            result[f"{METRIC_DISK_USAGE_PERCENT}{suffix}"] = disk_info.percent
            result[f"{METRIC_DISK_USAGE_GB}{suffix}"] = (
                disk_info.used / GIGABYTES_DIVIDER
            )
            result[f"{METRIC_DISK_TOTAL_GB}{suffix}"] = (
                disk_info.total / GIGABYTES_DIVIDER
            )
        return result

    def _get_gpu_info(self) -> Dict[str, Union[float, int]]:
        """Sample GPU utilization and VRAM through NVML; {} if pynvml is absent."""
        if not GPU_AVAILABLE:
            return {}

        nvmlInit()
        try:
            num_gpus = nvmlDeviceGetCount()
            gpu_metrics: Dict[str, Union[float, int]] = {
                METRIC_GPU_COUNT: num_gpus,
            }

            for gpu_idx in range(num_gpus):
                gpu_handle = nvmlDeviceGetHandleByIndex(gpu_idx)
                memory_info = nvmlDeviceGetMemoryInfo(gpu_handle)
                usage_info = nvmlDeviceGetUtilizationRates(gpu_handle)

                gpu_metrics.update(
                    {
                        # NOTE(review): this reports memory-controller activity
                        # relative to GPU activity (memory/gpu), not
                        # `usage_info.gpu` directly — confirm this ratio is the
                        # intended "usage" metric.
                        f"{METRIC_GPU_USAGE_PERCENT}/{gpu_idx}": (
                            100 * usage_info.memory / usage_info.gpu
                            if usage_info.gpu
                            else 0
                        ),
                        f"{METRIC_VRAM_USAGE_PERCENT}/{gpu_idx}": (
                            100 * memory_info.used / memory_info.total
                        ),
                        f"{METRIC_VRAM_USAGE_GB}/{gpu_idx}": (
                            memory_info.used / GIGABYTES_DIVIDER
                        ),
                        f"{METRIC_VRAM_TOTAL_GB}/{gpu_idx}": (
                            memory_info.total / GIGABYTES_DIVIDER
                        ),
                    }
                )
            return gpu_metrics
        finally:
            # Always release NVML, even when a per-device query raises
            # (the original leaked the session on any NVML error mid-loop).
            nvmlShutdown()

    def end(self):
        """Signal the monitoring thread to stop."""
        self._shutdown_event.set()
0 commit comments