Skip to content

Commit 2c7c378

Browse files
AlexandreKempfpre-commit-ci[bot]Dave Berenbaum
authored
monitor GPU resources (#785)
* add GPU, CPU, RAM, and disk monitoring --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Dave Berenbaum <[email protected]>
1 parent 9eb04c2 commit 2c7c378

File tree

4 files changed

+631
-2
lines changed

4 files changed

+631
-2
lines changed

Diff for: pyproject.toml

+6-2
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@ dependencies = [
3737
"funcy",
3838
"gto",
3939
"ruamel.yaml",
40-
"scmrepo>=3,<4"
40+
"scmrepo>=3,<4",
41+
"psutil",
42+
"pynvml"
4143
]
4244

4345
[project.optional-dependencies]
@@ -51,7 +53,9 @@ tests = [
5153
"pytest-cov>=3.0.0,<4.0",
5254
"pytest-mock>=3.8.2,<4.0",
5355
"dvclive[image,plots,markdown]",
54-
"ipython"
56+
"ipython",
57+
"pytest_voluptuous",
58+
"dpath"
5559
]
5660
dev = [
5761
"dvclive[all,tests]",

Diff for: src/dvclive/live.py

+52
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@
77
import os
88
import shutil
99
import tempfile
10+
1011
from pathlib import Path, PurePath
1112
from typing import Any, Dict, List, Optional, Set, Tuple, Union, TYPE_CHECKING, Literal
1213

14+
1315
if TYPE_CHECKING:
1416
import numpy as np
1517
import pandas as pd
@@ -41,6 +43,7 @@
4143
from .report import BLANK_NOTEBOOK_REPORT, make_report
4244
from .serialize import dump_json, dump_yaml, load_yaml
4345
from .studio import get_dvc_studio_config, post_to_studio
46+
from .monitor_system import _SystemMonitor
4447
from .utils import (
4548
StrPath,
4649
catch_and_warn,
@@ -81,6 +84,7 @@ def __init__(
8184
cache_images: bool = False,
8285
exp_name: Optional[str] = None,
8386
exp_message: Optional[str] = None,
87+
monitor_system: bool = False,
8488
):
8589
"""
8690
Initializes a DVCLive logger. A `Live()` instance is required in order to log
@@ -119,6 +123,8 @@ def __init__(
119123
provided string will be passed to `dvc exp save --message`.
120124
If DVCLive is used inside `dvc exp run`, the option will be ignored, use
121125
`dvc exp run --message` instead.
126+
monitor_system (bool): if `True`, DVCLive will monitor GPU, CPU, ram, and
127+
disk usage. Defaults to `False`.
122128
"""
123129
self.summary: Dict[str, Any] = {}
124130

@@ -165,6 +171,10 @@ def __init__(
165171
self._dvc_studio_config: Dict[str, Any] = {}
166172
self._init_studio()
167173

174+
self._system_monitor: Optional[_SystemMonitor] = None # Monitoring thread
175+
if monitor_system:
176+
self.monitor_system()
177+
168178
def _init_resume(self):
169179
self._read_params()
170180
self.summary = self.read_latest()
@@ -370,6 +380,43 @@ def step(self, value: int) -> None:
370380
self._step = value
371381
logger.debug(f"Step: {self.step}")
372382

383+
def monitor_system(
    self,
    interval: float = 0.05,  # seconds
    num_samples: int = 20,
    directories_to_monitor: Optional[Dict[str, str]] = None,
) -> None:
    """Start logging GPU, CPU, RAM, and disk usage to DVCLive.

    If a system monitor is already attached to this instance, it is ended
    before the new one is created.

    Args:
        interval (float): time between two samples, in seconds. To keep the
            sampling interval small, the maximum value allowed is 0.1 seconds.
            Defaults to 0.05.
        num_samples (int): number of samples collected before they are
            aggregated. The value should be between 1 and 30. Defaults to 20.
        directories_to_monitor (Optional[Dict[str, str]]): maps a metric name
            (the key) to the path of a directory (the value). The metric
            tracked concerns the partition that contains the directory.
            Defaults to `{"main": "/"}`.

    Raises:
        ValueError: if a key of `directories_to_monitor` is not left unchanged
            by `os.path.normpath`.
    """
    previous_monitor = self._system_monitor
    if previous_monitor is not None:
        previous_monitor.end()

    self._system_monitor = _SystemMonitor(
        live=self,
        interval=interval,
        num_samples=num_samples,
        directories_to_monitor=(
            {"main": "/"}
            if directories_to_monitor is None
            else directories_to_monitor
        ),
    )
419+
373420
def sync(self):
374421
self.make_summary()
375422

@@ -857,6 +904,11 @@ def end(self):
857904
# If next_step called before end, don't want to update step number
858905
if "step" in self.summary:
859906
self.step = self.summary["step"]
907+
908+
# Kill threads that monitor the system metrics
909+
if self._system_monitor is not None:
910+
self._system_monitor.end()
911+
860912
self.sync()
861913

862914
if self._inside_dvc_exp and self._dvc_repo:

Diff for: src/dvclive/monitor_system.py

+240
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
import logging
2+
import os
3+
from typing import Dict, Union, Tuple
4+
5+
import psutil
6+
from statistics import mean
7+
from threading import Event, Thread
8+
from funcy import merge_with
9+
10+
try:
11+
from pynvml import (
12+
nvmlInit,
13+
nvmlDeviceGetCount,
14+
nvmlDeviceGetHandleByIndex,
15+
nvmlDeviceGetMemoryInfo,
16+
nvmlDeviceGetUtilizationRates,
17+
nvmlShutdown,
18+
NVMLError,
19+
)
20+
21+
GPU_AVAILABLE = True
22+
except ImportError:
23+
GPU_AVAILABLE = False
24+
25+
# Logger registered under the "dvclive" name.
logger = logging.getLogger("dvclive")
26+
# 1024**3: byte counts are divided by this value to report sizes in GB.
GIGABYTES_DIVIDER = 1024.0**3
27+
28+
# Per-CPU usage (in percent) at or above which a CPU is counted as active when
# computing the parallelization metric.
MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20
29+
30+
# Names of the CPU metrics.
METRIC_CPU_COUNT = "system/cpu/count"
31+
METRIC_CPU_USAGE_PERCENT = "system/cpu/usage (%)"
32+
METRIC_CPU_PARALLELIZATION_PERCENT = "system/cpu/parallelization (%)"
33+
34+
# Names of the RAM metrics.
METRIC_RAM_USAGE_PERCENT = "system/ram/usage (%)"
35+
METRIC_RAM_USAGE_GB = "system/ram/usage (GB)"
36+
METRIC_RAM_TOTAL_GB = "system/ram/total (GB)"
37+
38+
# Name prefixes of the disk metrics; "/<disk name>" is appended per monitored
# directory.
METRIC_DISK_USAGE_PERCENT = "system/disk/usage (%)"
39+
METRIC_DISK_USAGE_GB = "system/disk/usage (GB)"
40+
METRIC_DISK_TOTAL_GB = "system/disk/total (GB)"
41+
42+
# Names of the GPU metrics; except for the count, "/<gpu index>" is appended
# per device.
METRIC_GPU_COUNT = "system/gpu/count"
43+
METRIC_GPU_USAGE_PERCENT = "system/gpu/usage (%)"
44+
METRIC_VRAM_USAGE_PERCENT = "system/vram/usage (%)"
45+
METRIC_VRAM_USAGE_GB = "system/vram/usage (GB)"
46+
METRIC_VRAM_TOTAL_GB = "system/vram/total (GB)"
47+
48+
49+
class _SystemMonitor:
50+
_plot_blacklist_prefix: Tuple = (
51+
METRIC_CPU_COUNT,
52+
METRIC_RAM_TOTAL_GB,
53+
METRIC_DISK_TOTAL_GB,
54+
METRIC_GPU_COUNT,
55+
METRIC_VRAM_TOTAL_GB,
56+
)
57+
58+
def __init__(
59+
self,
60+
live,
61+
interval: float, # seconds
62+
num_samples: int,
63+
directories_to_monitor: Dict[str, str],
64+
):
65+
self._live = live
66+
self._interval = self._check_interval(interval, max_interval=0.1)
67+
self._num_samples = self._check_num_samples(
68+
num_samples, min_num_samples=1, max_num_samples=30
69+
)
70+
self._disks_to_monitor = self._check_directories_to_monitor(
71+
directories_to_monitor
72+
)
73+
self._warn_cpu_problem = True
74+
self._warn_gpu_problem = True
75+
self._warn_disk_doesnt_exist: Dict[str, bool] = {}
76+
77+
self._shutdown_event = Event()
78+
Thread(
79+
target=self._monitoring_loop,
80+
).start()
81+
82+
def _check_interval(self, interval: float, max_interval: float) -> float:
83+
if interval > max_interval:
84+
logger.warning(
85+
f"System monitoring `interval` should be less than {max_interval} "
86+
f"seconds. Setting `interval` to {max_interval} seconds."
87+
)
88+
return max_interval
89+
return interval
90+
91+
def _check_num_samples(
92+
self, num_samples: int, min_num_samples: int, max_num_samples: int
93+
) -> int:
94+
min_num_samples = 1
95+
max_num_samples = 30
96+
if not min_num_samples < num_samples < max_num_samples:
97+
num_samples = max(min(num_samples, max_num_samples), min_num_samples)
98+
logger.warning(
99+
f"System monitoring `num_samples` should be between {min_num_samples} "
100+
f"and {max_num_samples}. Setting `num_samples` to {num_samples}."
101+
)
102+
return num_samples
103+
104+
def _check_directories_to_monitor(
105+
self, directories_to_monitor: Dict[str, str]
106+
) -> Dict[str, str]:
107+
disks_to_monitor = {}
108+
for disk_name, disk_path in directories_to_monitor.items():
109+
if disk_name != os.path.normpath(disk_name):
110+
raise ValueError( # noqa: TRY003
111+
"Keys for `directories_to_monitor` should be a valid name"
112+
f", but got '{disk_name}'."
113+
)
114+
disks_to_monitor[disk_name] = disk_path
115+
return disks_to_monitor
116+
117+
def _monitoring_loop(self):
118+
while not self._shutdown_event.is_set():
119+
self._metrics = {}
120+
for _ in range(self._num_samples):
121+
try:
122+
last_metrics = self._get_metrics()
123+
except psutil.Error:
124+
if self._warn_cpu_problem:
125+
logger.exception("Failed to monitor CPU metrics")
126+
self._warn_cpu_problem = False
127+
except NVMLError:
128+
if self._warn_gpu_problem:
129+
logger.exception("Failed to monitor GPU metrics")
130+
self._warn_gpu_problem = False
131+
132+
self._metrics = merge_with(sum, self._metrics, last_metrics)
133+
self._shutdown_event.wait(self._interval)
134+
if self._shutdown_event.is_set():
135+
break
136+
for name, values in self._metrics.items():
137+
blacklisted = any(
138+
name.startswith(prefix) for prefix in self._plot_blacklist_prefix
139+
)
140+
self._live.log_metric(
141+
name,
142+
values / self._num_samples,
143+
timestamp=True,
144+
plot=None if blacklisted else True,
145+
)
146+
147+
def _get_metrics(self) -> Dict[str, Union[float, int]]:
148+
return {
149+
**self._get_gpu_info(),
150+
**self._get_cpu_info(),
151+
**self._get_ram_info(),
152+
**self._get_disk_info(),
153+
}
154+
155+
def _get_ram_info(self) -> Dict[str, Union[float, int]]:
    """Return the RAM usage (percent and GB) and the total RAM (GB)."""
    memory = psutil.virtual_memory()
    used_gb = memory.used / GIGABYTES_DIVIDER
    total_gb = memory.total / GIGABYTES_DIVIDER
    return {
        METRIC_RAM_USAGE_PERCENT: memory.percent,
        METRIC_RAM_USAGE_GB: used_gb,
        METRIC_RAM_TOTAL_GB: total_gb,
    }
162+
163+
def _get_cpu_info(self) -> Dict[str, Union[float, int]]:
    """Return the CPU count, mean CPU usage, and CPU parallelization.

    Parallelization is the percentage of CPUs whose usage is at or above
    `MINIMUM_CPU_USAGE_TO_BE_ACTIVE`.
    """
    cpus_percent = psutil.cpu_percent(percpu=True)
    # `psutil.cpu_count()` returns None when the count cannot be determined;
    # fall back to the length of the per-CPU list so the division below and
    # the later aggregation of the count never receive None.
    num_cpus = psutil.cpu_count() or len(cpus_percent)
    num_active_cpus = sum(
        1 for percent in cpus_percent if percent >= MINIMUM_CPU_USAGE_TO_BE_ACTIVE
    )
    return {
        METRIC_CPU_COUNT: num_cpus,
        METRIC_CPU_USAGE_PERCENT: mean(cpus_percent),
        METRIC_CPU_PARALLELIZATION_PERCENT: num_active_cpus * 100 / num_cpus,
    }
179+
180+
def _get_disk_info(self) -> Dict[str, Union[float, int]]:
    """Return usage (percent and GB) and total size (GB) per monitored disk.

    A directory whose usage cannot be read is skipped; the warning for it is
    logged only the first time.
    """
    collected = {}
    for disk_name, disk_path in self._disks_to_monitor.items():
        try:
            usage = psutil.disk_usage(disk_path)
        except OSError:
            if self._warn_disk_doesnt_exist.get(disk_name, True):
                logger.warning(
                    f"Couldn't find directory '{disk_path}', ignoring it."
                )
                self._warn_disk_doesnt_exist[disk_name] = False
            continue

        per_prefix = (
            (METRIC_DISK_USAGE_PERCENT, usage.percent),
            (METRIC_DISK_USAGE_GB, usage.used / GIGABYTES_DIVIDER),
            (METRIC_DISK_TOTAL_GB, usage.total / GIGABYTES_DIVIDER),
        )
        for prefix, value in per_prefix:
            # Stripping trailing slashes keeps the key clean when the disk
            # name is empty.
            collected[f"{prefix}/{disk_name}".rstrip("/")] = value
    return collected
202+
203+
def _get_gpu_info(self) -> Dict[str, Union[float, int]]:
    """Return the GPU count and, per GPU, its usage and VRAM metrics.

    Returns an empty dict when `GPU_AVAILABLE` is false. NVML is initialised
    for the duration of the call and always shut down afterwards, even when a
    query raises.
    """
    if not GPU_AVAILABLE:
        return {}

    nvmlInit()
    try:
        num_gpus = nvmlDeviceGetCount()
        gpu_metrics: Dict[str, Union[float, int]] = {
            METRIC_GPU_COUNT: num_gpus,
        }

        for gpu_idx in range(num_gpus):
            gpu_handle = nvmlDeviceGetHandleByIndex(gpu_idx)
            memory_info = nvmlDeviceGetMemoryInfo(gpu_handle)
            usage_info = nvmlDeviceGetUtilizationRates(gpu_handle)

            gpu_metrics.update(
                {
                    # NOTE(review): this is the ratio of `usage_info.memory`
                    # to `usage_info.gpu`, not `usage_info.gpu` itself —
                    # confirm this is the intended definition of GPU usage.
                    f"{METRIC_GPU_USAGE_PERCENT}/{gpu_idx}": (
                        100 * usage_info.memory / usage_info.gpu
                        if usage_info.gpu
                        else 0
                    ),
                    f"{METRIC_VRAM_USAGE_PERCENT}/{gpu_idx}": (
                        100 * memory_info.used / memory_info.total
                    ),
                    f"{METRIC_VRAM_USAGE_GB}/{gpu_idx}": (
                        memory_info.used / GIGABYTES_DIVIDER
                    ),
                    f"{METRIC_VRAM_TOTAL_GB}/{gpu_idx}": (
                        memory_info.total / GIGABYTES_DIVIDER
                    ),
                }
            )
        return gpu_metrics
    finally:
        # Release NVML even if one of the queries above raised.
        nvmlShutdown()
238+
239+
def end(self):
    """Ask the monitoring loop to stop.

    Only the shutdown event is set; this does not wait for the loop to
    finish.
    """
    stop_signal = self._shutdown_event
    stop_signal.set()

0 commit comments

Comments
 (0)