Skip to content

Commit 3ff490b

Browse files
authored
Merge pull request #122 from elezar/gpm-metrics
Fix missing GPM metrics
2 parents a94486b + 72a248c commit 3ff490b

File tree

3 files changed

+205
-4
lines changed

3 files changed

+205
-4
lines changed

examples/gpm-metrics/main.go

+111
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,111 @@
1+
/**
2+
# Copyright 2024 NVIDIA CORPORATION
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package main
18+
19+
import (
20+
"fmt"
21+
"log"
22+
"time"
23+
24+
"github.com/NVIDIA/go-nvml/pkg/nvml"
25+
)
26+
27+
func main() {
28+
ret := nvml.Init()
29+
if ret != nvml.SUCCESS {
30+
log.Fatalf("failed to init NVML: %v", ret)
31+
}
32+
defer func() {
33+
_ = nvml.Shutdown()
34+
}()
35+
36+
count, ret := nvml.DeviceGetCount()
37+
if ret != nvml.SUCCESS {
38+
log.Fatalf("failed to get device count: %v", ret)
39+
}
40+
41+
for i := 0; i < count; i++ {
42+
if err := collectGPMMetrics(i); err != nil {
43+
log.Printf("failed to get metrics for device %d: %v\n", i, err)
44+
}
45+
}
46+
}
47+
48+
// collectGPMMetrics gets GPM metrics for a specified device.
49+
func collectGPMMetrics(i int) error {
50+
device, ret := nvml.DeviceGetHandleByIndex(i)
51+
if ret != nvml.SUCCESS {
52+
return fmt.Errorf("could not get devices handle: %w", ret)
53+
}
54+
55+
gpuQuerySupport, ret := device.GpmQueryDeviceSupport()
56+
if ret != nvml.SUCCESS {
57+
return fmt.Errorf("could not query GPM support: %w", ret)
58+
}
59+
60+
if gpuQuerySupport.IsSupportedDevice == 0 {
61+
return fmt.Errorf("GPM queries are not supported")
62+
}
63+
64+
sample1, ret := nvml.GpmSampleAlloc()
65+
if ret != nvml.SUCCESS {
66+
return fmt.Errorf("could not allocate sample: %w", ret)
67+
}
68+
defer func() {
69+
_ = sample1.Free()
70+
}()
71+
sample2, ret := nvml.GpmSampleAlloc()
72+
if ret != nvml.SUCCESS {
73+
return fmt.Errorf("could not allocate sample: %w", ret)
74+
}
75+
defer func() {
76+
_ = sample2.Free()
77+
}()
78+
79+
if ret := device.GpmSampleGet(sample1); ret != nvml.SUCCESS {
80+
return fmt.Errorf("could not get sample: %w", ret)
81+
}
82+
// add a delay between samples.
83+
time.Sleep(1 * time.Second)
84+
if ret := device.GpmSampleGet(sample2); ret != nvml.SUCCESS {
85+
return fmt.Errorf("could not get sample: %w", ret)
86+
}
87+
88+
gpmMetric := nvml.GpmMetricsGetType{
89+
NumMetrics: 1,
90+
Sample1: sample1,
91+
Sample2: sample2,
92+
Metrics: [98]nvml.GpmMetric{
93+
{
94+
MetricId: uint32(nvml.GPM_METRIC_GRAPHICS_UTIL),
95+
},
96+
},
97+
}
98+
99+
ret = nvml.GpmMetricsGet(&gpmMetric)
100+
if ret != nvml.SUCCESS {
101+
return fmt.Errorf("failed to get gpm metric: %w", ret)
102+
}
103+
104+
for i := 0; i < int(gpmMetric.NumMetrics); i++ {
105+
if gpmMetric.Metrics[i].MetricId > 0 {
106+
fmt.Printf("%v: %v\n", gpmMetric.Metrics[i].MetricId, gpmMetric.Metrics[i].Value)
107+
}
108+
}
109+
110+
return nil
111+
}

pkg/nvml/gpm.go

+15 -4
Original file line number | Diff line number | Diff line change
@@ -51,20 +51,31 @@ func (g *nvmlGpmMetricsGetType) convert() *GpmMetricsGetType {
5151

5252
// nvml.GpmMetricsGet()
5353
type GpmMetricsGetVType struct {
54-
metricsGet *nvmlGpmMetricsGetType
54+
metricsGet *GpmMetricsGetType
5555
}
5656

5757
func (l *library) GpmMetricsGetV(metricsGet *GpmMetricsGetType) GpmMetricsGetVType {
58-
return GpmMetricsGetVType{metricsGet.convert()}
58+
return GpmMetricsGetVType{metricsGet}
5959
}
60+
61+
// nvmlGpmMetricsGetStub is a stub function that can be overridden for testing.
62+
var nvmlGpmMetricsGetStub = nvmlGpmMetricsGet
63+
6064
func (metricsGetV GpmMetricsGetVType) V1() Return {
6165
metricsGetV.metricsGet.Version = 1
62-
return nvmlGpmMetricsGet(metricsGetV.metricsGet)
66+
return gpmMetricsGet(metricsGetV.metricsGet)
6367
}
6468

6569
func (l *library) GpmMetricsGet(metricsGet *GpmMetricsGetType) Return {
6670
metricsGet.Version = GPM_METRICS_GET_VERSION
67-
return nvmlGpmMetricsGet(metricsGet.convert())
71+
return gpmMetricsGet(metricsGet)
72+
}
73+
74+
func gpmMetricsGet(metricsGet *GpmMetricsGetType) Return {
75+
nvmlMetricsGet := metricsGet.convert()
76+
ret := nvmlGpmMetricsGetStub(nvmlMetricsGet)
77+
*metricsGet = *nvmlMetricsGet.convert()
78+
return ret
6879
}
6980

7081
// nvml.GpmSampleFree()

pkg/nvml/gpm_test.go

+79
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,79 @@
1+
/**
2+
# Copyright 2024 NVIDIA CORPORATION
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package nvml
18+
19+
import (
20+
"testing"
21+
22+
"github.com/stretchr/testify/require"
23+
)
24+
25+
func TestGpmMetricsGet(t *testing.T) {
26+
overrideMetrics := [98]GpmMetric{
27+
{
28+
Value: 99,
29+
},
30+
}
31+
defer setNvmlGpmMetricsGetStubForTest(func(metricsGet *nvmlGpmMetricsGetType) Return {
32+
metricsGet.Metrics = overrideMetrics
33+
return SUCCESS
34+
})()
35+
36+
metrics := GpmMetricsGetType{
37+
Sample1: nvmlGpmSample{},
38+
Sample2: nvmlGpmSample{},
39+
}
40+
ret := GpmMetricsGet(&metrics)
41+
42+
require.Equal(t, SUCCESS, ret)
43+
require.EqualValues(t, GPM_METRICS_GET_VERSION, metrics.Version)
44+
45+
require.EqualValues(t, overrideMetrics, metrics.Metrics)
46+
}
47+
48+
func TestGpmMetricsGetV(t *testing.T) {
49+
overrideMetrics := [98]GpmMetric{
50+
{
51+
Value: 99,
52+
},
53+
}
54+
defer setNvmlGpmMetricsGetStubForTest(func(metricsGet *nvmlGpmMetricsGetType) Return {
55+
metricsGet.Metrics = overrideMetrics
56+
return SUCCESS
57+
})()
58+
59+
metrics := GpmMetricsGetType{
60+
Sample1: nvmlGpmSample{},
61+
Sample2: nvmlGpmSample{},
62+
}
63+
64+
ret := GpmMetricsGetV(&metrics).V1()
65+
66+
require.Equal(t, SUCCESS, ret)
67+
require.EqualValues(t, GPM_METRICS_GET_VERSION, metrics.Version)
68+
69+
require.EqualValues(t, overrideMetrics, metrics.Metrics)
70+
}
71+
72+
func setNvmlGpmMetricsGetStubForTest(mock func(metricsGet *nvmlGpmMetricsGetType) Return) func() {
73+
original := nvmlGpmMetricsGetStub
74+
75+
nvmlGpmMetricsGetStub = mock
76+
return func() {
77+
nvmlGpmMetricsGetStub = original
78+
}
79+
}

0 commit comments

Comments
 (0)