Merge pull request #122 from elezar/gpm-metrics

elezar · web-flow · commit 3ff490b62e26 · 2024-05-24T14:48:32.000+02:00
Fix missing GPM metrics
diff --git a/examples/gpm-metrics/main.go b/examples/gpm-metrics/main.go
@@ -0,0 +1,111 @@
+/**
+# Copyright 2024 NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package main
+
+import (
+	"fmt"
+	"log"
+	"time"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+)
+
+// main initializes NVML and collects GPM metrics for each reported device.
+func main() {
+	if ret := nvml.Init(); ret != nvml.SUCCESS {
+		log.Fatalf("failed to init NVML: %v", ret)
+	}
+	defer func() { _ = nvml.Shutdown() }()
+
+	count, ret := nvml.DeviceGetCount()
+	if ret != nvml.SUCCESS {
+		// log.Fatalf skips deferred calls, so shut NVML down explicitly here.
+		_ = nvml.Shutdown()
+		log.Fatalf("failed to get device count: %v", ret)
+	}
+
+	for i := 0; i < count; i++ {
+		if err := collectGPMMetrics(i); err != nil {
+			log.Printf("failed to get metrics for device %d: %v\n", i, err)
+		}
+	}
+}
+
+// collectGPMMetrics queries the graphics utilization GPM metric for the device
+// at index i using two samples captured one second apart, and prints it.
+// It returns an error when any NVML call fails or the device lacks GPM support.
+func collectGPMMetrics(i int) error {
+	device, ret := nvml.DeviceGetHandleByIndex(i)
+	if ret != nvml.SUCCESS {
+		return fmt.Errorf("could not get devices handle: %w", ret)
+	}
+
+	support, ret := device.GpmQueryDeviceSupport()
+	if ret != nvml.SUCCESS {
+		return fmt.Errorf("could not query GPM support: %w", ret)
+	}
+
+	if support.IsSupportedDevice == 0 {
+		return fmt.Errorf("GPM queries are not supported")
+	}
+
+	// Free is deferred for both samples right after a successful allocation.
+	first, ret := nvml.GpmSampleAlloc()
+	if ret != nvml.SUCCESS {
+		return fmt.Errorf("could not allocate sample: %w", ret)
+	}
+	defer func() { _ = first.Free() }()
+
+	second, ret := nvml.GpmSampleAlloc()
+	if ret != nvml.SUCCESS {
+		return fmt.Errorf("could not allocate sample: %w", ret)
+	}
+	defer func() { _ = second.Free() }()
+
+	if ret := device.GpmSampleGet(first); ret != nvml.SUCCESS {
+		return fmt.Errorf("could not get sample: %w", ret)
+	}
+	// Wait between the two captures.
+	time.Sleep(1 * time.Second)
+	if ret := device.GpmSampleGet(second); ret != nvml.SUCCESS {
+		return fmt.Errorf("could not get sample: %w", ret)
+	}
+
+	// Request a single metric; the remaining array entries stay zero-valued.
+	request := nvml.GpmMetricsGetType{
+		NumMetrics: 1,
+		Sample1:    first,
+		Sample2:    second,
+		Metrics: [98]nvml.GpmMetric{
+			{MetricId: uint32(nvml.GPM_METRIC_GRAPHICS_UTIL)},
+		},
+	}
+
+	if ret := nvml.GpmMetricsGet(&request); ret != nvml.SUCCESS {
+		return fmt.Errorf("failed to get gpm metric: %w", ret)
+	}
+
+	// Print only the entries that carry a non-zero metric ID.
+	for idx := 0; idx < int(request.NumMetrics); idx++ {
+		metric := request.Metrics[idx]
+		if metric.MetricId > 0 {
+			fmt.Printf("%v: %v\n", metric.MetricId, metric.Value)
+		}
+	}
+
+	return nil
+}
diff --git a/pkg/nvml/gpm.go b/pkg/nvml/gpm.go
@@ -51,20 +51,31 @@ func (g *nvmlGpmMetricsGetType) convert() *GpmMetricsGetType {
 
 // nvml.GpmMetricsGet()
 type GpmMetricsGetVType struct {
-	metricsGet *nvmlGpmMetricsGetType
+	metricsGet *GpmMetricsGetType // caller-supplied struct; V1 sets its Version and passes it to gpmMetricsGet
 }
 
 func (l *library) GpmMetricsGetV(metricsGet *GpmMetricsGetType) GpmMetricsGetVType {
-	return GpmMetricsGetVType{metricsGet.convert()}
+	return GpmMetricsGetVType{metricsGet} // wrap the caller's pointer directly; conversion happens in gpmMetricsGet
 }
+
+// nvmlGpmMetricsGetStub is the function gpmMetricsGet invokes; it defaults to nvmlGpmMetricsGet and can be overridden for testing.
+var nvmlGpmMetricsGetStub = nvmlGpmMetricsGet
+
 func (metricsGetV GpmMetricsGetVType) V1() Return {
 	metricsGetV.metricsGet.Version = 1
-	return nvmlGpmMetricsGet(metricsGetV.metricsGet)
+	return gpmMetricsGet(metricsGetV.metricsGet) // helper converts, calls the stub, and writes the result back
 }
 
 func (l *library) GpmMetricsGet(metricsGet *GpmMetricsGetType) Return {
 	metricsGet.Version = GPM_METRICS_GET_VERSION
-	return nvmlGpmMetricsGet(metricsGet.convert())
+	return gpmMetricsGet(metricsGet) // helper converts, calls the stub, and writes the result back
+}
+
+func gpmMetricsGet(metricsGet *GpmMetricsGetType) Return {
+	nvmlMetricsGet := metricsGet.convert()       // build the value handed to the stub
+	ret := nvmlGpmMetricsGetStub(nvmlMetricsGet) // nvmlGpmMetricsGet unless a test replaced it
+	*metricsGet = *nvmlMetricsGet.convert()      // write the converted result back into the caller's struct
+	return ret
 }
 
 // nvml.GpmSampleFree()
diff --git a/pkg/nvml/gpm_test.go b/pkg/nvml/gpm_test.go
@@ -0,0 +1,79 @@
+/**
+# Copyright 2024 NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package nvml
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+// TestGpmMetricsGet checks that GpmMetricsGet sets the current version and
+// copies the metrics written by the underlying call back to the caller.
+func TestGpmMetricsGet(t *testing.T) {
+	want := [98]GpmMetric{{Value: 99}}
+
+	// Replace the NVML call with a fake that fills in the expected metrics.
+	restore := setNvmlGpmMetricsGetStubForTest(func(metricsGet *nvmlGpmMetricsGetType) Return {
+		metricsGet.Metrics = want
+		return SUCCESS
+	})
+	defer restore()
+
+	got := GpmMetricsGetType{
+		Sample1: nvmlGpmSample{},
+		Sample2: nvmlGpmSample{},
+	}
+	ret := GpmMetricsGet(&got)
+
+	require.Equal(t, SUCCESS, ret)
+	require.EqualValues(t, GPM_METRICS_GET_VERSION, got.Version)
+	require.EqualValues(t, want, got.Metrics)
+}
+
+// TestGpmMetricsGetV checks that the V1 handler pins the request to version 1
+// and copies the metrics written by the underlying call back to the caller.
+func TestGpmMetricsGetV(t *testing.T) {
+	want := [98]GpmMetric{{Value: 99}}
+
+	// Replace the NVML call with a fake that fills in the expected metrics.
+	restore := setNvmlGpmMetricsGetStubForTest(func(metricsGet *nvmlGpmMetricsGetType) Return {
+		metricsGet.Metrics = want
+		return SUCCESS
+	})
+	defer restore()
+
+	got := GpmMetricsGetType{
+		Sample1: nvmlGpmSample{},
+		Sample2: nvmlGpmSample{},
+	}
+	ret := GpmMetricsGetV(&got).V1()
+
+	require.Equal(t, SUCCESS, ret)
+	// V1 hard-codes version 1; assert the literal, not GPM_METRICS_GET_VERSION.
+	require.EqualValues(t, 1, got.Version)
+	require.EqualValues(t, want, got.Metrics)
+}
+
+// setNvmlGpmMetricsGetStubForTest replaces nvmlGpmMetricsGetStub with mock and
+// returns a function that puts the previous value back; callers typically
+// defer the returned function.
+func setNvmlGpmMetricsGetStubForTest(mock func(metricsGet *nvmlGpmMetricsGetType) Return) func() {
+	previous := nvmlGpmMetricsGetStub
+	nvmlGpmMetricsGetStub = mock
+	return func() { nvmlGpmMetricsGetStub = previous }
+}