Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GPU Health API improvements #70

Merged
merged 1 commit into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

GOLANG_VERSION := 1.14.2
GOLANGCILINT_TIMEOUT ?= 10m

.PHONY: all binary install check-format
all: binary test-main check-format
Expand Down Expand Up @@ -45,4 +46,4 @@ clean:
rm -f samples/topology/topology

lint:
golangci-lint run ./...
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix
1,081 changes: 610 additions & 471 deletions pkg/dcgm/const.go

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions pkg/dcgm/gpu_group.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package dcgm
#include "dcgm_structs.h"
*/
import "C"

import (
"encoding/binary"
"fmt"
Expand Down Expand Up @@ -92,3 +93,34 @@ func DestroyGroup(groupId GroupHandle) (err error) {

return
}

// GroupInfo is the Go representation of a DCGM group as returned by
// GetGroupInfo.
type GroupInfo struct {
// Version is copied from the version field of the DCGM response structure.
Version uint32
// GroupName is the group name converted from the C string in the response.
GroupName string
// EntityList holds one GroupEntityPair per entity reported in the response.
EntityList []GroupEntityPair
}

// GetGroupInfo queries DCGM for the name and member entities of the group
// identified by groupId.
//
// When the underlying dcgmGroupGetInfo call does not succeed, the error
// produced by errorString is returned and the *GroupInfo is nil.
func GetGroupInfo(groupId GroupHandle) (*GroupInfo, error) {
	response := C.dcgmGroupInfo_v2{
		version: C.dcgmGroupInfo_version2,
	}

	result := C.dcgmGroupGetInfo(handle.handle, groupId.handle, &response)
	if err := errorString(result); err != nil {
		return nil, err
	}

	ret := &GroupInfo{
		Version:   uint32(response.version),
		GroupName: C.GoString(&response.groupName[0]),
	}

	// Never trust the reported count beyond the capacity of the fixed-size
	// array it indexes; an out-of-range index would panic.
	count := int(response.count)
	if limit := len(response.entityList); count > limit {
		count = limit
	}

	// The number of entities is known up front, so allocate once instead of
	// growing on every append. EntityList stays nil for an empty group.
	if count > 0 {
		ret.EntityList = make([]GroupEntityPair, 0, count)
	}

	for i := 0; i < count; i++ {
		entity := &response.entityList[i]
		ret.EntityList = append(ret.EntityList, GroupEntityPair{
			EntityId:      uint(entity.entityId),
			EntityGroupId: Field_Entity_Group(entity.entityGroupId),
		})
	}

	return ret, nil
}
28 changes: 28 additions & 0 deletions pkg/dcgm/gpu_group_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestGroupHandle(t *testing.T) {
Expand All @@ -17,3 +18,30 @@ func TestGroupHandle(t *testing.T) {
assert.Equal(t, input, gh.GetHandle(), "values mismatch")
}
}

// TestGetGroupInfo creates a group, adds one GPU entity to it and verifies
// that GetGroupInfo reports the group's name and exactly that entity.
func TestGetGroupInfo(t *testing.T) {
	teardownTest := setupTest(t)
	defer teardownTest(t)

	runOnlyWithLiveGPUs(t)
	gpus, err := withInjectionGPUs(t, 1)
	require.NoError(t, err)

	gpuID := gpus[0]

	groupID, err := CreateGroup("test1")
	require.NoError(t, err)
	defer func() {
		// Best-effort cleanup; a failure here must not mask the test result.
		_ = DestroyGroup(groupID)
	}()
	err = AddEntityToGroup(groupID, FE_GPU, gpuID)
	require.NoError(t, err)

	grInfo, err := GetGroupInfo(groupID)
	require.NoError(t, err)

	assert.Equal(t, "test1", grInfo.GroupName)
	// require (not assert): the checks below index EntityList[0] and would
	// panic instead of failing cleanly if the list had the wrong length.
	require.Len(t, grInfo.EntityList, 1)
	assert.Equal(t, FE_GPU, grInfo.EntityList[0].EntityGroupId)
	assert.Equal(t, gpuID, grInfo.EntityList[0].EntityId)
}
121 changes: 102 additions & 19 deletions pkg/dcgm/health.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,27 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dcgm

/*
#include "dcgm_agent.h"
#include "dcgm_structs.h"
*/
import "C"

import (
"fmt"
"math/rand"
Expand All @@ -23,14 +40,84 @@ type DeviceHealth struct {
Watches []SystemWatch
}

func setHealthWatches(groupId GroupHandle) (err error) {
result := C.dcgmHealthSet(handle.handle, groupId.handle, C.DCGM_HEALTH_WATCH_ALL)
// HealthSet enables the DCGM health check system for the given systems
func HealthSet(groupId GroupHandle, systems HealthSystem) (err error) {
result := C.dcgmHealthSet(handle.handle, groupId.handle, C.dcgmHealthSystems_t(systems))
if err = errorString(result); err != nil {
return fmt.Errorf("Error setting health watches: %s", err)
return fmt.Errorf("error setting health watches: %w", err)
}
return
}

// HealthGet retrieves the current state of the DCGM health check system for
// the group identified by groupId.
//
// On failure it returns HealthSystem(0) together with the error produced by
// errorString for the dcgmHealthGet result.
func HealthGet(groupId GroupHandle) (HealthSystem, error) {
	var enabled C.dcgmHealthSystems_t

	status := C.dcgmHealthGet(handle.handle, groupId.handle, &enabled)
	err := errorString(status)
	if err != nil {
		return HealthSystem(0), err
	}

	return HealthSystem(enabled), nil
}

type DiagErrorDetail struct {
Message string
Code HealthCheckErrorCode
nvvfedorov marked this conversation as resolved.
Show resolved Hide resolved
}

// Incident is a single entry of the incident list that HealthCheck converts
// from the DCGM health response.
type Incident struct {
// System is converted from the incident's system field.
System HealthSystem
// Health is converted from the incident's health field.
Health HealthResult
// Error holds the message and code of the incident's error.
Error DiagErrorDetail
// EntityInfo identifies the entity (group id and entity id) of the incident.
EntityInfo GroupEntityPair
}

// HealthResponse is the result returned by HealthCheck.
type HealthResponse struct {
// OverallHealth is converted from the overall health of the DCGM response.
OverallHealth HealthResult
// Incidents holds one entry per incident reported in the DCGM response.
Incidents []Incident
}

// HealthCheck check the configured watches for any errors/failures/warnings that have occurred
// since the last time this check was invoked. On the first call, stateful information
// about all of the enabled watches within a group is created but no error results are
// provided. On subsequent calls, any error information will be returned.
func HealthCheck(groupId GroupHandle) (HealthResponse, error) {
var healthResults C.dcgmHealthResponse_v4
healthResults.version = makeVersion4(unsafe.Sizeof(healthResults))

result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults)))

if err := errorString(result); err != nil {
return HealthResponse{}, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result}
}

response := HealthResponse{
nvvfedorov marked this conversation as resolved.
Show resolved Hide resolved
OverallHealth: HealthResult(healthResults.overallHealth),
}

// number of watches that encountred error/warning
incidents := uint(healthResults.incidentCount)

response.Incidents = make([]Incident, incidents)

for i := uint(0); i < incidents; i++ {
response.Incidents[i] = Incident{
System: HealthSystem(healthResults.incidents[i].system),
Health: HealthResult(healthResults.incidents[i].health),
Error: DiagErrorDetail{
Message: *stringPtr(&healthResults.incidents[i].error.msg[0]),
Code: HealthCheckErrorCode(healthResults.incidents[i].error.code),
},
EntityInfo: GroupEntityPair{
EntityGroupId: Field_Entity_Group(healthResults.incidents[i].entityInfo.entityGroupId),
EntityId: uint(healthResults.incidents[i].entityInfo.entityId),
},
}
}

return response, nil
}

func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) {
name := fmt.Sprintf("health%d", rand.Uint64())
groupId, err := CreateGroup(name)
Expand All @@ -43,32 +130,28 @@ func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) {
return
}

err = setHealthWatches(groupId)
err = HealthSet(groupId, DCGM_HEALTH_WATCH_ALL)
if err != nil {
return
}

var healthResults C.dcgmHealthResponse_v4
healthResults.version = makeVersion4(unsafe.Sizeof(healthResults))

result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults)))

if err = errorString(result); err != nil {
return deviceHealth, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result}
result, err := HealthCheck(groupId)
if err != nil {
return
}

status := healthStatus(int8(healthResults.overallHealth))
status := healthStatus(result.OverallHealth)
watches := []SystemWatch{}

// number of watches that encountered error/warning
incidents := uint(healthResults.incidentCount)
incidents := len(result.Incidents)

for j := uint(0); j < incidents; j++ {
for j := 0; j < incidents; j++ {
watch := SystemWatch{
Type: systemWatch(int(healthResults.incidents[j].system)),
Status: healthStatus(int8(healthResults.incidents[j].health)),
Type: systemWatch(result.Incidents[j].System),
Status: healthStatus(result.Incidents[j].Health),

Error: *stringPtr(&healthResults.incidents[j].error.msg[0]),
Error: result.Incidents[j].Error.Message,
}
watches = append(watches, watch)
}
Expand All @@ -82,7 +165,7 @@ func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) {
return
}

func healthStatus(status int8) string {
func healthStatus(status HealthResult) string {
switch status {
case 0:
return "Healthy"
Expand All @@ -94,7 +177,7 @@ func healthStatus(status int8) string {
return "N/A"
}

func systemWatch(watch int) string {
func systemWatch(watch HealthSystem) string {
switch watch {
case 1:
return "PCIe watches"
Expand Down
121 changes: 121 additions & 0 deletions pkg/dcgm/health_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
//go:build linux && cgo

/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dcgm

import (
"strings"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestHealthWhenInvalidGroupID verifies that the health API rejects a group
// handle that does not refer to an existing group.
func TestHealthWhenInvalidGroupID(t *testing.T) {
	teardownTest := setupTest(t)
	defer teardownTest(t)
	runOnlyWithLiveGPUs(t)

	var invalidGroupID uintptr = 99
	gh := GroupHandle{}
	gh.SetHandle(invalidGroupID)

	// require.Error (not assert.Error): the following err.Error() call would
	// dereference a nil error and panic if no error were returned.
	err := HealthSet(gh, DCGM_HEALTH_WATCH_PCIE)
	require.Error(t, err)
	assert.Contains(t, err.Error(), "Setting not configured")

	_, err = HealthGet(gh)
	require.Error(t, err)
	assert.Contains(t, err.Error(), "Setting not configured")

	// NOTE(review): this repeats the HealthGet check above verbatim; it was
	// possibly meant to exercise HealthCheck instead - confirm with the author.
	_, err = HealthGet(gh)
	require.Error(t, err)
	assert.Contains(t, err.Error(), "Setting not configured")
}

// TestHealthCheckPCIE enables the PCIe health watch on a group holding one
// GPU, confirms the group is reported healthy after injecting a zero replay
// counter, then injects a non-zero replay counter and expects a single PCIe
// warning incident for that GPU.
func TestHealthCheckPCIE(t *testing.T) {
	teardown := setupTest(t)
	defer teardown(t)

	runOnlyWithLiveGPUs(t)
	injected, err := withInjectionGPUs(t, 1)
	require.NoError(t, err)

	gpuID := injected[0]

	groupID, err := CreateGroup("test1")
	require.NoError(t, err)
	defer func() {
		_ = DestroyGroup(groupID)
	}()

	require.NoError(t, AddEntityToGroup(groupID, FE_GPU, gpuID))
	require.NoError(t, HealthSet(groupID, DCGM_HEALTH_WATCH_PCIE))

	enabled, err := HealthGet(groupID)
	require.NoError(t, err)
	require.Equal(t, DCGM_HEALTH_WATCH_PCIE, enabled)

	skipTestIfUnhealthy(t, groupID)

	// injectReplayCounter injects a PCIe replay counter sample for gpuID,
	// timestamped at the current time shifted by offset.
	injectReplayCounter := func(offset time.Duration, value int64) error {
		return InjectFieldValue(gpuID,
			DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
			DCGM_FT_INT64,
			0,
			time.Now().Add(offset).UnixMicro(),
			value,
		)
	}

	// Baseline sample in the past with a zero counter: the check must pass.
	require.NoError(t, injectReplayCounter(-50*time.Second, 0))

	health, err := HealthCheck(groupID)
	require.NoError(t, err)
	require.Equal(t, DCGM_HEALTH_RESULT_PASS, health.OverallHealth)

	// inject an error into PCI
	require.NoError(t, injectReplayCounter(100*time.Second, 10))

	health, err = HealthCheck(groupID)
	require.NoError(t, err)
	require.Equal(t, DCGM_HEALTH_RESULT_WARN, health.OverallHealth)
	require.Len(t, health.Incidents, 1)

	incident := health.Incidents[0]
	assert.Equal(t, gpuID, incident.EntityInfo.EntityId)
	assert.Equal(t, DCGM_HEALTH_WATCH_PCIE, incident.System)
	assert.Equal(t, DCGM_FR_PCI_REPLAY_RATE, incident.Error.Code)
}

// skipTestIfUnhealthy runs a health check on the given group and skips the
// calling test when the overall health is not DCGM_HEALTH_RESULT_PASS. The
// skip message lists the error messages of all reported incidents. A failing
// health check call fails the test immediately.
func skipTestIfUnhealthy(t *testing.T, groupId GroupHandle) {
	// Attribute failures and skips to the calling test's line, not this helper.
	t.Helper()

	health, err := HealthCheck(groupId)
	require.NoError(t, err)
	if health.OverallHealth == DCGM_HEALTH_RESULT_PASS {
		return
	}

	messages := make([]string, 0, len(health.Incidents))
	for _, incident := range health.Incidents {
		messages = append(messages, incident.Error.Message)
	}

	t.Skip("Skipping health check test because we are already unhealthy: " + strings.Join(messages, ", "))
}
Loading
Loading