Skip to content

Commit

Permalink
The GetValuesSince binding added
Browse files Browse the repository at this point in the history
Signed-off-by: Vadym Fedorov <[email protected]>
  • Loading branch information
nvvfedorov committed Jan 17, 2024
1 parent 26fbf85 commit 2e5142b
Show file tree
Hide file tree
Showing 10 changed files with 321 additions and 100 deletions.
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,6 @@ clean:
rm -f samples/processInfo/processInfo
rm -f samples/restApi/restApi
rm -f samples/topology/topology

lint:
golangci-lint run ./...
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,5 @@ require (
require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/stretchr/objx v0.5.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
12 changes: 0 additions & 12 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,28 +1,16 @@
github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww=
github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y=
github.com/bits-and-blooms/bitset v1.2.1 h1:M+/hrU9xlMp7t4TyTDQW97d3tRPVuKFC6zBEK16QnXY=
github.com/bits-and-blooms/bitset v1.2.1/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE=
github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI=
github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
87 changes: 0 additions & 87 deletions pkg/dcgm/bcast.go

This file was deleted.

107 changes: 107 additions & 0 deletions pkg/dcgm/field_values.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dcgm

/*
#include "dcgm_agent.h"
#include "dcgm_structs.h"
#include "field_values_cb.h"
extern int go_dcgmFieldValueEntityEnumeration(dcgm_field_entity_group_t entityGroupId,
dcgm_field_eid_t entityId,
dcgmFieldValue_v1 *values,
int numValues,
void *userData);
*/
import "C"
import (
"fmt"
"sync"
"time"
"unsafe"
)

// callback collects the field values handed to
// go_dcgmFieldValueEntityEnumeration through its userData pointer.
type callback struct {
	// mu is held by processValues while it appends to Values.
	mu sync.Mutex
	// Values accumulates the converted field values received so far.
	Values []FieldValue_v2
}

// processValues converts the C field values reported for a single entity and
// appends the converted values to cb.Values.
//
// The conversion runs before the lock is taken; only the append itself is
// performed while holding cb.mu.
func (cb *callback) processValues(entityGroup Field_Entity_Group, entityID uint, cvalues []C.dcgmFieldValue_v1) {
	converted := dcgmFieldValue_v1ToFieldValue_v2(entityGroup, entityID, cvalues)

	cb.mu.Lock()
	cb.Values = append(cb.Values, converted...)
	cb.mu.Unlock()
}

// go_dcgmFieldValueEntityEnumeration is the Go half of the field-value
// enumeration callback. The C trampoline fieldValueEntityCallback forwards its
// arguments here unchanged.
//
// values points at numValues consecutive dcgmFieldValue_v1 records belonging to
// the entity identified by entityGroup/entityID. userData is expected to be the
// *callback that GetValuesSince passed to dcgmGetValuesSince_v2; the records
// are converted and appended to it.
//
// The function always returns 0. Calls that carry no usable data (nil values,
// nil userData, or a non-positive numValues) are ignored.
//
//export go_dcgmFieldValueEntityEnumeration
func go_dcgmFieldValueEntityEnumeration(
	entityGroup C.dcgm_field_entity_group_t,
	entityID C.dcgm_field_eid_t,
	values *C.dcgmFieldValue_v1,
	numValues C.int,
	userData unsafe.Pointer,
) C.int {
	// A negative count must be rejected before slicing: a slice-bounds panic
	// raised inside a callback invoked from C would take the process down.
	if values == nil || userData == nil || numValues <= 0 {
		return 0
	}

	// unsafe.Slice builds the view with the exact length instead of going
	// through an oversized fake array type, which fails to compile on 32-bit
	// targets for a struct this large.
	valuesSlice := unsafe.Slice(values, int(numValues))

	processor := (*callback)(userData)
	processor.processValues(Field_Entity_Group(entityGroup), uint(entityID), valuesSlice)

	return 0
}

// GetValuesSince reads and returns field values for a specified group of entities, such as GPUs,
// that have been updated since a given timestamp. It allows for targeted data retrieval based on time criteria.
//
// GPUGroup is a GroupHandle that identifies the group of entities to operate on. It can be obtained from CreateGroup
// for a specific group of GPUs or use GroupAllGPUs() to target all GPUs.
//
// fieldGroup is a FieldHandle representing the group of fields for which data is requested.
//
// sinceTime is a time.Time value representing the timestamp from which to request updated values.
// A zero value (time.Time{}) requests all available data; it is sent to DCGM as timestamp 0.
//
// Returns the []FieldValue_v2 slice collected by the enumeration callback, the next-since timestamp
// reported by dcgmGetValuesSince_v2 converted to a time.Time, and an error if DCGM returns a status
// other than DCGM_ST_OK. On error the slice is nil and the time is the zero time.
func GetValuesSince(GPUGroup GroupHandle, fieldGroup FieldHandle, sinceTime time.Time) ([]FieldValue_v2, time.Time, error) {
	// time.Time{} is January 1, year 1, so UnixMicro() on it yields a large
	// negative number rather than 0. Map the zero time explicitly to 0, which
	// is the value the DCGM API documents for an initial request.
	var sinceTimestamp C.longlong
	if !sinceTime.IsZero() {
		sinceTimestamp = C.longlong(sinceTime.UnixMicro())
	}

	var nextSinceTimestamp C.longlong

	// cbResult is handed to DCGM as opaque user data and filled in by
	// go_dcgmFieldValueEntityEnumeration during the call below.
	cbResult := &callback{}

	result := C.dcgmGetValuesSince_v2(handle.handle,
		GPUGroup.handle,
		C.dcgmFieldGrp_t(fieldGroup.handle),
		sinceTimestamp,
		&nextSinceTimestamp,
		(C.dcgmFieldValueEnumeration_f)(unsafe.Pointer(C.fieldValueEntityCallback)),
		unsafe.Pointer(cbResult))
	if result != C.DCGM_ST_OK {
		return nil, time.Time{}, fmt.Errorf("dcgmGetValuesSince_v2 failed with error code %d", int(result))
	}

	return cbResult.Values, timestampUSECToTime(int64(nextSinceTimestamp)), nil
}

// timestampUSECToTime converts a Unix timestamp expressed in microseconds
// into the corresponding local time.Time.
//
// time.UnixMicro performs the seconds/nanoseconds split itself, so negative
// timestamps are handled the same way as with manual division and modulo.
func timestampUSECToTime(timestampUSEC int64) time.Time {
	return time.UnixMicro(timestampUSEC)
}
11 changes: 11 additions & 0 deletions pkg/dcgm/field_values_cb.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#include "dcgm_agent.h"
#include "dcgm_structs.h"
#include "_cgo_export.h"

/*
 * fieldValueEntityCallback is the C-callable trampoline for the field value
 * enumeration callback. It passes every argument through unchanged to the
 * exported Go function go_dcgmFieldValueEntityEnumeration and returns that
 * function's result.
 */
int fieldValueEntityCallback(dcgm_field_entity_group_t entityGroupId, dcgm_field_eid_t entityId,
                             dcgmFieldValue_v1 *values, int numValues, void *userData)
{
    int status = go_dcgmFieldValueEntityEnumeration(entityGroupId, entityId, values, numValues, userData);

    return status;
}
13 changes: 13 additions & 0 deletions pkg/dcgm/field_values_cb.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/* Include guard for the field value callback declaration. */
#ifndef FIELD_VALUES
#define FIELD_VALUES

#include "dcgm_agent.h"
#include "dcgm_structs.h"

/*
 * Trampoline defined in field_values_cb.c; it forwards its arguments to the
 * Go function go_dcgmFieldValueEntityEnumeration and returns its result.
 */
int fieldValueEntityCallback(dcgm_field_entity_group_t entityGroupId,
                             dcgm_field_eid_t entityId,
                             dcgmFieldValue_v1 *values,
                             int numValues,
                             void *userData);

#endif /* FIELD_VALUES */
101 changes: 101 additions & 0 deletions pkg/dcgm/field_values_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dcgm

import (
"fmt"
"math/rand"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestGetValuesSince checks GetValuesSince on a live GPU for two cases: a
// field group with no data yet, and a field group after five XID error
// samples have been injected.
func TestGetValuesSince(t *testing.T) {
	teardownTest := setupTest(t)
	defer teardownTest(t)
	runOnlyWithLiveGPUs(t)

	const gpu uint = 0

	// Create a group of fields
	deviceFields := []Short{DCGM_FI_DEV_XID_ERRORS}

	fieldGroupName := fmt.Sprintf("fieldGroupName%d", rand.Uint64())
	fieldsGroup, err := FieldGroupCreate(fieldGroupName, deviceFields)
	// Both subtests depend on the field group, so stop if it was not created.
	require.NoError(t, err)
	defer func() {
		_ = FieldGroupDestroy(fieldsGroup)
	}()

	t.Run("When there is no data return error", func(t *testing.T) {
		values, nextTime, err := GetValuesSince(GroupAllGPUs(),
			fieldsGroup, time.Time{})
		require.Error(t, err)
		assert.Empty(t, nextTime)
		assert.Len(t, values, 0)
	})

	t.Run("When there are a few entries", func(t *testing.T) {
		const expectedNumberOfErrors = int64(43)

		// Samples are injected oldest first: the value at index i is stamped
		// (len-i) seconds in the past, i.e. 43 at -5s, then 4, 3, 2, 1 at
		// -4s through -1s.
		expectedValues := []int64{expectedNumberOfErrors, 4, 3, 2, 1}

		t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_XID_ERRORS", gpu)
		for i, value := range expectedValues {
			age := time.Duration(len(expectedValues)-i) * time.Second
			err := InjectFieldValue(gpu,
				DCGM_FI_DEV_XID_ERRORS,
				DCGM_FT_INT64,
				0,
				time.Now().Add(-age).UnixMicro(),
				value,
			)
			require.NoError(t, err)
		}

		// Force an update of the fields so that we can fetch initial values.
		err := UpdateAllFields()
		assert.NoError(t, err)

		values, nextTime, err := GetValuesSince(GroupAllGPUs(), fieldsGroup, time.Time{})
		require.NoError(t, err)
		assert.Greater(t, nextTime, time.Time{})
		// require rather than assert: indexing below would panic on a short slice.
		require.Len(t, values, len(expectedValues))

		for i, want := range expectedValues {
			assert.Equal(t, FE_GPU, values[i].EntityGroupId)
			assert.Equal(t, gpu, values[i].EntityId)
			assert.Equal(t, uint(DCGM_FI_DEV_XID_ERRORS), values[i].FieldId)
			assert.Equal(t, want, values[i].Int64())
		}
	})
}
Loading

0 comments on commit 2e5142b

Please sign in to comment.