Skip to content

Commit 6429002

Browse files
committed
gpu nfdhook: new memory amount reading logic
This changes the memory reading to be done through the lmem_total_bytes file instead of the addr_range file.
Signed-off-by: Ukri Niemimuukko <ukri.niemimuukko@intel.com>
1 parent 32e8c59 commit 6429002

File tree

2 files changed

+45
-46
lines changed

2 files changed

+45
-46
lines changed

cmd/gpu_nfdhook/labeler.go

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -118,39 +118,37 @@ func fallback() uint64 {
118118
return getEnvVarNumber(memoryOverrideEnv)
119119
}
120120

121-
// getTileMemoryAmount reads the total GPU memory amount from the GPU tiles and returns it and the tile count.
122-
func (l *labeler) getTileMemoryAmount(gpuName string) (mem, numTiles uint64) {
121+
func (l *labeler) getMemoryAmount(gpuName string, numTiles uint64) uint64 {
123122
reserved := getEnvVarNumber(memoryReservedEnv)
124-
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*/addr_range")
125123

126-
files, err := filepath.Glob(filePath)
124+
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "lmem_total_bytes")
125+
126+
dat, err := os.ReadFile(filePath)
127127
if err != nil {
128-
klog.V(4).Info("Can't read sysfs folder", err)
129-
return fallback(), 1
128+
klog.Warning("Can't read file: ", err)
129+
return fallback()
130130
}
131131

132-
for _, fileName := range files {
133-
dat, err := os.ReadFile(fileName)
134-
if err != nil {
135-
klog.Warning("Skipping. Can't read file: ", err)
136-
continue
137-
}
132+
totalPerTile, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64)
133+
if err != nil {
134+
klog.Warning("Can't convert lmem_total_bytes: ", err)
135+
return fallback()
136+
}
138137

139-
n, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64)
140-
if err != nil {
141-
klog.Warning("Skipping. Can't convert addr_range: ", err)
142-
continue
143-
}
138+
return totalPerTile*numTiles - reserved
139+
}
144140

145-
numTiles++
146-
mem += n
147-
}
141+
// getTileCount reads the tile count.
142+
func (l *labeler) getTileCount(gpuName string) (numTiles uint64) {
143+
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*")
144+
145+
files, _ := filepath.Glob(filePath)
148146

149-
if mem == 0 {
150-
return fallback(), 1
147+
if len(files) == 0 {
148+
return 1
151149
}
152150

153-
return mem - reserved, numTiles
151+
return uint64(len(files))
154152
}
155153

156154
// addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value.
@@ -218,8 +216,11 @@ func (l *labeler) createLabels() error {
218216
return errors.Wrap(err, "gpu name parsing error")
219217
}
220218

221-
// read the memory amount to find a proper max allocation value
222-
memoryAmount, numTiles := l.getTileMemoryAmount(gpuName)
219+
// read the tile count
220+
numTiles := l.getTileCount(gpuName)
221+
222+
// read memory amount
223+
memoryAmount := l.getMemoryAmount(gpuName, numTiles)
223224

224225
// try to add capability labels
225226
l.createCapabilityLabels(gpuNum, numTiles)

cmd/gpu_nfdhook/labeler_test.go

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,10 @@ func getTestCases() []testcase {
4242
"card0/gt/gt0",
4343
},
4444
sysfsfiles: map[string][]byte{
45-
"card0/device/vendor": []byte("0x8086"),
46-
"card0/gt/gt0/addr_range": []byte("8086"),
45+
"card0/device/vendor": []byte("0x8086"),
46+
"card0/lmem_total_bytes": []byte("8086"),
4747
},
48-
name: "successful labeling via gt0/addr_range",
48+
name: "successful labeling via lmem_total_bytes",
4949
memoryOverride: 16000000000,
5050
capabilityFile: map[string][]byte{
5151
"0/i915_capabilities": []byte(
@@ -88,11 +88,10 @@ func getTestCases() []testcase {
8888
"card0/gt/gt1",
8989
},
9090
sysfsfiles: map[string][]byte{
91-
"card0/device/vendor": []byte("0x8086"),
92-
"card0/gt/gt0/addr_range": []byte("8086"),
93-
"card0/gt/gt1/addr_range": []byte("2"),
91+
"card0/device/vendor": []byte("0x8086"),
92+
"card0/lmem_total_bytes": []byte("8000"),
9493
},
95-
name: "successful labeling via gt0/addr_range and gt1/addr_range",
94+
name: "successful labeling via card0/lmem_total_bytes and two tiles",
9695
memoryOverride: 16000000000,
9796
capabilityFile: map[string][]byte{
9897
"0/i915_capabilities": []byte(
@@ -102,7 +101,7 @@ func getTestCases() []testcase {
102101
expectedRetval: nil,
103102
expectedLabels: labelMap{
104103
"gpu.intel.com/millicores": "1000",
105-
"gpu.intel.com/memory.max": "8088",
104+
"gpu.intel.com/memory.max": "16000",
106105
"gpu.intel.com/platform_new.count": "1",
107106
"gpu.intel.com/platform_new.present": "true",
108107
"gpu.intel.com/platform_new.tiles": "2",
@@ -116,10 +115,10 @@ func getTestCases() []testcase {
116115
"card0/gt/gt0",
117116
},
118117
sysfsfiles: map[string][]byte{
119-
"card0/device/vendor": []byte("0x8086"),
120-
"card0/gt/gt0/addr_range": []byte("8086"),
118+
"card0/device/vendor": []byte("0x8086"),
119+
"card0/lmem_total_bytes": []byte("8086"),
121120
},
122-
name: "successful labeling via gt0/addr_range and reserved memory",
121+
name: "successful labeling via lmem_total_bytes and reserved memory",
123122
memoryOverride: 16000000000,
124123
memoryReserved: 86,
125124
capabilityFile: map[string][]byte{
@@ -242,20 +241,25 @@ func TestLabeling(t *testing.T) {
242241
testcases := getTestCases()
243242

244243
for _, tc := range testcases {
244+
subroot, err := os.MkdirTemp(root, "tc")
245+
if err != nil {
246+
t.Fatalf("can't create temporary subroot directory: %+v", err)
247+
}
248+
245249
tc := tc
246250
t.Run(tc.name, func(t *testing.T) {
247-
err := os.MkdirAll(path.Join(root, "0"), 0750)
251+
err := os.MkdirAll(path.Join(subroot, "0"), 0750)
248252
if err != nil {
249253
t.Fatalf("couldn't create dir: %s", err.Error())
250254
}
251-
sysfs := path.Join(root, sysfsDirectory)
255+
sysfs := path.Join(subroot, sysfsDirectory)
252256

253-
tc.createFiles(t, sysfs, root)
257+
tc.createFiles(t, sysfs, subroot)
254258

255259
os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10))
256260
os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10))
257261

258-
labeler := newLabeler(sysfs, root)
262+
labeler := newLabeler(sysfs, subroot)
259263
err = labeler.createLabels()
260264
if err != nil && tc.expectedRetval == nil ||
261265
err == nil && tc.expectedRetval != nil {
@@ -264,12 +268,6 @@ func TestLabeling(t *testing.T) {
264268
if tc.expectedRetval == nil && !reflect.DeepEqual(labeler.labels, tc.expectedLabels) {
265269
t.Errorf("test %v label mismatch with expectation:\n%v\n%v\n", tc.name, labeler.labels, tc.expectedLabels)
266270
}
267-
for filename := range tc.capabilityFile {
268-
os.Remove(path.Join(root, filename))
269-
}
270-
for filename := range tc.sysfsfiles {
271-
os.Remove(path.Join(sysfs, filename))
272-
}
273271
})
274272
}
275273
}

0 commit comments

Comments
 (0)