Skip to content

Commit 1b7a35c

Browse files
authored
Merge pull request #596 from uniemimu/tiles
add tile count label
2 parents 9869944 + f89b61f commit 1b7a35c

File tree

2 files changed

+18
-10
lines changed

2 files changed

+18
-10
lines changed

cmd/gpu_nfdhook/labeler.go

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -113,18 +113,17 @@ func fallback() uint64 {
113113
return getEnvVarNumber(memoryOverrideEnv)
114114
}
115115

116-
// getMemoryAmount reads the GPU memory amount from the system.
117-
func (l *labeler) getMemoryAmount(gpuName string) uint64 {
116+
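// getTileMemoryAmount sums the memory amounts of the GPU's tiles and returns the total (minus the reserved amount) together with the tile count. If no memory amount can be determined, it returns the fallback amount and a tile count of 1.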
117+
func (l *labeler) getTileMemoryAmount(gpuName string) (mem, numTiles uint64) {
118118
reserved := getEnvVarNumber(memoryReservedEnv)
119119
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*/addr_range")
120120

121121
files, err := filepath.Glob(filePath)
122122
if err != nil {
123123
klog.V(4).Info("Can't read sysfs folder", err)
124-
return fallback()
124+
return fallback(), 1
125125
}
126126

127-
mem := uint64(0)
128127
for _, fileName := range files {
129128
dat, err := ioutil.ReadFile(fileName)
130129
if err != nil {
@@ -138,14 +137,15 @@ func (l *labeler) getMemoryAmount(gpuName string) uint64 {
138137
continue
139138
}
140139

140+
numTiles++
141141
mem += n
142142
}
143143

144144
if mem == 0 {
145-
return fallback()
145+
return fallback(), 1
146146
}
147147

148-
return mem - reserved
148+
return mem - reserved, numTiles
149149
}
150150

151151
// addNumericLabel creates a new label if one doesn't exist; otherwise, it adds the new value to the existing value.
@@ -159,7 +159,7 @@ func (lm labelMap) addNumericLabel(labelName string, valueToAdd int64) {
159159
}
160160

161161
// createCapabilityLabels creates labels from the gpu capability file under debugfs.
162-
func (l *labeler) createCapabilityLabels(cardNum string) {
162+
func (l *labeler) createCapabilityLabels(cardNum string, numTiles uint64) {
163163
// try to read the capabilities from the i915_capabilities file
164164
file, err := os.Open(filepath.Join(l.debugfsDRIDir, cardNum, "i915_capabilities"))
165165
if err != nil {
@@ -172,6 +172,7 @@ func (l *labeler) createCapabilityLabels(cardNum string) {
172172
searchStringActionMap := map[string]func(string){
173173
"platform: ": func(platformName string) {
174174
l.labels.addNumericLabel(labelNamespace+"platform_"+platformName+".count", 1)
175+
l.labels[labelNamespace+"platform_"+platformName+".tiles"] = strconv.FormatInt(int64(numTiles), 10)
175176
l.labels[labelNamespace+"platform_"+platformName+".present"] = "true"
176177
},
177178
"gen: ": func(genName string) {
@@ -212,11 +213,13 @@ func (l *labeler) createLabels() error {
212213
return errors.Wrap(err, "gpu name parsing error")
213214
}
214215

216+
// read the memory amount (to find a proper max allocation value) and the tile count
217+
memoryAmount, numTiles := l.getTileMemoryAmount(gpuName)
218+
215219
// try to add capability labels
216-
l.createCapabilityLabels(gpuNum)
220+
l.createCapabilityLabels(gpuNum, numTiles)
217221

218-
// read the memory amount to find a proper max allocation value
219-
l.labels.addNumericLabel(labelNamespace+"memory.max", int64(l.getMemoryAmount(gpuName)))
222+
l.labels.addNumericLabel(labelNamespace+"memory.max", int64(memoryAmount))
220223
}
221224
gpuCount := len(gpuNameList)
222225
// add gpu list label (example: "card0.card1.card2")

cmd/gpu_nfdhook/labeler_test.go

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,7 @@ func getTestCases() []testcase {
5959
"gpu.intel.com/memory.max": "8086",
6060
"gpu.intel.com/platform_new.count": "1",
6161
"gpu.intel.com/platform_new.present": "true",
62+
"gpu.intel.com/platform_new.tiles": "1",
6263
"gpu.intel.com/platform_gen": "9",
6364
"gpu.intel.com/cards": "card0",
6465
},
@@ -87,6 +88,7 @@ func getTestCases() []testcase {
8788
"gpu.intel.com/memory.max": "8088",
8889
"gpu.intel.com/platform_new.count": "1",
8990
"gpu.intel.com/platform_new.present": "true",
91+
"gpu.intel.com/platform_new.tiles": "2",
9092
"gpu.intel.com/platform_gen": "9",
9193
"gpu.intel.com/cards": "card0",
9294
},
@@ -114,6 +116,7 @@ func getTestCases() []testcase {
114116
"gpu.intel.com/memory.max": "8000",
115117
"gpu.intel.com/platform_new.count": "1",
116118
"gpu.intel.com/platform_new.present": "true",
119+
"gpu.intel.com/platform_new.tiles": "1",
117120
"gpu.intel.com/platform_gen": "9",
118121
"gpu.intel.com/cards": "card0",
119122
},
@@ -138,6 +141,7 @@ func getTestCases() []testcase {
138141
"gpu.intel.com/memory.max": "16000000000",
139142
"gpu.intel.com/platform_new.count": "1",
140143
"gpu.intel.com/platform_new.present": "true",
144+
"gpu.intel.com/platform_new.tiles": "1",
141145
"gpu.intel.com/platform_gen": "9",
142146
"gpu.intel.com/cards": "card0",
143147
},
@@ -161,6 +165,7 @@ func getTestCases() []testcase {
161165
"gpu.intel.com/memory.max": "16000000000",
162166
"gpu.intel.com/platform_new.count": "1",
163167
"gpu.intel.com/platform_new.present": "true",
168+
"gpu.intel.com/platform_new.tiles": "1",
164169
"gpu.intel.com/cards": "card0",
165170
},
166171
},

0 commit comments

Comments
 (0)