@@ -113,18 +113,17 @@ func fallback() uint64 {
113113 return getEnvVarNumber (memoryOverrideEnv )
114114}
115115
116- // getMemoryAmount reads the GPU memory amount from the system .
117- func (l * labeler ) getMemoryAmount (gpuName string ) uint64 {
116+ // getTileMemoryAmount reads the total GPU memory amount from the GPU tiles and returns it together with the tile count.
117+ func (l * labeler ) getTileMemoryAmount (gpuName string ) ( mem , numTiles uint64 ) {
118118 reserved := getEnvVarNumber (memoryReservedEnv )
119119 filePath := filepath .Join (l .sysfsDRMDir , gpuName , "gt/gt*/addr_range" )
120120
121121 files , err := filepath .Glob (filePath )
122122 if err != nil {
123123 klog .V (4 ).Info ("Can't read sysfs folder" , err )
124- return fallback ()
124+ return fallback (), 1
125125 }
126126
127- mem := uint64 (0 )
128127 for _ , fileName := range files {
129128 dat , err := ioutil .ReadFile (fileName )
130129 if err != nil {
@@ -138,14 +137,15 @@ func (l *labeler) getMemoryAmount(gpuName string) uint64 {
138137 continue
139138 }
140139
140+ numTiles ++
141141 mem += n
142142 }
143143
144144 if mem == 0 {
145- return fallback ()
145+ return fallback (), 1
146146 }
147147
148- return mem - reserved
148+ return mem - reserved , numTiles
149149}
150150
151151// addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value.
@@ -159,7 +159,7 @@ func (lm labelMap) addNumericLabel(labelName string, valueToAdd int64) {
159159}
160160
161161// createCapabilityLabels creates labels from the gpu capability file under debugfs.
162- func (l * labeler ) createCapabilityLabels (cardNum string ) {
162+ func (l * labeler ) createCapabilityLabels (cardNum string , numTiles uint64 ) {
163163 // try to read the capabilities from the i915_capabilities file
164164 file , err := os .Open (filepath .Join (l .debugfsDRIDir , cardNum , "i915_capabilities" ))
165165 if err != nil {
@@ -172,6 +172,7 @@ func (l *labeler) createCapabilityLabels(cardNum string) {
172172 searchStringActionMap := map [string ]func (string ){
173173 "platform: " : func (platformName string ) {
174174 l .labels .addNumericLabel (labelNamespace + "platform_" + platformName + ".count" , 1 )
175+ l .labels [labelNamespace + "platform_" + platformName + ".tiles" ] = strconv .FormatInt (int64 (numTiles ), 10 )
175176 l .labels [labelNamespace + "platform_" + platformName + ".present" ] = "true"
176177 },
177178 "gen: " : func (genName string ) {
@@ -212,11 +213,13 @@ func (l *labeler) createLabels() error {
212213 return errors .Wrap (err , "gpu name parsing error" )
213214 }
214215
216+ // read the memory amount to find a proper max allocation value
217+ memoryAmount , numTiles := l .getTileMemoryAmount (gpuName )
218+
215219 // try to add capability labels
216- l .createCapabilityLabels (gpuNum )
220+ l .createCapabilityLabels (gpuNum , numTiles )
217221
218- // read the memory amount to find a proper max allocation value
219- l .labels .addNumericLabel (labelNamespace + "memory.max" , int64 (l .getMemoryAmount (gpuName )))
222+ l .labels .addNumericLabel (labelNamespace + "memory.max" , int64 (memoryAmount ))
220223 }
221224 gpuCount := len (gpuNameList )
222225 // add gpu list label (example: "card0.card1.card2")
0 commit comments