Skip to content

Commit 35e4bff

Browse files
authored
Merge pull request #640 from uniemimu/froperator
gpu resource manager operator parts
2 parents a08cbe9 + efe6efd commit 35e4bff

File tree

12 files changed

+390
-47
lines changed

12 files changed

+390
-47
lines changed

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ generate:
9191
$(CONTROLLER_GEN) webhook \
9292
paths="./pkg/fpgacontroller/..." \
9393
output:webhook:artifacts:config=deployments/fpga_admissionwebhook/webhook
94+
$(CONTROLLER_GEN) rbac:roleName=gpu-manager-role paths="./cmd/gpu_plugin/..." output:dir=deployments/operator/rbac
95+
cp deployments/operator/rbac/role.yaml deployments/operator/rbac/gpu_manager_role.yaml
9496
$(CONTROLLER_GEN) rbac:roleName=manager-role paths="./pkg/..." output:dir=deployments/operator/rbac
9597
$(CONTROLLER_GEN) rbac:roleName=manager-role paths="./pkg/fpgacontroller/..." output:dir=deployments/fpga_admissionwebhook/rbac
9698

deployments/operator/crd/bases/deviceplugin.intel.com_gpudeviceplugins.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ spec:
6868
description: NodeSelector provides a simple way to constrain device
6969
plugin pods to nodes with particular labels.
7070
type: object
71+
resourceManager:
72+
description: ResourceManager handles the fractional resource management for multi-GPU nodes
73+
type: boolean
7174
sharedDevNum:
7275
description: SharedDevNum is a number of containers that can share
7376
the same GPU device.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
2+
---
3+
apiVersion: rbac.authorization.k8s.io/v1
4+
kind: ClusterRole
5+
metadata:
6+
creationTimestamp: null
7+
name: gpu-manager-role
8+
rules:
9+
- apiGroups:
10+
- ""
11+
resources:
12+
- pods
13+
verbs:
14+
- list

deployments/operator/rbac/kustomization.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ resources:
33
- role_binding.yaml
44
- leader_election_role.yaml
55
- leader_election_role_binding.yaml
6+
- gpu_manager_role.yaml
67
# Comment the following 4 lines if you want to disable
78
# the auth proxy (https://github.com/brancz/kube-rbac-proxy)
89
# which protects your /metrics endpoint.

deployments/operator/rbac/role.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,16 @@ rules:
1414
- get
1515
- list
1616
- watch
17+
- apiGroups:
18+
- ""
19+
resources:
20+
- serviceaccounts
21+
verbs:
22+
- create
23+
- delete
24+
- get
25+
- list
26+
- watch
1727
- apiGroups:
1828
- apps
1929
resources:
@@ -150,3 +160,13 @@ rules:
150160
- get
151161
- list
152162
- watch
163+
- apiGroups:
164+
- rbac.authorization.k8s.io
165+
resources:
166+
- clusterrolebindings
167+
verbs:
168+
- create
169+
- delete
170+
- get
171+
- list
172+
- watch

pkg/apis/deviceplugin/v1/gpudeviceplugin_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ type GpuDevicePluginSpec struct {
3535
// +kubebuilder:validation:Minimum=1
3636
SharedDevNum int `json:"sharedDevNum,omitempty"`
3737

38+
// ResourceManager handles the fractional resource management for multi-GPU nodes
39+
ResourceManager bool `json:"resourceManager,omitempty"`
40+
3841
// LogLevel sets the plugin's log level.
3942
// +kubebuilder:validation:Minimum=0
4043
LogLevel int `json:"logLevel,omitempty"`

pkg/controllers/dsa/controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ func SetupReconciler(mgr ctrl.Manager, namespace string, withWebhook bool) error
5757
}
5858

5959
// controller is the DSA device plugin controller state: it embeds
// controllers.DefaultServiceAccountFactory and keeps the runtime scheme and
// the namespace (ns) it operates in.
// NOTE(review): the embedded factory presumably supplies default service
// account / cluster role binding constructors so this controller does not
// have to define its own — confirm against the controllers package.
type controller struct {
60+
controllers.DefaultServiceAccountFactory
6061
scheme *runtime.Scheme
6162
ns string
6263
}

pkg/controllers/fpga/controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ func SetupReconciler(mgr ctrl.Manager, namespace string, withWebhook bool) error
5757
}
5858

5959
// controller is the FPGA device plugin controller state: it embeds
// controllers.DefaultServiceAccountFactory and keeps the runtime scheme and
// the namespace (ns) it operates in.
// NOTE(review): the embedded factory presumably supplies default service
// account / cluster role binding constructors so this controller does not
// have to define its own — confirm against the controllers package.
type controller struct {
60+
controllers.DefaultServiceAccountFactory
6061
scheme *runtime.Scheme
6162
ns string
6263
}

pkg/controllers/gpu/controller.go

Lines changed: 111 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
apps "k8s.io/api/apps/v1"
2525
v1 "k8s.io/api/core/v1"
26+
rbacv1 "k8s.io/api/rbac/v1"
2627
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2728
"k8s.io/apimachinery/pkg/runtime"
2829
"k8s.io/client-go/tools/reference"
@@ -35,8 +36,9 @@ import (
3536
)
3637

3738
const (
38-
ownerKey = ".metadata.controller.gpu"
39-
appLabel = "intel-gpu-plugin"
39+
ownerKey = ".metadata.controller.gpu"
40+
appLabel = "intel-gpu-plugin"
41+
serviceAccountName = "gpu-manager-sa"
4042
)
4143

4244
// +kubebuilder:rbac:groups=deviceplugin.intel.com,resources=gpudeviceplugins,verbs=get;list;watch;create;update;patch;delete
@@ -74,6 +76,46 @@ func (c *controller) GetTotalObjectCount(ctx context.Context, clnt client.Client
7476
return len(list.Items), nil
7577
}
7678

79+
func (c *controller) NewServiceAccount(rawObj client.Object) *v1.ServiceAccount {
80+
devicePlugin := rawObj.(*devicepluginv1.GpuDevicePlugin)
81+
if devicePlugin.Spec.ResourceManager {
82+
sa := v1.ServiceAccount{
83+
ObjectMeta: metav1.ObjectMeta{
84+
Name: "gpu-manager-sa",
85+
Namespace: c.ns,
86+
},
87+
}
88+
return &sa
89+
}
90+
return nil
91+
}
92+
93+
func (c *controller) NewClusterRoleBinding(rawObj client.Object) *rbacv1.ClusterRoleBinding {
94+
devicePlugin := rawObj.(*devicepluginv1.GpuDevicePlugin)
95+
if devicePlugin.Spec.ResourceManager {
96+
rb := rbacv1.ClusterRoleBinding{
97+
ObjectMeta: metav1.ObjectMeta{
98+
Name: "gpu-manager-rolebinding",
99+
Namespace: c.ns,
100+
},
101+
Subjects: []rbacv1.Subject{
102+
{
103+
Kind: "ServiceAccount",
104+
Name: "gpu-manager-sa",
105+
Namespace: c.ns,
106+
},
107+
},
108+
RoleRef: rbacv1.RoleRef{
109+
Kind: "ClusterRole",
110+
Name: "inteldeviceplugins-gpu-manager-role",
111+
APIGroup: "rbac.authorization.k8s.io",
112+
},
113+
}
114+
return &rb
115+
}
116+
return nil
117+
}
118+
77119
func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet {
78120
devicePlugin := rawObj.(*devicepluginv1.GpuDevicePlugin)
79121

@@ -183,9 +225,46 @@ func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet {
183225
if devicePlugin.Spec.InitImage != "" {
184226
setInitContainer(&daemonSet.Spec.Template.Spec, devicePlugin.Spec.InitImage)
185227
}
228+
// add service account if resource manager is enabled
229+
if devicePlugin.Spec.ResourceManager {
230+
daemonSet.Spec.Template.Spec.ServiceAccountName = serviceAccountName
231+
addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", v1.HostPathDirectory)
232+
addVolumeMountIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources")
233+
}
186234
return &daemonSet
187235
}
188236

237+
func addVolumeMountIfMissing(spec *v1.PodSpec, name, mountPath string) {
238+
for _, mount := range spec.Containers[0].VolumeMounts {
239+
if mount.Name == name {
240+
return
241+
}
242+
}
243+
244+
spec.Containers[0].VolumeMounts = append(spec.Containers[0].VolumeMounts, v1.VolumeMount{
245+
Name: name,
246+
MountPath: mountPath,
247+
})
248+
}
249+
250+
func addVolumeIfMissing(spec *v1.PodSpec, name, path string, hpType v1.HostPathType) {
251+
for _, vol := range spec.Volumes {
252+
if vol.Name == name {
253+
return
254+
}
255+
}
256+
257+
spec.Volumes = append(spec.Volumes, v1.Volume{
258+
Name: name,
259+
VolumeSource: v1.VolumeSource{
260+
HostPath: &v1.HostPathVolumeSource{
261+
Path: path,
262+
Type: &hpType,
263+
},
264+
},
265+
})
266+
}
267+
189268
func setInitContainer(spec *v1.PodSpec, imageName string) {
190269
yes := true
191270
spec.InitContainers = []v1.Container{
@@ -203,25 +282,7 @@ func setInitContainer(spec *v1.PodSpec, imageName string) {
203282
},
204283
},
205284
}}
206-
directoryOrCreate := v1.HostPathDirectoryOrCreate
207-
missing := true
208-
for _, vol := range spec.Volumes {
209-
if vol.Name == "nfd-source-hooks" {
210-
missing = false
211-
break
212-
}
213-
}
214-
if missing {
215-
spec.Volumes = append(spec.Volumes, v1.Volume{
216-
Name: "nfd-source-hooks",
217-
VolumeSource: v1.VolumeSource{
218-
HostPath: &v1.HostPathVolumeSource{
219-
Path: "/etc/kubernetes/node-feature-discovery/source.d/",
220-
Type: &directoryOrCreate,
221-
},
222-
},
223-
})
224-
}
285+
addVolumeIfMissing(spec, "nfd-source-hooks", "/etc/kubernetes/node-feature-discovery/source.d/", v1.HostPathDirectoryOrCreate)
225286
}
226287

227288
func removeVolume(volumes []v1.Volume, name string) []v1.Volume {
@@ -233,6 +294,15 @@ func removeVolume(volumes []v1.Volume, name string) []v1.Volume {
233294
}
234295
return newVolumes
235296
}
297+
func removeVolumeMount(volumeMounts []v1.VolumeMount, name string) []v1.VolumeMount {
298+
newVolumeMounts := []v1.VolumeMount{}
299+
for _, volume := range volumeMounts {
300+
if volume.Name != name {
301+
newVolumeMounts = append(newVolumeMounts, volume)
302+
}
303+
}
304+
return newVolumeMounts
305+
}
236306

237307
func (c *controller) UpdateDaemonSet(rawObj client.Object, ds *apps.DaemonSet) (updated bool) {
238308
dp := rawObj.(*devicepluginv1.GpuDevicePlugin)
@@ -269,6 +339,22 @@ func (c *controller) UpdateDaemonSet(rawObj client.Object, ds *apps.DaemonSet) (
269339
updated = true
270340
}
271341

342+
newServiceAccountName := "default"
343+
if dp.Spec.ResourceManager {
344+
newServiceAccountName = serviceAccountName
345+
}
346+
if ds.Spec.Template.Spec.ServiceAccountName != newServiceAccountName {
347+
ds.Spec.Template.Spec.ServiceAccountName = newServiceAccountName
348+
if dp.Spec.ResourceManager {
349+
addVolumeIfMissing(&ds.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", v1.HostPathDirectory)
350+
addVolumeMountIfMissing(&ds.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources")
351+
} else {
352+
ds.Spec.Template.Spec.Volumes = removeVolume(ds.Spec.Template.Spec.Volumes, "podresources")
353+
ds.Spec.Template.Spec.Containers[0].VolumeMounts = removeVolumeMount(ds.Spec.Template.Spec.Containers[0].VolumeMounts, "podresources")
354+
}
355+
updated = true
356+
}
357+
272358
return updated
273359
}
274360

@@ -313,5 +399,9 @@ func getPodArgs(gdp *devicepluginv1.GpuDevicePlugin) []string {
313399
args = append(args, "-shared-dev-num", "1")
314400
}
315401

402+
if gdp.Spec.ResourceManager {
403+
args = append(args, "-resource-manager")
404+
}
405+
316406
return args
317407
}

pkg/controllers/qat/controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ func SetupReconciler(mgr ctrl.Manager, namespace string, withWebhook bool) error
5757
}
5858

5959
// controller is the QAT device plugin controller state: it embeds
// controllers.DefaultServiceAccountFactory and keeps the runtime scheme and
// the namespace (ns) it operates in.
// NOTE(review): the embedded factory presumably supplies default service
// account / cluster role binding constructors so this controller does not
// have to define its own — confirm against the controllers package.
type controller struct {
60+
controllers.DefaultServiceAccountFactory
6061
scheme *runtime.Scheme
6162
ns string
6263
}

0 commit comments

Comments
 (0)