diff --git a/cmd/machine-config-controller/start.go b/cmd/machine-config-controller/start.go index f22c4dfee0..faa7c3f791 100644 --- a/cmd/machine-config-controller/start.go +++ b/cmd/machine-config-controller/start.go @@ -9,13 +9,13 @@ import ( features "github.com/openshift/api/features" "github.com/openshift/machine-config-operator/cmd/common" "github.com/openshift/machine-config-operator/internal/clients" + bootimagecontroller "github.com/openshift/machine-config-operator/pkg/controller/bootimage" certrotationcontroller "github.com/openshift/machine-config-operator/pkg/controller/certrotation" ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" containerruntimeconfig "github.com/openshift/machine-config-operator/pkg/controller/container-runtime-config" "github.com/openshift/machine-config-operator/pkg/controller/drain" "github.com/openshift/machine-config-operator/pkg/controller/internalreleaseimage" kubeletconfig "github.com/openshift/machine-config-operator/pkg/controller/kubelet-config" - machinesetbootimage "github.com/openshift/machine-config-operator/pkg/controller/machine-set-boot-image" "github.com/openshift/machine-config-operator/pkg/controller/node" "github.com/openshift/machine-config-operator/pkg/controller/pinnedimageset" "github.com/openshift/machine-config-operator/pkg/controller/render" @@ -144,7 +144,7 @@ func runStartCmd(_ *cobra.Command, _ []string) { } if ctrlcommon.IsBootImageControllerRequired(ctrlctx) { - machineSetBootImage := machinesetbootimage.New( + bootImageController := bootimagecontroller.New( ctrlctx.ClientBuilder.KubeClientOrDie("machine-set-boot-image-controller"), ctrlctx.ClientBuilder.MachineClientOrDie("machine-set-boot-image-controller"), ctrlctx.KubeNamespacedInformerFactory.Core().V1().ConfigMaps(), @@ -153,9 +153,10 @@ func runStartCmd(_ *cobra.Command, _ []string) { ctrlctx.ConfigInformerFactory.Config().V1().Infrastructures(), ctrlctx.ClientBuilder.OperatorClientOrDie(componentName), ctrlctx.OperatorInformerFactory.Operator().V1().MachineConfigurations(), + ctrlctx.ConfigInformerFactory.Config().V1().ClusterVersions(), ctrlctx.FeatureGatesHandler, ) - go machineSetBootImage.Run(ctrlctx.Stop) + go bootImageController.Run(ctrlctx.Stop) // start the informers again to enable feature gated types. // see comments in SharedInformerFactory interface. ctrlctx.KubeNamespacedInformerFactory.Start(ctrlctx.Stop) diff --git a/devex/cmd/mco-sanitize/redactor.go b/devex/cmd/mco-sanitize/redactor.go index 43cf2e29e2..12645082ce 100644 --- a/devex/cmd/mco-sanitize/redactor.go +++ b/devex/cmd/mco-sanitize/redactor.go @@ -155,18 +155,17 @@ func recursiveWalk(node *visitorNode, path []string) (bool, error) { } } return changed, nil - } else { - // The path provides the index to pick. Transverse only that index - pathIndex, err := strconv.Atoi(key) - if err != nil { - return false, errors.New("redact path uses array indexing at a path level that is not an array") - } - newNode := node.newArrayChild(pathIndex) - if newNode == nil { - return false, nil - } - return recursiveWalk(newNode, path[1:]) } + // The path provides the index to pick. Transverse only that index + pathIndex, err := strconv.Atoi(key) + if err != nil { + return false, errors.New("redact path uses array indexing at a path level that is not an array") + } + newNode := node.newArrayChild(pathIndex) + if newNode == nil { + return false, nil + } + return recursiveWalk(newNode, path[1:]) case map[string]interface{}: // Map case. newObjectChild returns nil if the key doesn't exist which should make us // break the transversing path diff --git a/pkg/controller/machine-set-boot-image/ami.go b/pkg/controller/bootimage/ami.go similarity index 99% rename from pkg/controller/machine-set-boot-image/ami.go rename to pkg/controller/bootimage/ami.go index 78759b97ed..51c6786a3d 100644 --- a/pkg/controller/machine-set-boot-image/ami.go +++ b/pkg/controller/bootimage/ami.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "k8s.io/apimachinery/pkg/util/sets" diff --git a/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller.go b/pkg/controller/bootimage/boot_image_controller.go similarity index 85% rename from pkg/controller/machine-set-boot-image/machine_set_boot_image_controller.go rename to pkg/controller/bootimage/boot_image_controller.go index f69c029474..1a68e0c9f9 100644 --- a/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller.go +++ b/pkg/controller/bootimage/boot_image_controller.go @@ -1,10 +1,10 @@ -package machineset +package bootimage import ( "context" "fmt" "reflect" - "sync" + "time" features "github.com/openshift/api/features" opv1 "github.com/openshift/api/operator/v1" @@ -14,6 +14,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" coreinformersv1 "k8s.io/client-go/informers/core/v1" clientset "k8s.io/client-go/kubernetes" corev1client "k8s.io/client-go/kubernetes/typed/core/v1" @@ -21,6 +22,7 @@ import ( "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/retry" + "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" "k8s.io/kubectl/pkg/scheme" @@ -44,17 +46,23 @@ type Controller struct { mcopClient mcopclientset.Interface eventRecorder record.EventRecorder + syncHandler func(event string) error + mcoCmLister corelisterv1.ConfigMapLister mapiMachineSetLister machinelistersv1beta1.MachineSetLister cpmsLister machinelistersv1.ControlPlaneMachineSetLister infraLister configlistersv1.InfrastructureLister mcopLister mcoplistersv1.MachineConfigurationLister + clusterVersionLister configlistersv1.ClusterVersionLister mcoCmListerSynced cache.InformerSynced mapiMachineSetListerSynced cache.InformerSynced cpmsListerSynced cache.InformerSynced infraListerSynced cache.InformerSynced mcopListerSynced cache.InformerSynced + clusterVersionListerSynced cache.InformerSynced + + queue workqueue.TypedRateLimitingInterface[string] mapiStats MachineResourceStats cpmsStats MachineResourceStats @@ -62,9 +70,6 @@ type Controller struct { capiMachineDeploymentStats MachineResourceStats mapiBootImageState map[string]BootImageState cpmsBootImageState map[string]BootImageState - conditionMutex sync.Mutex - mapiSyncMutex sync.Mutex - cpmsSyncMutex sync.Mutex fgHandler ctrlcommon.FeatureGatesHandler } @@ -78,6 +83,7 @@ type MachineResourceStats struct { // State structure uses for detecting hot loops. Reset when cluster is opted // out of boot image updates. +// nolint: revive type BootImageState struct { value []byte hotLoopCount int @@ -103,6 +109,9 @@ const ( // Threshold for hot loop detection HotLoopLimit = 3 + + // maxRetries is the number of times a sync will be retried before it is dropped out of the queue. + maxRetries = 15 ) // New returns a new machine-set-boot-image controller. @@ -115,6 +124,7 @@ func New( infraInformer configinformersv1.InfrastructureInformer, mcopClient mcopclientset.Interface, mcopInformer mcopinformersv1.MachineConfigurationInformer, + clusterVersionInformer configinformersv1.ClusterVersionInformer, fgHandler ctrlcommon.FeatureGatesHandler, ) *Controller { eventBroadcaster := record.NewBroadcaster() @@ -126,19 +136,26 @@ func New( machineClient: machineClient, mcopClient: mcopClient, eventRecorder: ctrlcommon.NamespacedEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "machineconfigcontroller-machinesetbootimagecontroller"})), + queue: workqueue.NewTypedRateLimitingQueueWithConfig( + workqueue.DefaultTypedControllerRateLimiter[string](), + workqueue.TypedRateLimitingQueueConfig[string]{Name: "machineconfigcontroller-machinesetbootimagecontroller"}), } + ctrl.syncHandler = ctrl.syncAll + ctrl.mcoCmLister = mcoCmInfomer.Lister() ctrl.mapiMachineSetLister = mapiMachineSetInformer.Lister() ctrl.cpmsLister = cpmsInformer.Lister() ctrl.infraLister = infraInformer.Lister() ctrl.mcopLister = mcopInformer.Lister() + ctrl.clusterVersionLister = clusterVersionInformer.Lister() ctrl.mcoCmListerSynced = mcoCmInfomer.Informer().HasSynced ctrl.mapiMachineSetListerSynced = mapiMachineSetInformer.Informer().HasSynced ctrl.cpmsListerSynced = cpmsInformer.Informer().HasSynced ctrl.infraListerSynced = infraInformer.Informer().HasSynced ctrl.mcopListerSynced = mcopInformer.Informer().HasSynced + ctrl.clusterVersionListerSynced = clusterVersionInformer.Informer().HasSynced mapiMachineSetInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: ctrl.addMAPIMachineSet, @@ -178,17 +195,67 @@ func New( // Run executes the machine-set-boot-image controller. func (ctrl *Controller) Run(stopCh <-chan struct{}) { defer utilruntime.HandleCrash() + defer ctrl.queue.ShutDown() - if !cache.WaitForCacheSync(stopCh, ctrl.mcoCmListerSynced, ctrl.mapiMachineSetListerSynced, ctrl.infraListerSynced, ctrl.mcopListerSynced) { + if !cache.WaitForCacheSync(stopCh, ctrl.mcoCmListerSynced, ctrl.mapiMachineSetListerSynced, ctrl.infraListerSynced, ctrl.mcopListerSynced, ctrl.clusterVersionListerSynced) { return } klog.Info("Starting MachineConfigController-MachineSetBootImageController") defer klog.Info("Shutting down MachineConfigController-MachineSetBootImageController") + // This controller needs to run in single thread mode, as the work unit per sync are + // the same and shouldn't overlap each other. + go wait.Until(ctrl.worker, time.Second, stopCh) + <-stopCh } +// enqueueEvent adds a event to the work queue. +func (ctrl *Controller) enqueueEvent(event string) { + ctrl.queue.Add(event) +} + +// worker runs a worker thread that just dequeues items, processes them, and marks them done. +// It enforces that the syncHandler is never invoked concurrently with the same key. +func (ctrl *Controller) worker() { + for ctrl.processNextWorkItem() { + } +} + +// processNextWorkItem processes the next work item from the queue. +func (ctrl *Controller) processNextWorkItem() bool { + event, quit := ctrl.queue.Get() + if quit { + return false + } + defer ctrl.queue.Done(event) + + err := ctrl.syncHandler(event) + ctrl.handleErr(err, event) + + return true +} + +// handleErr checks if an error happened and makes sure we will retry later. +func (ctrl *Controller) handleErr(err error, event string) { + if err == nil { + ctrl.queue.Forget(event) + return + } + + if ctrl.queue.NumRequeues(event) < maxRetries { + klog.V(2).Infof("Error syncing boot image controller for event %v: %v", event, err) + ctrl.queue.AddRateLimited(event) + return + } + + utilruntime.HandleError(err) + klog.V(2).Infof("Dropping event %q out of the queue: %v", event, err) + ctrl.queue.Forget(event) + ctrl.queue.AddAfter(event, 1*time.Minute) +} + // addMAPIMachineSet handles the addition of a MAPI MachineSet by triggering // a reconciliation of all enrolled MAPI MachineSets. func (ctrl *Controller) addMAPIMachineSet(obj interface{}) { @@ -200,7 +267,7 @@ func (ctrl *Controller) addMAPIMachineSet(obj interface{}) { // Update/Check all machinesets instead of just this one. This prevents needing to maintain a local // store of machineset conditions. As this is using a lister, it is relatively inexpensive to do // this. - go func() { ctrl.syncMAPIMachineSets("MAPIMachinesetAdded") }() + ctrl.enqueueEvent("MAPIMachineSetAdded") } // updateMAPIMachineSet handles updates to a MAPI MachineSet by triggering @@ -223,7 +290,7 @@ func (ctrl *Controller) updateMAPIMachineSet(oldMS, newMS interface{}) { // Update all machinesets instead of just this one. This prevents needing to maintain a local // store of machineset conditions. As this is using a lister, it is relatively inexpensive to do // this. - go func() { ctrl.syncMAPIMachineSets("MAPIMachinesetUpdated") }() + ctrl.enqueueEvent("MAPIMachineSetUpdated") } // deleteMAPIMachineSet handles the deletion of a MAPI MachineSet by triggering @@ -237,7 +304,7 @@ func (ctrl *Controller) deleteMAPIMachineSet(deletedMS interface{}) { // Update all machinesets. This prevents needing to maintain a local // store of machineset conditions. As this is using a lister, it is relatively inexpensive to do // this. - go func() { ctrl.syncMAPIMachineSets("MAPIMachinesetDeleted") }() + ctrl.enqueueEvent("MAPIMachineSetDeleted") } // addControlPlaneMachineSet handles the addition of a ControlPlaneMachineSet by triggering @@ -251,7 +318,7 @@ func (ctrl *Controller) addControlPlaneMachineSet(obj interface{}) { // Update/Check all ControlPlaneMachineSets instead of just this one. This prevents needing to maintain a local // store of machineset conditions. As this is using a lister, it is relatively inexpensive to do // this. - go func() { ctrl.syncControlPlaneMachineSets("ControlPlaneMachineSetAdded") }() + ctrl.enqueueEvent("ControlPlaneMachineSetAdded") } // updateControlPlaneMachineSet handles updates to a ControlPlaneMachineSet by triggering @@ -274,21 +341,21 @@ func (ctrl *Controller) updateControlPlaneMachineSet(oldCPMS, newCPMS interface{ // Update all ControlPlaneMachineSets instead of just this one. This prevents needing to maintain a local // store of machineset conditions. As this is using a lister, it is relatively inexpensive to do // this. - go func() { ctrl.syncControlPlaneMachineSets("ControlPlaneMachineSetUpdated") }() + ctrl.enqueueEvent("ControlPlaneMachineSetUpdated") } // deleteControlPlaneMachineSet handles the deletion of a ControlPlaneMachineSet by triggering // a reconciliation of all enrolled ControlPlaneMachineSets. func (ctrl *Controller) deleteControlPlaneMachineSet(deletedCPMS interface{}) { - deletedMachineSet := deletedCPMS.(*machinev1beta1.MachineSet) + deletedMachineSet := deletedCPMS.(*machinev1.ControlPlaneMachineSet) klog.Infof("ControlPlaneMachineSet %s deleted, reconciling enrolled machineset resources", deletedMachineSet.Name) // Update all ControlPlaneMachineSets. This prevents needing to maintain a local // store of machineset conditions. As this is using a lister, it is relatively inexpensive to do // this. - go func() { ctrl.syncControlPlaneMachineSets("ControlPlaneMachineSetDeleted") }() + ctrl.enqueueEvent("ControlPlaneMachineSetDeleted") } // addConfigMap handles the addition of the boot images ConfigMap by triggering @@ -305,7 +372,7 @@ func (ctrl *Controller) addConfigMap(obj interface{}) { klog.Infof("configMap %s added, reconciling enrolled machine resources", configMap.Name) // Update all machinesets since the "golden" configmap has been added - go func() { ctrl.syncAll("BootImageConfigMapAdded") }() + ctrl.enqueueEvent("BootImageConfigMapAdded") } // updateConfigMap handles updates to the boot images ConfigMap by triggering @@ -328,7 +395,7 @@ func (ctrl *Controller) updateConfigMap(oldCM, newCM interface{}) { klog.Infof("configMap %s updated, reconciling enrolled machine resources", oldConfigMap.Name) // Update all machinesets since the "golden" configmap has been updated - go func() { ctrl.syncAll("BootImageConfigMapUpdated") }() + ctrl.enqueueEvent("BootImageConfigMapUpdated") } // deleteConfigMap handles the deletion of the boot images ConfigMap by triggering @@ -345,7 +412,7 @@ func (ctrl *Controller) deleteConfigMap(obj interface{}) { klog.Infof("configMap %s deleted, reconciling enrolled machine resources", configMap.Name) // Update all machinesets since the "golden" configmap has been deleted - go func() { ctrl.syncAll("BootImageConfigMapDeleted") }() + ctrl.enqueueEvent("BootImageConfigMapDeleted") } // addMachineConfiguration handles the addition of the cluster-level MachineConfiguration @@ -363,7 +430,7 @@ func (ctrl *Controller) addMachineConfiguration(obj interface{}) { klog.Infof("Bootimages management configuration has been added, reconciling enrolled machine resources") // Update/Check machinesets since the boot images configuration knob was updated - go func() { ctrl.syncAll("BootImageUpdateConfigurationAdded") }() + ctrl.enqueueEvent("BootImageUpdateConfigurationAdded") } // updateMachineConfiguration handles updates to the cluster-level MachineConfiguration @@ -387,7 +454,7 @@ func (ctrl *Controller) updateMachineConfiguration(oldMC, newMC interface{}) { klog.Infof("Bootimages management configuration has been updated, reconciling enrolled machine resources") // Update all machinesets since the boot images configuration knob was updated - go func() { ctrl.syncAll("BootImageUpdateConfigurationUpdated") }() + ctrl.enqueueEvent("BootImageUpdateConfigurationUpdated") } // deleteMachineConfiguration handles the deletion of the cluster-level MachineConfiguration @@ -405,14 +472,13 @@ func (ctrl *Controller) deleteMachineConfiguration(obj interface{}) { klog.Infof("Bootimages management configuration has been deleted, reconciling enrolled machine resources") // Update/Check machinesets since the boot images configuration knob was updated - go func() { ctrl.syncAll("BootImageUpdateConfigurationDeleted") }() + ctrl.enqueueEvent("BootImageUpdateConfigurationDeleted") } // updateConditions updates the boot image update conditions on the MachineConfiguration status // based on the current state of machine resource reconciliation. func (ctrl *Controller) updateConditions(newReason string, syncError error, targetConditionType string) { - ctrl.conditionMutex.Lock() - defer ctrl.conditionMutex.Unlock() + mcop, err := ctrl.mcopClient.OperatorV1().MachineConfigurations().Get(context.TODO(), ctrlcommon.MCOOperatorKnobsObjectName, metav1.GetOptions{}) if err != nil { klog.Errorf("error updating progressing condition: %s", err) @@ -508,8 +574,17 @@ func getDefaultConditions() []metav1.Condition { } -// syncAll will attempt to enqueue all supported machine resources -func (ctrl *Controller) syncAll(reason string) { - ctrl.syncControlPlaneMachineSets(reason) - ctrl.syncMAPIMachineSets(reason) +// syncAll will attempt to sync all supported machine resources +func (ctrl *Controller) syncAll(event string) error { + klog.V(4).Infof("Syncing boot image controller for event: %s", event) + + // Wait for MachineConfiguration/cluster to be ready before syncing any machine resources + if err := ctrl.waitForMachineConfigurationReady(); err != nil { + ctrl.updateConditions(event, fmt.Errorf("MachineConfiguration was not ready: %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) + return err + } + + ctrl.syncControlPlaneMachineSets(event) + ctrl.syncMAPIMachineSets(event) + return nil } diff --git a/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller_test.go b/pkg/controller/bootimage/boot_image_controller_test.go similarity index 88% rename from pkg/controller/machine-set-boot-image/machine_set_boot_image_controller_test.go rename to pkg/controller/bootimage/boot_image_controller_test.go index 69e8fd9fd7..2bf92a8956 100644 --- a/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller_test.go +++ b/pkg/controller/bootimage/boot_image_controller_test.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "testing" @@ -97,71 +97,144 @@ func setMachineSetBootImage(machineset *machinev1beta1.MachineSet, generateBootI } func TestGetArchFromMachineSet(t *testing.T) { + // Helper to create a single-arch cluster version + singleArchCV := &osconfigv1.ClusterVersion{ + Status: osconfigv1.ClusterVersionStatus{ + Desired: osconfigv1.Release{ + Architecture: "", // Empty means single-arch + }, + }, + } + // Helper to create a multi-arch cluster version + multiArchCV := &osconfigv1.ClusterVersion{ + Status: osconfigv1.ClusterVersionStatus{ + Desired: osconfigv1.Release{ + Architecture: osconfigv1.ClusterVersionArchitectureMulti, + }, + }, + } + cases := []struct { - name string - annotations map[string]string - expectedArch string - expectError bool + name string + annotations map[string]string + clusterVersion *osconfigv1.ClusterVersion + expectedArch string + expectError bool }{ { - name: "Single architecture label", + name: "Single architecture label in single-arch cluster", annotations: map[string]string{ MachineSetArchAnnotationKey: "kubernetes.io/arch=amd64", }, - expectedArch: "x86_64", - expectError: false, + clusterVersion: singleArchCV, + expectedArch: "x86_64", + expectError: false, + }, + { + name: "Single architecture label in multi-arch cluster", + annotations: map[string]string{ + MachineSetArchAnnotationKey: "kubernetes.io/arch=amd64", + }, + clusterVersion: multiArchCV, + expectedArch: "x86_64", + expectError: false, }, { name: "Multiple labels with architecture first", annotations: map[string]string{ MachineSetArchAnnotationKey: "kubernetes.io/arch=amd64,topology.ebs.csi.aws.com/zone=eu-central-1a", }, - expectedArch: "x86_64", - expectError: false, + clusterVersion: singleArchCV, + expectedArch: "x86_64", + expectError: false, }, { name: "Multiple labels with architecture last", annotations: map[string]string{ MachineSetArchAnnotationKey: "topology.ebs.csi.aws.com/zone=eu-central-1a,kubernetes.io/arch=arm64", }, - expectedArch: "aarch64", - expectError: false, + clusterVersion: singleArchCV, + expectedArch: "aarch64", + expectError: false, }, { name: "Multiple labels with architecture in middle", annotations: map[string]string{ MachineSetArchAnnotationKey: "topology.ebs.csi.aws.com/zone=eu-central-1a,kubernetes.io/arch=s390x,node.kubernetes.io/instance-type=m5.large", }, - expectedArch: "s390x", - expectError: false, + clusterVersion: singleArchCV, + expectedArch: "s390x", + expectError: false, }, { name: "Multiple labels with spaces", annotations: map[string]string{ MachineSetArchAnnotationKey: " topology.ebs.csi.aws.com/zone=eu-central-1a , kubernetes.io/arch=ppc64le , node.kubernetes.io/instance-type=m5.large ", }, - expectedArch: "ppc64le", - expectError: false, + clusterVersion: singleArchCV, + expectedArch: "ppc64le", + expectError: false, }, { name: "Invalid architecture", annotations: map[string]string{ MachineSetArchAnnotationKey: "kubernetes.io/arch=invalid-arch", }, - expectError: true, + clusterVersion: singleArchCV, + expectError: true, }, { - name: "No architecture label", + name: "No architecture label in annotation", annotations: map[string]string{ MachineSetArchAnnotationKey: "topology.ebs.csi.aws.com/zone=eu-central-1a,node.kubernetes.io/instance-type=m5.large", }, + clusterVersion: singleArchCV, + expectError: true, + }, + { + name: "No annotation in single-arch cluster defaults to control plane arch", + annotations: map[string]string{}, + clusterVersion: singleArchCV, + expectError: false, // Should default to control plane arch + }, + { + name: "No annotation in multi-arch cluster returns error", + annotations: map[string]string{}, + clusterVersion: &osconfigv1.ClusterVersion{ + Status: osconfigv1.ClusterVersionStatus{ + Desired: osconfigv1.Release{ + Architecture: osconfigv1.ClusterVersionArchitectureMulti, + }, + }, + }, expectError: true, }, { - name: "No annotation", - annotations: map[string]string{}, - expectedArch: "", // Will default to control plane arch, but we can't test that easily - expectError: false, + name: "ARM64 architecture in multi-arch cluster", + annotations: map[string]string{ + MachineSetArchAnnotationKey: "kubernetes.io/arch=arm64", + }, + clusterVersion: multiArchCV, + expectedArch: "aarch64", + expectError: false, + }, + { + name: "PPC64LE architecture in single-arch cluster", + annotations: map[string]string{ + MachineSetArchAnnotationKey: "kubernetes.io/arch=ppc64le", + }, + clusterVersion: singleArchCV, + expectedArch: "ppc64le", + expectError: false, + }, + { + name: "S390X architecture in multi-arch cluster", + annotations: map[string]string{ + MachineSetArchAnnotationKey: "kubernetes.io/arch=s390x", + }, + clusterVersion: multiArchCV, + expectedArch: "s390x", + expectError: false, }, } @@ -174,7 +247,7 @@ func TestGetArchFromMachineSet(t *testing.T) { }, } - arch, err := getArchFromMachineSet(machineSet) + arch, err := getArchFromMachineSet(machineSet, tc.clusterVersion) if tc.expectError { assert.Error(t, err, "Expected error for test case: %s", tc.name) diff --git a/pkg/controller/machine-set-boot-image/cache/cache.go b/pkg/controller/bootimage/cache/cache.go similarity index 100% rename from pkg/controller/machine-set-boot-image/cache/cache.go rename to pkg/controller/bootimage/cache/cache.go diff --git a/pkg/controller/machine-set-boot-image/cpms_helpers.go b/pkg/controller/bootimage/cpms_helpers.go similarity index 92% rename from pkg/controller/machine-set-boot-image/cpms_helpers.go rename to pkg/controller/bootimage/cpms_helpers.go index 48b11f481f..70811f78ae 100644 --- a/pkg/controller/machine-set-boot-image/cpms_helpers.go +++ b/pkg/controller/bootimage/cpms_helpers.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "bytes" @@ -23,7 +23,6 @@ import ( "k8s.io/apimachinery/pkg/types" kubeErrs "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/jsonmergepatch" - "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" ) @@ -40,30 +39,11 @@ func (ctrl *Controller) syncControlPlaneMachineSets(reason string) { return } - ctrl.cpmsSyncMutex.Lock() - defer ctrl.cpmsSyncMutex.Unlock() - - var mcop *opv1.MachineConfiguration - var pollError error - // Wait for mcop.Status to populate, otherwise error out. This shouldn't take very long - // as this is done by the operator sync loop. - if err := wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 2*time.Minute, true, func(_ context.Context) (bool, error) { - mcop, pollError = ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) - if pollError != nil { - klog.Errorf("MachineConfiguration/cluster has not been created yet") - return false, nil - } - - // Ensure status.ObservedGeneration matches the last generation of MachineConfiguration - if mcop.Generation != mcop.Status.ObservedGeneration { - klog.Errorf("MachineConfiguration.Status is not up to date.") - pollError = fmt.Errorf("MachineConfiguration.Status is not up to date") - return false, nil - } - return true, nil - }); err != nil { - klog.Errorf("MachineConfiguration was not ready: %v", pollError) - ctrl.updateConditions(reason, fmt.Errorf("MachineConfiguration was not ready: while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) + // Get MachineConfiguration to determine which resources are enrolled + mcop, err := ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) + if err != nil { + klog.Errorf("Failed to get MachineConfiguration: %v", err) + ctrl.updateConditions(reason, fmt.Errorf("failed to get MachineConfiguration while enqueueing ControlPlaneMachineSet: %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) return } diff --git a/pkg/controller/machine-set-boot-image/helpers.go b/pkg/controller/bootimage/helpers.go similarity index 83% rename from pkg/controller/machine-set-boot-image/helpers.go rename to pkg/controller/bootimage/helpers.go index 388468cace..dfd3713fa0 100644 --- a/pkg/controller/machine-set-boot-image/helpers.go +++ b/pkg/controller/bootimage/helpers.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "context" @@ -15,6 +15,7 @@ import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/labels" kruntime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" "sigs.k8s.io/yaml" @@ -129,3 +130,29 @@ func upgradeStubIgnitionIfRequired(secretName string, secretClient clientset.Int } return nil } + +// waitForMachineConfigurationReady waits for the MachineConfiguration to be ready +// by polling until the status is populated and the ObservedGeneration matches Generation. +func (ctrl *Controller) waitForMachineConfigurationReady() error { + var mcop *opv1.MachineConfiguration + var pollError error + if err := wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 2*time.Minute, true, func(_ context.Context) (bool, error) { + mcop, pollError = ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) + if pollError != nil { + klog.Errorf("MachineConfiguration/cluster has not been created yet") + return false, nil + } + + // Ensure status.ObservedGeneration matches the last generation of MachineConfiguration + if mcop.Generation != mcop.Status.ObservedGeneration { + klog.Errorf("MachineConfiguration.Status is not up to date.") + pollError = fmt.Errorf("MachineConfiguration.Status is not up to date") + return false, nil + } + return true, nil + }); err != nil { + klog.Errorf("MachineConfiguration was not ready: %v", pollError) + return pollError + } + return nil +} diff --git a/pkg/controller/machine-set-boot-image/ms_helpers.go b/pkg/controller/bootimage/ms_helpers.go similarity index 79% rename from pkg/controller/machine-set-boot-image/ms_helpers.go rename to pkg/controller/bootimage/ms_helpers.go index f87d0adbd6..bbb00a3108 100644 --- a/pkg/controller/machine-set-boot-image/ms_helpers.go +++ b/pkg/controller/bootimage/ms_helpers.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "bytes" @@ -8,6 +8,7 @@ import ( "strings" "time" + osconfigv1 "github.com/openshift/api/config/v1" machinev1beta1 "github.com/openshift/api/machine/v1beta1" opv1 "github.com/openshift/api/operator/v1" ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" @@ -17,7 +18,6 @@ import ( kubeErrs "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/jsonmergepatch" "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" archtranslater "github.com/coreos/stream-metadata-go/arch" @@ -27,30 +27,11 @@ import ( // nolint:dupl // I separated this from syncControlPlaneMachineSets for readability func (ctrl *Controller) syncMAPIMachineSets(reason string) { - ctrl.mapiSyncMutex.Lock() - defer ctrl.mapiSyncMutex.Unlock() - - var mcop *opv1.MachineConfiguration - var pollError error - // Wait for mcop.Status to populate, otherwise error out. This shouldn't take very long - // as this is done by the operator sync loop. - if err := wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 2*time.Minute, true, func(_ context.Context) (bool, error) { - mcop, pollError = ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) - if pollError != nil { - klog.Errorf("MachineConfiguration/cluster has not been created yet") - return false, nil - } - - // Ensure status.ObservedGeneration matches the last generation of MachineConfiguration - if mcop.Generation != mcop.Status.ObservedGeneration { - klog.Errorf("MachineConfiguration.Status is not up to date.") - pollError = fmt.Errorf("MachineConfiguration.Status is not up to date") - return false, nil - } - return true, nil - }); err != nil { - klog.Errorf("MachineConfiguration was not ready: %v", pollError) - ctrl.updateConditions(reason, fmt.Errorf("MachineConfiguration was not ready: while enqueueing MAPI MachineSets %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) + // Get MachineConfiguration to determine which resources are enrolled + mcop, err := ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) + if err != nil { + klog.Errorf("Failed to get MachineConfiguration: %v", err) + ctrl.updateConditions(reason, fmt.Errorf("failed to get MachineConfiguration while enqueueing MAPI MachineSets: %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) return } @@ -133,9 +114,20 @@ func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet } } + // Fetch the ClusterVersion to determine if this is a multi-arch cluster + clusterVersion, err := ctrl.clusterVersionLister.Get("version") + if err != nil { + return fmt.Errorf("failed to fetch clusterversion during machineset sync: %v, defaulting to single-arch behavior", err) + } + // Fetch the architecture type of this machineset - arch, err := getArchFromMachineSet(machineSet) + arch, err := getArchFromMachineSet(machineSet, clusterVersion) if err != nil { + // If no architecture annotation was found, skip this machineset without erroring + // A later sync loop will pick it up once the annotation is added + if strings.Contains(err.Error(), "no architecture annotation found") { + return nil + } return fmt.Errorf("failed to fetch arch during machineset sync: %w", err) } @@ -242,29 +234,37 @@ func (ctrl *Controller) patchMachineSet(oldMachineSet, newMachineSet *machinev1b } // Returns architecture type for a given machineset -func getArchFromMachineSet(machineset *machinev1beta1.MachineSet) (arch string, err error) { +func getArchFromMachineSet(machineset *machinev1beta1.MachineSet, clusterVersion *osconfigv1.ClusterVersion) (arch string, err error) { // Valid set of machineset/node architectures validArchSet := sets.New("arm64", "s390x", "amd64", "ppc64le") // Check if the annotation enclosing arch label is present on this machineset archLabel, archLabelMatch := machineset.Annotations[MachineSetArchAnnotationKey] - if archLabelMatch { - // Parse the annotation value which may contain multiple comma-separated labels - // Example: kubernetes.io/arch=amd64,topology.ebs.csi.aws.com/zone=eu-central-1a - for label := range strings.SplitSeq(archLabel, ",") { - label = strings.TrimSpace(label) - if archLabelValue, found := strings.CutPrefix(label, ArchLabelKey); found { - // Extract just the architecture value after "kubernetes.io/arch=" - if validArchSet.Has(archLabelValue) { - return archtranslater.RpmArch(archLabelValue), nil - } - return "", fmt.Errorf("invalid architecture value found in annotation: %s", archLabelValue) + + if !archLabelMatch { + // Check if this is a multi-arch cluster + // clusterVersion should never be nil as it's validated by the caller + if clusterVersion.Status.Desired.Architecture == osconfigv1.ClusterVersionArchitectureMulti { + // For multi-arch clusters, we require the architecture annotation + klog.Errorf("No architecture annotation found on machineset %s in multi-arch cluster, skipping boot image update", machineset.Name) + return "", fmt.Errorf("no architecture annotation found on machineset %s", machineset.Name) + } + // For single-arch clusters, default to control plane architecture + klog.Infof("No architecture annotation found on machineset %s, defaulting to control plane architecture", machineset.Name) + return archtranslater.CurrentRpmArch(), nil + } + + // Parse the annotation value which may contain multiple comma-separated labels + // Example: kubernetes.io/arch=amd64,topology.ebs.csi.aws.com/zone=eu-central-1a + for label := range strings.SplitSeq(archLabel, ",") { + label = strings.TrimSpace(label) + if archLabelValue, found := strings.CutPrefix(label, ArchLabelKey); found { + // Extract just the architecture value after "kubernetes.io/arch=" + if validArchSet.Has(archLabelValue) { + return archtranslater.RpmArch(archLabelValue), nil } + return "", fmt.Errorf("invalid architecture value found in annotation: %s", archLabelValue) } - return "", fmt.Errorf("kubernetes.io/arch label not found in annotation: %s", archLabel) } - // If no arch annotation was found on the machineset, default to the control plane arch. - // return the architecture of the node running this pod, which will always be a control plane node. - klog.Infof("Defaulting to control plane architecture") - return archtranslater.CurrentRpmArch(), nil + return "", fmt.Errorf("kubernetes.io/arch label not found in annotation: %s", archLabel) } diff --git a/pkg/controller/machine-set-boot-image/platform_helpers.go b/pkg/controller/bootimage/platform_helpers.go similarity index 99% rename from pkg/controller/machine-set-boot-image/platform_helpers.go rename to pkg/controller/bootimage/platform_helpers.go index cea8277232..a8b4e1859d 100644 --- a/pkg/controller/machine-set-boot-image/platform_helpers.go +++ b/pkg/controller/bootimage/platform_helpers.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "context" diff --git a/pkg/controller/machine-set-boot-image/vsphere_helpers.go b/pkg/controller/bootimage/vsphere_helpers.go similarity index 99% rename from pkg/controller/machine-set-boot-image/vsphere_helpers.go rename to pkg/controller/bootimage/vsphere_helpers.go index 7b83551dab..17692c557c 100644 --- a/pkg/controller/machine-set-boot-image/vsphere_helpers.go +++ b/pkg/controller/bootimage/vsphere_helpers.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "context" @@ -29,7 +29,7 @@ import ( "k8s.io/klog/v2" "k8s.io/utils/ptr" - "github.com/openshift/machine-config-operator/pkg/controller/machine-set-boot-image/cache" + "github.com/openshift/machine-config-operator/pkg/controller/bootimage/cache" ) type VsphereResources struct { diff --git a/test/extended-priv/mco_scale.go b/test/extended-priv/mco_scale.go index 38e1065ab8..f0e1a99d30 100644 --- a/test/extended-priv/mco_scale.go +++ b/test/extended-priv/mco_scale.go @@ -428,7 +428,7 @@ func uploadBaseImageToVsphere(baseImageSrc, baseImageDest, server, dataCenter, d uploadCmd.Env = govcExecEnv out, err := uploadCmd.CombinedOutput() - logger.Infof(string(out)) + logger.Infof("%s", string(out)) if err != nil { if strings.Contains(string(out), "already exists") { logger.Infof("Image %s already exists in the cloud, we don't upload it again", baseImageDest) @@ -444,7 +444,7 @@ func uploadBaseImageToVsphere(baseImageSrc, baseImageDest, server, dataCenter, d upgradeCmd.Env = govcExecEnv out, err = upgradeCmd.CombinedOutput() - logger.Infof(string(out)) + logger.Infof("%s", string(out)) if err != nil { // We don't fail. We log a warning and continue. logger.Warnf("ERROR UPGRADING HARDWARE: %s", err) @@ -457,7 +457,7 @@ func uploadBaseImageToVsphere(baseImageSrc, baseImageDest, server, dataCenter, d templateCmd.Env = govcExecEnv out, err = templateCmd.CombinedOutput() - logger.Infof(string(out)) + logger.Infof("%s", string(out)) if err != nil { // We don't fail. We log a warning and continue. logger.Warnf("ERROR CONVERTING INTO TEMPLATE: %s", err) diff --git a/test/extended-priv/node.go b/test/extended-priv/node.go index 6c2b01f5c7..036caf6edc 100644 --- a/test/extended-priv/node.go +++ b/test/extended-priv/node.go @@ -505,7 +505,7 @@ func (n *Node) GetRHELVersion() (string, error) { match := r.FindStringSubmatch(vContent) if len(match) == 0 { msg := fmt.Sprintf("No RHEL_VERSION available in /etc/os-release file: %s", vContent) - logger.Errorf(msg) + logger.Errorf("%s", msg) return "", fmt.Errorf("Error: %s", msg) } diff --git a/test/extended-priv/remotefile.go b/test/extended-priv/remotefile.go index fd8d5cd294..b8bce45fa8 100644 --- a/test/extended-priv/remotefile.go +++ b/test/extended-priv/remotefile.go @@ -78,7 +78,7 @@ func (rf *RemoteFile) fetchTextContent() error { tmpcontent := strings.SplitN(output, startCat, 2)[1] // take into account that "cat" introduces a newline at the end lastIndex := strings.LastIndex(tmpcontent, endCat) - rf.content = fmt.Sprintf(tmpcontent[:lastIndex]) + rf.content = tmpcontent[:lastIndex] logger.Debugf("remote file %s content is:\n%s", rf.fullPath, rf.content) @@ -299,7 +299,7 @@ func (rf *RemoteFile) Rm(args ...string) error { cmd = append(cmd, rf.fullPath) output, err := rf.node.DebugNodeWithChroot(cmd...) - logger.Infof(output) + logger.Infof("%s", output) return err }