From 939d7e3ec2e3ea88f8d10344452afb2b25de2a3f Mon Sep 17 00:00:00 2001 From: David Date: Tue, 9 Dec 2025 16:31:42 -0500 Subject: [PATCH 1/4] miscellaneous verify fixes These seemed to be tripping local verify runs, but not the CI ones. --- devex/cmd/mco-sanitize/redactor.go | 21 ++++++++++----------- test/extended-priv/mco_scale.go | 6 +++--- test/extended-priv/node.go | 2 +- test/extended-priv/remotefile.go | 4 ++-- 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/devex/cmd/mco-sanitize/redactor.go b/devex/cmd/mco-sanitize/redactor.go index 43cf2e29e2..12645082ce 100644 --- a/devex/cmd/mco-sanitize/redactor.go +++ b/devex/cmd/mco-sanitize/redactor.go @@ -155,18 +155,17 @@ func recursiveWalk(node *visitorNode, path []string) (bool, error) { } } return changed, nil - } else { - // The path provides the index to pick. Transverse only that index - pathIndex, err := strconv.Atoi(key) - if err != nil { - return false, errors.New("redact path uses array indexing at a path level that is not an array") - } - newNode := node.newArrayChild(pathIndex) - if newNode == nil { - return false, nil - } - return recursiveWalk(newNode, path[1:]) } + // The path provides the index to pick. Transverse only that index + pathIndex, err := strconv.Atoi(key) + if err != nil { + return false, errors.New("redact path uses array indexing at a path level that is not an array") + } + newNode := node.newArrayChild(pathIndex) + if newNode == nil { + return false, nil + } + return recursiveWalk(newNode, path[1:]) case map[string]interface{}: // Map case. newObjectChild returns nil if the key doesn't exist which should make us // break the transversing path diff --git a/test/extended-priv/mco_scale.go b/test/extended-priv/mco_scale.go index 38e1065ab8..f0e1a99d30 100644 --- a/test/extended-priv/mco_scale.go +++ b/test/extended-priv/mco_scale.go @@ -428,7 +428,7 @@ func uploadBaseImageToVsphere(baseImageSrc, baseImageDest, server, dataCenter, d uploadCmd.Env = govcExecEnv out, err := uploadCmd.CombinedOutput() - logger.Infof(string(out)) + logger.Infof("%s", string(out)) if err != nil { if strings.Contains(string(out), "already exists") { logger.Infof("Image %s already exists in the cloud, we don't upload it again", baseImageDest) @@ -444,7 +444,7 @@ func uploadBaseImageToVsphere(baseImageSrc, baseImageDest, server, dataCenter, d upgradeCmd.Env = govcExecEnv out, err = upgradeCmd.CombinedOutput() - logger.Infof(string(out)) + logger.Infof("%s", string(out)) if err != nil { // We don't fail. We log a warning and continue. logger.Warnf("ERROR UPGRADING HARDWARE: %s", err) @@ -457,7 +457,7 @@ func uploadBaseImageToVsphere(baseImageSrc, baseImageDest, server, dataCenter, d templateCmd.Env = govcExecEnv out, err = templateCmd.CombinedOutput() - logger.Infof(string(out)) + logger.Infof("%s", string(out)) if err != nil { // We don't fail. We log a warning and continue. logger.Warnf("ERROR CONVERTING INTO TEMPLATE: %s", err) diff --git a/test/extended-priv/node.go b/test/extended-priv/node.go index 6c2b01f5c7..036caf6edc 100644 --- a/test/extended-priv/node.go +++ b/test/extended-priv/node.go @@ -505,7 +505,7 @@ func (n *Node) GetRHELVersion() (string, error) { match := r.FindStringSubmatch(vContent) if len(match) == 0 { msg := fmt.Sprintf("No RHEL_VERSION available in /etc/os-release file: %s", vContent) - logger.Errorf(msg) + logger.Errorf("%s", msg) return "", fmt.Errorf("Error: %s", msg) } diff --git a/test/extended-priv/remotefile.go b/test/extended-priv/remotefile.go index fd8d5cd294..b8bce45fa8 100644 --- a/test/extended-priv/remotefile.go +++ b/test/extended-priv/remotefile.go @@ -78,7 +78,7 @@ func (rf *RemoteFile) fetchTextContent() error { tmpcontent := strings.SplitN(output, startCat, 2)[1] // take into account that "cat" introduces a newline at the end lastIndex := strings.LastIndex(tmpcontent, endCat) - rf.content = fmt.Sprintf(tmpcontent[:lastIndex]) + rf.content = tmpcontent[:lastIndex] logger.Debugf("remote file %s content is:\n%s", rf.fullPath, rf.content) @@ -299,7 +299,7 @@ func (rf *RemoteFile) Rm(args ...string) error { cmd = append(cmd, rf.fullPath) output, err := rf.node.DebugNodeWithChroot(cmd...) - logger.Infof(output) + logger.Infof("%s", output) return err } From 4d8a497c72bd47ef074ad80dec7e5d949bded1fd Mon Sep 17 00:00:00 2001 From: David Date: Tue, 9 Dec 2025 16:01:29 -0500 Subject: [PATCH 2/4] controller: rename msbic to boot image controller This commit simplifies the name of the package as the controller now deals with more than just the standard machineset resources. --- cmd/machine-config-controller/start.go | 6 +++--- pkg/controller/{machine-set-boot-image => bootimage}/ami.go | 2 +- .../boot_image_controller.go} | 2 +- .../boot_image_controller_test.go} | 2 +- .../{machine-set-boot-image => bootimage}/cache/cache.go | 0 .../{machine-set-boot-image => bootimage}/cpms_helpers.go | 2 +- .../{machine-set-boot-image => bootimage}/helpers.go | 2 +- .../{machine-set-boot-image => bootimage}/ms_helpers.go | 2 +- .../platform_helpers.go | 2 +- .../vsphere_helpers.go | 4 ++-- 10 files changed, 12 insertions(+), 12 deletions(-) rename pkg/controller/{machine-set-boot-image => bootimage}/ami.go (99%) rename pkg/controller/{machine-set-boot-image/machine_set_boot_image_controller.go => bootimage/boot_image_controller.go} (99%) rename pkg/controller/{machine-set-boot-image/machine_set_boot_image_controller_test.go => bootimage/boot_image_controller_test.go} (99%) rename pkg/controller/{machine-set-boot-image => bootimage}/cache/cache.go (100%) rename pkg/controller/{machine-set-boot-image => bootimage}/cpms_helpers.go (99%) rename pkg/controller/{machine-set-boot-image => bootimage}/helpers.go (99%) rename pkg/controller/{machine-set-boot-image => bootimage}/ms_helpers.go (99%) rename pkg/controller/{machine-set-boot-image => bootimage}/platform_helpers.go (99%) rename pkg/controller/{machine-set-boot-image => bootimage}/vsphere_helpers.go (99%) diff --git a/cmd/machine-config-controller/start.go b/cmd/machine-config-controller/start.go index f22c4dfee0..fb9bd56d86 100644 --- a/cmd/machine-config-controller/start.go +++ b/cmd/machine-config-controller/start.go @@ -9,13 +9,13 @@ import ( features "github.com/openshift/api/features" "github.com/openshift/machine-config-operator/cmd/common" "github.com/openshift/machine-config-operator/internal/clients" + bootimagecontroller "github.com/openshift/machine-config-operator/pkg/controller/bootimage" certrotationcontroller "github.com/openshift/machine-config-operator/pkg/controller/certrotation" ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" containerruntimeconfig "github.com/openshift/machine-config-operator/pkg/controller/container-runtime-config" "github.com/openshift/machine-config-operator/pkg/controller/drain" "github.com/openshift/machine-config-operator/pkg/controller/internalreleaseimage" kubeletconfig "github.com/openshift/machine-config-operator/pkg/controller/kubelet-config" - machinesetbootimage "github.com/openshift/machine-config-operator/pkg/controller/machine-set-boot-image" "github.com/openshift/machine-config-operator/pkg/controller/node" "github.com/openshift/machine-config-operator/pkg/controller/pinnedimageset" "github.com/openshift/machine-config-operator/pkg/controller/render" @@ -144,7 +144,7 @@ func runStartCmd(_ *cobra.Command, _ []string) { } if ctrlcommon.IsBootImageControllerRequired(ctrlctx) { - machineSetBootImage := machinesetbootimage.New( + bootImageController := bootimagecontroller.New( ctrlctx.ClientBuilder.KubeClientOrDie("machine-set-boot-image-controller"), ctrlctx.ClientBuilder.MachineClientOrDie("machine-set-boot-image-controller"), ctrlctx.KubeNamespacedInformerFactory.Core().V1().ConfigMaps(), @@ -155,7 +155,7 @@ func runStartCmd(_ *cobra.Command, _ []string) { ctrlctx.OperatorInformerFactory.Operator().V1().MachineConfigurations(), ctrlctx.FeatureGatesHandler, ) - go machineSetBootImage.Run(ctrlctx.Stop) + go bootImageController.Run(ctrlctx.Stop) // start the informers again to enable feature gated types. // see comments in SharedInformerFactory interface. ctrlctx.KubeNamespacedInformerFactory.Start(ctrlctx.Stop) diff --git a/pkg/controller/machine-set-boot-image/ami.go b/pkg/controller/bootimage/ami.go similarity index 99% rename from pkg/controller/machine-set-boot-image/ami.go rename to pkg/controller/bootimage/ami.go index 78759b97ed..51c6786a3d 100644 --- a/pkg/controller/machine-set-boot-image/ami.go +++ b/pkg/controller/bootimage/ami.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "k8s.io/apimachinery/pkg/util/sets" diff --git a/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller.go b/pkg/controller/bootimage/boot_image_controller.go similarity index 99% rename from pkg/controller/machine-set-boot-image/machine_set_boot_image_controller.go rename to pkg/controller/bootimage/boot_image_controller.go index f69c029474..37c4ad806c 100644 --- a/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller.go +++ b/pkg/controller/bootimage/boot_image_controller.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "context" diff --git a/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller_test.go b/pkg/controller/bootimage/boot_image_controller_test.go similarity index 99% rename from pkg/controller/machine-set-boot-image/machine_set_boot_image_controller_test.go rename to pkg/controller/bootimage/boot_image_controller_test.go index 69e8fd9fd7..dd5609ced2 100644 --- a/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller_test.go +++ b/pkg/controller/bootimage/boot_image_controller_test.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "testing" diff --git a/pkg/controller/machine-set-boot-image/cache/cache.go b/pkg/controller/bootimage/cache/cache.go similarity index 100% rename from pkg/controller/machine-set-boot-image/cache/cache.go rename to pkg/controller/bootimage/cache/cache.go diff --git a/pkg/controller/machine-set-boot-image/cpms_helpers.go b/pkg/controller/bootimage/cpms_helpers.go similarity index 99% rename from pkg/controller/machine-set-boot-image/cpms_helpers.go rename to pkg/controller/bootimage/cpms_helpers.go index 48b11f481f..29d9424e0e 100644 --- a/pkg/controller/machine-set-boot-image/cpms_helpers.go +++ b/pkg/controller/bootimage/cpms_helpers.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "bytes" diff --git a/pkg/controller/machine-set-boot-image/helpers.go b/pkg/controller/bootimage/helpers.go similarity index 99% rename from pkg/controller/machine-set-boot-image/helpers.go rename to pkg/controller/bootimage/helpers.go index 388468cace..41604008a6 100644 --- a/pkg/controller/machine-set-boot-image/helpers.go +++ b/pkg/controller/bootimage/helpers.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "context" diff --git a/pkg/controller/machine-set-boot-image/ms_helpers.go b/pkg/controller/bootimage/ms_helpers.go similarity index 99% rename from pkg/controller/machine-set-boot-image/ms_helpers.go rename to pkg/controller/bootimage/ms_helpers.go index f87d0adbd6..013fa81ed2 100644 --- a/pkg/controller/machine-set-boot-image/ms_helpers.go +++ b/pkg/controller/bootimage/ms_helpers.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "bytes" diff --git a/pkg/controller/machine-set-boot-image/platform_helpers.go b/pkg/controller/bootimage/platform_helpers.go similarity index 99% rename from pkg/controller/machine-set-boot-image/platform_helpers.go rename to pkg/controller/bootimage/platform_helpers.go index cea8277232..a8b4e1859d 100644 --- a/pkg/controller/machine-set-boot-image/platform_helpers.go +++ b/pkg/controller/bootimage/platform_helpers.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "context" diff --git a/pkg/controller/machine-set-boot-image/vsphere_helpers.go b/pkg/controller/bootimage/vsphere_helpers.go similarity index 99% rename from pkg/controller/machine-set-boot-image/vsphere_helpers.go rename to pkg/controller/bootimage/vsphere_helpers.go index 7b83551dab..17692c557c 100644 --- a/pkg/controller/machine-set-boot-image/vsphere_helpers.go +++ b/pkg/controller/bootimage/vsphere_helpers.go @@ -1,4 +1,4 @@ -package machineset +package bootimage import ( "context" @@ -29,7 +29,7 @@ import ( "k8s.io/klog/v2" "k8s.io/utils/ptr" - "github.com/openshift/machine-config-operator/pkg/controller/machine-set-boot-image/cache" + "github.com/openshift/machine-config-operator/pkg/controller/bootimage/cache" ) type VsphereResources struct { From 12858f79782e05e2d20aa2474bba2e36b2b5e04b Mon Sep 17 00:00:00 2001 From: David Date: Tue, 9 Dec 2025 16:33:13 -0500 Subject: [PATCH 3/4] boot_image: add event based queueing --- .../bootimage/boot_image_controller.go | 116 ++++++++++++++---- pkg/controller/bootimage/cpms_helpers.go | 30 +---- pkg/controller/bootimage/helpers.go | 27 ++++ pkg/controller/bootimage/ms_helpers.go | 30 +---- 4 files changed, 130 insertions(+), 73 deletions(-) diff --git a/pkg/controller/bootimage/boot_image_controller.go b/pkg/controller/bootimage/boot_image_controller.go index 37c4ad806c..ea9655bfeb 100644 --- a/pkg/controller/bootimage/boot_image_controller.go +++ b/pkg/controller/bootimage/boot_image_controller.go @@ -4,7 +4,7 @@ import ( "context" "fmt" "reflect" - "sync" + "time" features "github.com/openshift/api/features" opv1 "github.com/openshift/api/operator/v1" @@ -14,6 +14,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" coreinformersv1 "k8s.io/client-go/informers/core/v1" clientset "k8s.io/client-go/kubernetes" corev1client "k8s.io/client-go/kubernetes/typed/core/v1" @@ -21,6 +22,7 @@ import ( "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/retry" + "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" "k8s.io/kubectl/pkg/scheme" @@ -44,6 +46,8 @@ type Controller struct { mcopClient mcopclientset.Interface eventRecorder record.EventRecorder + syncHandler func(event string) error + mcoCmLister corelisterv1.ConfigMapLister mapiMachineSetLister machinelistersv1beta1.MachineSetLister cpmsLister machinelistersv1.ControlPlaneMachineSetLister @@ -56,15 +60,14 @@ type Controller struct { infraListerSynced cache.InformerSynced mcopListerSynced cache.InformerSynced + queue workqueue.TypedRateLimitingInterface[string] + mapiStats MachineResourceStats cpmsStats MachineResourceStats capiMachineSetStats MachineResourceStats capiMachineDeploymentStats MachineResourceStats mapiBootImageState map[string]BootImageState cpmsBootImageState map[string]BootImageState - conditionMutex sync.Mutex - mapiSyncMutex sync.Mutex - cpmsSyncMutex sync.Mutex fgHandler ctrlcommon.FeatureGatesHandler } @@ -78,6 +81,7 @@ type MachineResourceStats struct { // State structure uses for detecting hot loops. Reset when cluster is opted // out of boot image updates. +// nolint: revive type BootImageState struct { value []byte hotLoopCount int @@ -103,6 +107,9 @@ const ( // Threshold for hot loop detection HotLoopLimit = 3 + + // maxRetries is the number of times a sync will be retried before it is dropped out of the queue. + maxRetries = 15 ) // New returns a new machine-set-boot-image controller. @@ -126,8 +133,13 @@ func New( machineClient: machineClient, mcopClient: mcopClient, eventRecorder: ctrlcommon.NamespacedEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "machineconfigcontroller-machinesetbootimagecontroller"})), + queue: workqueue.NewTypedRateLimitingQueueWithConfig( + workqueue.DefaultTypedControllerRateLimiter[string](), + workqueue.TypedRateLimitingQueueConfig[string]{Name: "machineconfigcontroller-machinesetbootimagecontroller"}), } + ctrl.syncHandler = ctrl.syncAll + ctrl.mcoCmLister = mcoCmInfomer.Lister() ctrl.mapiMachineSetLister = mapiMachineSetInformer.Lister() ctrl.cpmsLister = cpmsInformer.Lister() @@ -178,6 +190,7 @@ func New( // Run executes the machine-set-boot-image controller. func (ctrl *Controller) Run(stopCh <-chan struct{}) { defer utilruntime.HandleCrash() + defer ctrl.queue.ShutDown() if !cache.WaitForCacheSync(stopCh, ctrl.mcoCmListerSynced, ctrl.mapiMachineSetListerSynced, ctrl.infraListerSynced, ctrl.mcopListerSynced) { return @@ -186,9 +199,58 @@ func (ctrl *Controller) Run(stopCh <-chan struct{}) { klog.Info("Starting MachineConfigController-MachineSetBootImageController") defer klog.Info("Shutting down MachineConfigController-MachineSetBootImageController") + // This controller needs to run in single thread mode, as the work unit per sync are + // the same and shouldn't overlap each other. + go wait.Until(ctrl.worker, time.Second, stopCh) + <-stopCh } +// enqueueEvent adds a event to the work queue. +func (ctrl *Controller) enqueueEvent(event string) { + ctrl.queue.Add(event) +} + +// worker runs a worker thread that just dequeues items, processes them, and marks them done. +// It enforces that the syncHandler is never invoked concurrently with the same key. +func (ctrl *Controller) worker() { + for ctrl.processNextWorkItem() { + } +} + +// processNextWorkItem processes the next work item from the queue. +func (ctrl *Controller) processNextWorkItem() bool { + event, quit := ctrl.queue.Get() + if quit { + return false + } + defer ctrl.queue.Done(event) + + err := ctrl.syncHandler(event) + ctrl.handleErr(err, event) + + return true +} + +// handleErr checks if an error happened and makes sure we will retry later. +func (ctrl *Controller) handleErr(err error, event string) { + if err == nil { + ctrl.queue.Forget(event) + return + } + + if ctrl.queue.NumRequeues(event) < maxRetries { + klog.V(2).Infof("Error syncing boot image controller for event %v: %v", event, err) + ctrl.queue.AddRateLimited(event) + return + } + + utilruntime.HandleError(err) + klog.V(2).Infof("Dropping event %q out of the queue: %v", event, err) + ctrl.queue.Forget(event) + ctrl.queue.AddAfter(event, 1*time.Minute) +} + // addMAPIMachineSet handles the addition of a MAPI MachineSet by triggering // a reconciliation of all enrolled MAPI MachineSets. func (ctrl *Controller) addMAPIMachineSet(obj interface{}) { @@ -200,7 +262,7 @@ func (ctrl *Controller) addMAPIMachineSet(obj interface{}) { // Update/Check all machinesets instead of just this one. This prevents needing to maintain a local // store of machineset conditions. As this is using a lister, it is relatively inexpensive to do // this. - go func() { ctrl.syncMAPIMachineSets("MAPIMachinesetAdded") }() + ctrl.enqueueEvent("MAPIMachineSetAdded") } // updateMAPIMachineSet handles updates to a MAPI MachineSet by triggering @@ -223,7 +285,7 @@ func (ctrl *Controller) updateMAPIMachineSet(oldMS, newMS interface{}) { // Update all machinesets instead of just this one. This prevents needing to maintain a local // store of machineset conditions. As this is using a lister, it is relatively inexpensive to do // this. - go func() { ctrl.syncMAPIMachineSets("MAPIMachinesetUpdated") }() + ctrl.enqueueEvent("MAPIMachineSetUpdated") } // deleteMAPIMachineSet handles the deletion of a MAPI MachineSet by triggering @@ -237,7 +299,7 @@ func (ctrl *Controller) deleteMAPIMachineSet(deletedMS interface{}) { // Update all machinesets. This prevents needing to maintain a local // store of machineset conditions. As this is using a lister, it is relatively inexpensive to do // this. - go func() { ctrl.syncMAPIMachineSets("MAPIMachinesetDeleted") }() + ctrl.enqueueEvent("MAPIMachineSetDeleted") } // addControlPlaneMachineSet handles the addition of a ControlPlaneMachineSet by triggering @@ -251,7 +313,7 @@ func (ctrl *Controller) addControlPlaneMachineSet(obj interface{}) { // Update/Check all ControlPlaneMachineSets instead of just this one. This prevents needing to maintain a local // store of machineset conditions. As this is using a lister, it is relatively inexpensive to do // this. - go func() { ctrl.syncControlPlaneMachineSets("ControlPlaneMachineSetAdded") }() + ctrl.enqueueEvent("ControlPlaneMachineSetAdded") } // updateControlPlaneMachineSet handles updates to a ControlPlaneMachineSet by triggering @@ -274,21 +336,21 @@ func (ctrl *Controller) updateControlPlaneMachineSet(oldCPMS, newCPMS interface{ // Update all ControlPlaneMachineSets instead of just this one. This prevents needing to maintain a local // store of machineset conditions. As this is using a lister, it is relatively inexpensive to do // this. - go func() { ctrl.syncControlPlaneMachineSets("ControlPlaneMachineSetUpdated") }() + ctrl.enqueueEvent("ControlPlaneMachineSetUpdated") } // deleteControlPlaneMachineSet handles the deletion of a ControlPlaneMachineSet by triggering // a reconciliation of all enrolled ControlPlaneMachineSets. func (ctrl *Controller) deleteControlPlaneMachineSet(deletedCPMS interface{}) { - deletedMachineSet := deletedCPMS.(*machinev1beta1.MachineSet) + deletedMachineSet := deletedCPMS.(*machinev1.ControlPlaneMachineSet) klog.Infof("ControlPlaneMachineSet %s deleted, reconciling enrolled machineset resources", deletedMachineSet.Name) // Update all ControlPlaneMachineSets. This prevents needing to maintain a local // store of machineset conditions. As this is using a lister, it is relatively inexpensive to do // this. - go func() { ctrl.syncControlPlaneMachineSets("ControlPlaneMachineSetDeleted") }() + ctrl.enqueueEvent("ControlPlaneMachineSetDeleted") } // addConfigMap handles the addition of the boot images ConfigMap by triggering @@ -305,7 +367,7 @@ func (ctrl *Controller) addConfigMap(obj interface{}) { klog.Infof("configMap %s added, reconciling enrolled machine resources", configMap.Name) // Update all machinesets since the "golden" configmap has been added - go func() { ctrl.syncAll("BootImageConfigMapAdded") }() + ctrl.enqueueEvent("BootImageConfigMapAdded") } // updateConfigMap handles updates to the boot images ConfigMap by triggering @@ -328,7 +390,7 @@ func (ctrl *Controller) updateConfigMap(oldCM, newCM interface{}) { klog.Infof("configMap %s updated, reconciling enrolled machine resources", oldConfigMap.Name) // Update all machinesets since the "golden" configmap has been updated - go func() { ctrl.syncAll("BootImageConfigMapUpdated") }() + ctrl.enqueueEvent("BootImageConfigMapUpdated") } // deleteConfigMap handles the deletion of the boot images ConfigMap by triggering @@ -345,7 +407,7 @@ func (ctrl *Controller) deleteConfigMap(obj interface{}) { klog.Infof("configMap %s deleted, reconciling enrolled machine resources", configMap.Name) // Update all machinesets since the "golden" configmap has been deleted - go func() { ctrl.syncAll("BootImageConfigMapDeleted") }() + ctrl.enqueueEvent("BootImageConfigMapDeleted") } // addMachineConfiguration handles the addition of the cluster-level MachineConfiguration @@ -363,7 +425,7 @@ func (ctrl *Controller) addMachineConfiguration(obj interface{}) { klog.Infof("Bootimages management configuration has been added, reconciling enrolled machine resources") // Update/Check machinesets since the boot images configuration knob was updated - go func() { ctrl.syncAll("BootImageUpdateConfigurationAdded") }() + ctrl.enqueueEvent("BootImageUpdateConfigurationAdded") } // updateMachineConfiguration handles updates to the cluster-level MachineConfiguration @@ -387,7 +449,7 @@ func (ctrl *Controller) updateMachineConfiguration(oldMC, newMC interface{}) { klog.Infof("Bootimages management configuration has been updated, reconciling enrolled machine resources") // Update all machinesets since the boot images configuration knob was updated - go func() { ctrl.syncAll("BootImageUpdateConfigurationUpdated") }() + ctrl.enqueueEvent("BootImageUpdateConfigurationUpdated") } // deleteMachineConfiguration handles the deletion of the cluster-level MachineConfiguration @@ -405,14 +467,13 @@ func (ctrl *Controller) deleteMachineConfiguration(obj interface{}) { klog.Infof("Bootimages management configuration has been deleted, reconciling enrolled machine resources") // Update/Check machinesets since the boot images configuration knob was updated - go func() { ctrl.syncAll("BootImageUpdateConfigurationDeleted") }() + ctrl.enqueueEvent("BootImageUpdateConfigurationDeleted") } // updateConditions updates the boot image update conditions on the MachineConfiguration status // based on the current state of machine resource reconciliation. func (ctrl *Controller) updateConditions(newReason string, syncError error, targetConditionType string) { - ctrl.conditionMutex.Lock() - defer ctrl.conditionMutex.Unlock() + mcop, err := ctrl.mcopClient.OperatorV1().MachineConfigurations().Get(context.TODO(), ctrlcommon.MCOOperatorKnobsObjectName, metav1.GetOptions{}) if err != nil { klog.Errorf("error updating progressing condition: %s", err) @@ -508,8 +569,17 @@ func getDefaultConditions() []metav1.Condition { } -// syncAll will attempt to enqueue all supported machine resources -func (ctrl *Controller) syncAll(reason string) { - ctrl.syncControlPlaneMachineSets(reason) - ctrl.syncMAPIMachineSets(reason) +// syncAll will attempt to sync all supported machine resources +func (ctrl *Controller) syncAll(event string) error { + klog.V(4).Infof("Syncing boot image controller for event: %s", event) + + // Wait for MachineConfiguration/cluster to be ready before syncing any machine resources + if err := ctrl.waitForMachineConfigurationReady(); err != nil { + ctrl.updateConditions(event, fmt.Errorf("MachineConfiguration was not ready: %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) + return err + } + + ctrl.syncControlPlaneMachineSets(event) + ctrl.syncMAPIMachineSets(event) + return nil } diff --git a/pkg/controller/bootimage/cpms_helpers.go b/pkg/controller/bootimage/cpms_helpers.go index 29d9424e0e..70811f78ae 100644 --- a/pkg/controller/bootimage/cpms_helpers.go +++ b/pkg/controller/bootimage/cpms_helpers.go @@ -23,7 +23,6 @@ import ( "k8s.io/apimachinery/pkg/types" kubeErrs "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/jsonmergepatch" - "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" ) @@ -40,30 +39,11 @@ func (ctrl *Controller) syncControlPlaneMachineSets(reason string) { return } - ctrl.cpmsSyncMutex.Lock() - defer ctrl.cpmsSyncMutex.Unlock() - - var mcop *opv1.MachineConfiguration - var pollError error - // Wait for mcop.Status to populate, otherwise error out. This shouldn't take very long - // as this is done by the operator sync loop. - if err := wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 2*time.Minute, true, func(_ context.Context) (bool, error) { - mcop, pollError = ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) - if pollError != nil { - klog.Errorf("MachineConfiguration/cluster has not been created yet") - return false, nil - } - - // Ensure status.ObservedGeneration matches the last generation of MachineConfiguration - if mcop.Generation != mcop.Status.ObservedGeneration { - klog.Errorf("MachineConfiguration.Status is not up to date.") - pollError = fmt.Errorf("MachineConfiguration.Status is not up to date") - return false, nil - } - return true, nil - }); err != nil { - klog.Errorf("MachineConfiguration was not ready: %v", pollError) - ctrl.updateConditions(reason, fmt.Errorf("MachineConfiguration was not ready: while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) + // Get MachineConfiguration to determine which resources are enrolled + mcop, err := ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) + if err != nil { + klog.Errorf("Failed to get MachineConfiguration: %v", err) + ctrl.updateConditions(reason, fmt.Errorf("failed to get MachineConfiguration while enqueueing ControlPlaneMachineSet: %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) return } diff --git a/pkg/controller/bootimage/helpers.go b/pkg/controller/bootimage/helpers.go index 41604008a6..dfd3713fa0 100644 --- a/pkg/controller/bootimage/helpers.go +++ b/pkg/controller/bootimage/helpers.go @@ -15,6 +15,7 @@ import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/labels" kruntime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" "sigs.k8s.io/yaml" @@ -129,3 +130,29 @@ func upgradeStubIgnitionIfRequired(secretName string, secretClient clientset.Int } return nil } + +// waitForMachineConfigurationReady waits for the MachineConfiguration to be ready +// by polling until the status is populated and the ObservedGeneration matches Generation. +func (ctrl *Controller) waitForMachineConfigurationReady() error { + var mcop *opv1.MachineConfiguration + var pollError error + if err := wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 2*time.Minute, true, func(_ context.Context) (bool, error) { + mcop, pollError = ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) + if pollError != nil { + klog.Errorf("MachineConfiguration/cluster has not been created yet") + return false, nil + } + + // Ensure status.ObservedGeneration matches the last generation of MachineConfiguration + if mcop.Generation != mcop.Status.ObservedGeneration { + klog.Errorf("MachineConfiguration.Status is not up to date.") + pollError = fmt.Errorf("MachineConfiguration.Status is not up to date") + return false, nil + } + return true, nil + }); err != nil { + klog.Errorf("MachineConfiguration was not ready: %v", pollError) + return pollError + } + return nil +} diff --git a/pkg/controller/bootimage/ms_helpers.go b/pkg/controller/bootimage/ms_helpers.go index 013fa81ed2..994040e72e 100644 --- a/pkg/controller/bootimage/ms_helpers.go +++ b/pkg/controller/bootimage/ms_helpers.go @@ -17,7 +17,6 @@ import ( kubeErrs "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/jsonmergepatch" "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" archtranslater "github.com/coreos/stream-metadata-go/arch" @@ -27,30 +26,11 @@ import ( // nolint:dupl // I separated this from syncControlPlaneMachineSets for readability func (ctrl *Controller) syncMAPIMachineSets(reason string) { - ctrl.mapiSyncMutex.Lock() - defer ctrl.mapiSyncMutex.Unlock() - - var mcop *opv1.MachineConfiguration - var pollError error - // Wait for mcop.Status to populate, otherwise error out. This shouldn't take very long - // as this is done by the operator sync loop. - if err := wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 2*time.Minute, true, func(_ context.Context) (bool, error) { - mcop, pollError = ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) - if pollError != nil { - klog.Errorf("MachineConfiguration/cluster has not been created yet") - return false, nil - } - - // Ensure status.ObservedGeneration matches the last generation of MachineConfiguration - if mcop.Generation != mcop.Status.ObservedGeneration { - klog.Errorf("MachineConfiguration.Status is not up to date.") - pollError = fmt.Errorf("MachineConfiguration.Status is not up to date") - return false, nil - } - return true, nil - }); err != nil { - klog.Errorf("MachineConfiguration was not ready: %v", pollError) - ctrl.updateConditions(reason, fmt.Errorf("MachineConfiguration was not ready: while enqueueing MAPI MachineSets %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) + // Get MachineConfiguration to determine which resources are enrolled + mcop, err := ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) + if err != nil { + klog.Errorf("Failed to get MachineConfiguration: %v", err) + ctrl.updateConditions(reason, fmt.Errorf("failed to get MachineConfiguration while enqueueing MAPI MachineSets: %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) return } From db8a3de08795c3a66ea6eb14e2a25951c1434039 Mon Sep 17 00:00:00 2001 From: David Date: Thu, 18 Dec 2025 12:58:16 -0500 Subject: [PATCH 4/4] msbic: skip update if arch annotation not found --- cmd/machine-config-controller/start.go | 1 + .../bootimage/boot_image_controller.go | 7 +- .../bootimage/boot_image_controller_test.go | 117 ++++++++++++++---- pkg/controller/bootimage/ms_helpers.go | 56 ++++++--- 4 files changed, 140 insertions(+), 41 deletions(-) diff --git a/cmd/machine-config-controller/start.go b/cmd/machine-config-controller/start.go index fb9bd56d86..faa7c3f791 100644 --- a/cmd/machine-config-controller/start.go +++ b/cmd/machine-config-controller/start.go @@ -153,6 +153,7 @@ func runStartCmd(_ *cobra.Command, _ []string) { ctrlctx.ConfigInformerFactory.Config().V1().Infrastructures(), ctrlctx.ClientBuilder.OperatorClientOrDie(componentName), ctrlctx.OperatorInformerFactory.Operator().V1().MachineConfigurations(), + ctrlctx.ConfigInformerFactory.Config().V1().ClusterVersions(), ctrlctx.FeatureGatesHandler, ) go bootImageController.Run(ctrlctx.Stop) diff --git a/pkg/controller/bootimage/boot_image_controller.go b/pkg/controller/bootimage/boot_image_controller.go index ea9655bfeb..1a68e0c9f9 100644 --- a/pkg/controller/bootimage/boot_image_controller.go +++ b/pkg/controller/bootimage/boot_image_controller.go @@ -53,12 +53,14 @@ type Controller struct { cpmsLister machinelistersv1.ControlPlaneMachineSetLister infraLister configlistersv1.InfrastructureLister mcopLister mcoplistersv1.MachineConfigurationLister + clusterVersionLister configlistersv1.ClusterVersionLister mcoCmListerSynced cache.InformerSynced mapiMachineSetListerSynced cache.InformerSynced cpmsListerSynced cache.InformerSynced infraListerSynced cache.InformerSynced mcopListerSynced cache.InformerSynced + clusterVersionListerSynced cache.InformerSynced queue workqueue.TypedRateLimitingInterface[string] @@ -122,6 +124,7 @@ func New( infraInformer configinformersv1.InfrastructureInformer, mcopClient mcopclientset.Interface, mcopInformer mcopinformersv1.MachineConfigurationInformer, + clusterVersionInformer configinformersv1.ClusterVersionInformer, fgHandler ctrlcommon.FeatureGatesHandler, ) *Controller { eventBroadcaster := record.NewBroadcaster() @@ -145,12 +148,14 @@ func New( ctrl.cpmsLister = cpmsInformer.Lister() ctrl.infraLister = infraInformer.Lister() ctrl.mcopLister = mcopInformer.Lister() + ctrl.clusterVersionLister = clusterVersionInformer.Lister() ctrl.mcoCmListerSynced = mcoCmInfomer.Informer().HasSynced ctrl.mapiMachineSetListerSynced = mapiMachineSetInformer.Informer().HasSynced ctrl.cpmsListerSynced = cpmsInformer.Informer().HasSynced ctrl.infraListerSynced = infraInformer.Informer().HasSynced ctrl.mcopListerSynced = mcopInformer.Informer().HasSynced + ctrl.clusterVersionListerSynced = clusterVersionInformer.Informer().HasSynced mapiMachineSetInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: ctrl.addMAPIMachineSet, @@ -192,7 +197,7 @@ func (ctrl *Controller) Run(stopCh <-chan struct{}) { defer utilruntime.HandleCrash() defer ctrl.queue.ShutDown() - if !cache.WaitForCacheSync(stopCh, ctrl.mcoCmListerSynced, ctrl.mapiMachineSetListerSynced, ctrl.infraListerSynced, ctrl.mcopListerSynced) { + if !cache.WaitForCacheSync(stopCh, ctrl.mcoCmListerSynced, ctrl.mapiMachineSetListerSynced, ctrl.infraListerSynced, ctrl.mcopListerSynced, ctrl.clusterVersionListerSynced) { return } diff --git a/pkg/controller/bootimage/boot_image_controller_test.go b/pkg/controller/bootimage/boot_image_controller_test.go index dd5609ced2..2bf92a8956 100644 --- a/pkg/controller/bootimage/boot_image_controller_test.go +++ b/pkg/controller/bootimage/boot_image_controller_test.go @@ -97,71 +97,144 @@ func setMachineSetBootImage(machineset *machinev1beta1.MachineSet, generateBootI } func TestGetArchFromMachineSet(t *testing.T) { + // Helper to create a single-arch cluster version + singleArchCV := &osconfigv1.ClusterVersion{ + Status: osconfigv1.ClusterVersionStatus{ + Desired: osconfigv1.Release{ + Architecture: "", // Empty means single-arch + }, + }, + } + // Helper to create a multi-arch cluster version + multiArchCV := &osconfigv1.ClusterVersion{ + Status: osconfigv1.ClusterVersionStatus{ + Desired: osconfigv1.Release{ + Architecture: osconfigv1.ClusterVersionArchitectureMulti, + }, + }, + } + cases := []struct { - name string - annotations map[string]string - expectedArch string - expectError bool + name string + annotations map[string]string + clusterVersion *osconfigv1.ClusterVersion + expectedArch string + expectError bool }{ { - name: "Single architecture label", + name: "Single architecture label in single-arch cluster", annotations: map[string]string{ MachineSetArchAnnotationKey: "kubernetes.io/arch=amd64", }, - expectedArch: "x86_64", - expectError: false, + clusterVersion: singleArchCV, + expectedArch: "x86_64", + expectError: false, + }, + { + name: "Single architecture label in multi-arch cluster", + annotations: map[string]string{ + MachineSetArchAnnotationKey: "kubernetes.io/arch=amd64", + }, + clusterVersion: multiArchCV, + expectedArch: "x86_64", + expectError: false, }, { name: "Multiple labels with architecture first", annotations: map[string]string{ MachineSetArchAnnotationKey: "kubernetes.io/arch=amd64,topology.ebs.csi.aws.com/zone=eu-central-1a", }, - expectedArch: "x86_64", - expectError: false, + clusterVersion: singleArchCV, + expectedArch: "x86_64", + expectError: false, }, { name: "Multiple labels with architecture last", annotations: map[string]string{ MachineSetArchAnnotationKey: "topology.ebs.csi.aws.com/zone=eu-central-1a,kubernetes.io/arch=arm64", }, - expectedArch: "aarch64", - expectError: false, + clusterVersion: singleArchCV, + expectedArch: "aarch64", + expectError: false, }, { name: "Multiple labels with architecture in middle", annotations: map[string]string{ MachineSetArchAnnotationKey: "topology.ebs.csi.aws.com/zone=eu-central-1a,kubernetes.io/arch=s390x,node.kubernetes.io/instance-type=m5.large", }, - expectedArch: "s390x", - expectError: false, + clusterVersion: singleArchCV, + expectedArch: "s390x", + expectError: false, }, { name: "Multiple labels with spaces", annotations: map[string]string{ MachineSetArchAnnotationKey: " topology.ebs.csi.aws.com/zone=eu-central-1a , kubernetes.io/arch=ppc64le , node.kubernetes.io/instance-type=m5.large ", }, - expectedArch: "ppc64le", - expectError: false, + clusterVersion: singleArchCV, + expectedArch: "ppc64le", + expectError: false, }, { name: "Invalid architecture", annotations: map[string]string{ MachineSetArchAnnotationKey: "kubernetes.io/arch=invalid-arch", }, - expectError: true, + clusterVersion: singleArchCV, + expectError: true, }, { - name: "No architecture label", + name: "No architecture label in annotation", annotations: map[string]string{ MachineSetArchAnnotationKey: "topology.ebs.csi.aws.com/zone=eu-central-1a,node.kubernetes.io/instance-type=m5.large", }, + clusterVersion: singleArchCV, + expectError: true, + }, + { + name: "No annotation in single-arch cluster defaults to control plane arch", + annotations: map[string]string{}, + clusterVersion: singleArchCV, + expectError: false, // Should default to control plane arch + }, + { + name: "No annotation in multi-arch cluster returns error", + annotations: map[string]string{}, + clusterVersion: &osconfigv1.ClusterVersion{ + Status: osconfigv1.ClusterVersionStatus{ + Desired: osconfigv1.Release{ + Architecture: osconfigv1.ClusterVersionArchitectureMulti, + }, + }, + }, expectError: true, }, { - name: "No annotation", - annotations: map[string]string{}, - expectedArch: "", // Will default to control plane arch, but we can't test that easily - expectError: false, + name: "ARM64 architecture in multi-arch cluster", + annotations: map[string]string{ + MachineSetArchAnnotationKey: "kubernetes.io/arch=arm64", + }, + clusterVersion: multiArchCV, + expectedArch: "aarch64", + expectError: false, + }, + { + name: "PPC64LE architecture in single-arch cluster", + annotations: map[string]string{ + MachineSetArchAnnotationKey: "kubernetes.io/arch=ppc64le", + }, + clusterVersion: singleArchCV, + expectedArch: "ppc64le", + expectError: false, + }, + { + name: "S390X architecture in multi-arch cluster", + annotations: map[string]string{ + MachineSetArchAnnotationKey: "kubernetes.io/arch=s390x", + }, + clusterVersion: multiArchCV, + expectedArch: "s390x", + expectError: false, }, } @@ -174,7 +247,7 @@ func TestGetArchFromMachineSet(t *testing.T) { }, } - arch, err := getArchFromMachineSet(machineSet) + arch, err := getArchFromMachineSet(machineSet, tc.clusterVersion) if tc.expectError { assert.Error(t, err, "Expected error for test case: %s", tc.name) diff --git a/pkg/controller/bootimage/ms_helpers.go b/pkg/controller/bootimage/ms_helpers.go index 994040e72e..bbb00a3108 100644 --- a/pkg/controller/bootimage/ms_helpers.go +++ b/pkg/controller/bootimage/ms_helpers.go @@ -8,6 +8,7 @@ import ( "strings" "time" + osconfigv1 "github.com/openshift/api/config/v1" machinev1beta1 "github.com/openshift/api/machine/v1beta1" opv1 "github.com/openshift/api/operator/v1" ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" @@ -113,9 +114,20 @@ func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet } } + // Fetch the ClusterVersion to determine if this is a multi-arch cluster + clusterVersion, err := ctrl.clusterVersionLister.Get("version") + if err != nil { + return fmt.Errorf("failed to fetch clusterversion during machineset sync: %v, defaulting to single-arch behavior", err) + } + // Fetch the architecture type of this machineset - arch, err := getArchFromMachineSet(machineSet) + arch, err := getArchFromMachineSet(machineSet, clusterVersion) if err != nil { + // If no architecture annotation was found, skip this machineset without erroring + // A later sync loop will pick it up once the annotation is added + if strings.Contains(err.Error(), "no architecture annotation found") { + return nil + } return fmt.Errorf("failed to fetch arch during machineset sync: %w", err) } @@ -222,29 +234,37 @@ func (ctrl *Controller) patchMachineSet(oldMachineSet, newMachineSet *machinev1b } // Returns architecture type for a given machineset -func getArchFromMachineSet(machineset *machinev1beta1.MachineSet) (arch string, err error) { +func getArchFromMachineSet(machineset *machinev1beta1.MachineSet, clusterVersion *osconfigv1.ClusterVersion) (arch string, err error) { // Valid set of machineset/node architectures validArchSet := sets.New("arm64", "s390x", "amd64", "ppc64le") // Check if the annotation enclosing arch label is present on this machineset archLabel, archLabelMatch := machineset.Annotations[MachineSetArchAnnotationKey] - if archLabelMatch { - // Parse the annotation value which may contain multiple comma-separated labels - // Example: kubernetes.io/arch=amd64,topology.ebs.csi.aws.com/zone=eu-central-1a - for label := range strings.SplitSeq(archLabel, ",") { - label = strings.TrimSpace(label) - if archLabelValue, found := strings.CutPrefix(label, ArchLabelKey); found { - // Extract just the architecture value after "kubernetes.io/arch=" - if validArchSet.Has(archLabelValue) { - return archtranslater.RpmArch(archLabelValue), nil - } - return "", fmt.Errorf("invalid architecture value found in annotation: %s", archLabelValue) + + if !archLabelMatch { + // Check if this is a multi-arch cluster + // clusterVersion should never be nil as it's validated by the caller + if clusterVersion.Status.Desired.Architecture == osconfigv1.ClusterVersionArchitectureMulti { + // For multi-arch clusters, we require the architecture annotation + klog.Errorf("No architecture annotation found on machineset %s in multi-arch cluster, skipping boot image update", machineset.Name) + return "", fmt.Errorf("no architecture annotation found on machineset %s", machineset.Name) + } + // For single-arch clusters, default to control plane architecture + klog.Infof("No architecture annotation found on machineset %s, defaulting to control plane architecture", machineset.Name) + return archtranslater.CurrentRpmArch(), nil + } + + // Parse the annotation value which may contain multiple comma-separated labels + // Example: kubernetes.io/arch=amd64,topology.ebs.csi.aws.com/zone=eu-central-1a + for label := range strings.SplitSeq(archLabel, ",") { + label = strings.TrimSpace(label) + if archLabelValue, found := strings.CutPrefix(label, ArchLabelKey); found { + // Extract just the architecture value after "kubernetes.io/arch=" + if validArchSet.Has(archLabelValue) { + return archtranslater.RpmArch(archLabelValue), nil } + return "", fmt.Errorf("invalid architecture value found in annotation: %s", archLabelValue) } - return "", fmt.Errorf("kubernetes.io/arch label not found in annotation: %s", archLabel) } - // If no arch annotation was found on the machineset, default to the control plane arch. - // return the architecture of the node running this pod, which will always be a control plane node. - klog.Infof("Defaulting to control plane architecture") - return archtranslater.CurrentRpmArch(), nil + return "", fmt.Errorf("kubernetes.io/arch label not found in annotation: %s", archLabel) }