Skip to content

Commit ca6f8d2

Browse files
committed
operator: move pool metrics to controller
1 parent 0bc0e71 commit ca6f8d2

File tree

6 files changed

+82
-204
lines changed

6 files changed

+82
-204
lines changed

pkg/controller/common/metrics.go

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,18 +32,56 @@ var (
3232
Name: "mcc_drain_err",
3333
Help: "logs failed drain",
3434
}, []string{"node"})
35-
// MCCPoolAlert logs when the pool configuration changes in a way the user should know.
35+
36+
// MCCPoolAlert logs when the pool configuration changes in a way the user should know
3637
MCCPoolAlert = prometheus.NewGaugeVec(
3738
prometheus.GaugeOpts{
3839
Name: "mcc_pool_alert",
3940
Help: "pool status alert",
4041
}, []string{"node"})
42+
4143
// MCCSubControllerState logs the state of the subcontrollers of the MCC
4244
MCCSubControllerState = prometheus.NewGaugeVec(
4345
prometheus.GaugeOpts{
4446
Name: "mcc_sub_controller_state",
4547
Help: "state of sub-controllers in the MCC",
4648
}, []string{"subcontroller", "state", "object"})
49+
50+
// MCCState is the state of a machine config pool, reported for each node in the pool
51+
// pause, updated, updating, degraded
52+
MCCState = prometheus.NewGaugeVec(
53+
prometheus.GaugeOpts{
54+
Name: "mco_state",
55+
Help: "state of a specified pool",
56+
}, []string{"node", "pool", "state", "reason"})
57+
58+
// MCCMachineCount is the total number of nodes in the pool
59+
MCCMachineCount = prometheus.NewGaugeVec(
60+
prometheus.GaugeOpts{
61+
Name: "mco_machine_count",
62+
Help: "total number of machines in a specified pool",
63+
}, []string{"pool"})
64+
65+
// MCCUpdatedMachineCount is the number of updated machines in the pool
66+
MCCUpdatedMachineCount = prometheus.NewGaugeVec(
67+
prometheus.GaugeOpts{
68+
Name: "mco_updated_machine_count",
69+
Help: "total number of updated machines in specified pool",
70+
}, []string{"pool"})
71+
72+
// MCCDegradedMachineCount is the number of degraded machines in the pool
73+
MCCDegradedMachineCount = prometheus.NewGaugeVec(
74+
prometheus.GaugeOpts{
75+
Name: "mco_degraded_machine_count",
76+
Help: "total number of degraded machines in specified pool",
77+
}, []string{"pool"})
78+
79+
// MCCUnavailableMachineCount is the number of unavailable machines in the pool
80+
MCCUnavailableMachineCount = prometheus.NewGaugeVec(
81+
prometheus.GaugeOpts{
82+
Name: "mco_unavailable_machine_count",
83+
Help: "total number of unavailable machines in specified pool",
84+
}, []string{"pool"})
4785
)
4886

4987
func RegisterMCCMetrics() error {
@@ -52,13 +90,18 @@ func RegisterMCCMetrics() error {
5290
MCCDrainErr,
5391
MCCPoolAlert,
5492
MCCSubControllerState,
93+
MCCState,
94+
MCCMachineCount,
95+
MCCUpdatedMachineCount,
96+
MCCDegradedMachineCount,
97+
MCCUnavailableMachineCount,
5598
})
5699

57100
if err != nil {
58101
return fmt.Errorf("could not register machine-config-controller metrics: %w", err)
59102
}
60103

61-
// Initilize GuageVecs to ensure that metrics of type GuageVec are accessible from the dashboard even if without a logged value
104+
// Initialize GaugeVecs to ensure that metrics of type GaugeVec are accessible from the dashboard even if without a logged value
62105
// Solution to OCPBUGS-20427: https://issues.redhat.com/browse/OCPBUGS-20427
63106
OSImageURLOverride.WithLabelValues("initialize").Set(0)
64107
MCCDrainErr.WithLabelValues("initialize").Set(0)

pkg/controller/node/node_controller.go

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1319,7 +1319,13 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
13191319
}
13201320
ctrlcommon.UpdateStateMetric(ctrlcommon.MCCSubControllerState, "machine-config-controller-node", "Sync Machine Config Pool", pool.Name)
13211321
}
1322-
return ctrl.syncStatusOnly(pool)
1322+
1323+
if err := ctrl.syncStatusOnly(pool); err != nil {
1324+
return err
1325+
}
1326+
1327+
// Update metrics after syncing the pool status
1328+
return ctrl.syncMetrics()
13231329
}
13241330

13251331
// checkIfNodeHasInProgressTaint checks if the given node has in progress taint
@@ -1708,3 +1714,33 @@ func (ctrl *Controller) isConfigOrBuildPresent(mosc *mcfgv1.MachineOSConfig, mos
17081714
// isConfigAndBuildPresent reports whether both the MachineOSConfig and the
// MachineOSBuild are non-nil.
func (ctrl *Controller) isConfigAndBuildPresent(mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild) bool {
	if mosc == nil {
		return false
	}
	return mosb != nil
}
1717+
1718+
// syncMetrics updates the metrics for all pools
1719+
func (ctrl *Controller) syncMetrics() error {
1720+
pools, err := ctrl.mcpLister.List(labels.Everything())
1721+
if err != nil {
1722+
return err
1723+
}
1724+
// set metrics per pool, we need to get the latest condition to log for the state
1725+
var latestTime metav1.Time
1726+
latestTime.Time = time.Time{}
1727+
var cond mcfgv1.MachineConfigPoolCondition
1728+
for _, pool := range pools {
1729+
for _, condition := range pool.Status.Conditions {
1730+
if condition.Status == corev1.ConditionTrue && condition.LastTransitionTime.After(latestTime.Time) {
1731+
cond = condition
1732+
latestTime = cond.LastTransitionTime
1733+
}
1734+
}
1735+
1736+
nodes, _ := helpers.GetNodesForPool(ctrl.mcpLister, ctrl.nodeLister, pool)
1737+
for _, node := range nodes {
1738+
ctrlcommon.MCCState.WithLabelValues(node.Name, pool.Name, string(cond.Type), cond.Reason).SetToCurrentTime()
1739+
}
1740+
ctrlcommon.MCCMachineCount.WithLabelValues(pool.Name).Set(float64(pool.Status.MachineCount))
1741+
ctrlcommon.MCCUpdatedMachineCount.WithLabelValues(pool.Name).Set(float64(pool.Status.UpdatedMachineCount))
1742+
ctrlcommon.MCCDegradedMachineCount.WithLabelValues(pool.Name).Set(float64(pool.Status.DegradedMachineCount))
1743+
ctrlcommon.MCCUnavailableMachineCount.WithLabelValues(pool.Name).Set(float64(pool.Status.UnavailableMachineCount))
1744+
}
1745+
return nil
1746+
}

pkg/operator/metrics.go

Lines changed: 0 additions & 64 deletions
This file was deleted.

pkg/operator/operator_test.go

Lines changed: 0 additions & 98 deletions
This file was deleted.

pkg/operator/status.go

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77
"reflect"
88
"sort"
99
"strings"
10-
"time"
1110

1211
mcfgv1 "github.com/openshift/api/machineconfiguration/v1"
1312

@@ -26,7 +25,6 @@ import (
2625
"github.com/openshift/machine-config-operator/pkg/apihelpers"
2726
ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common"
2827
kcc "github.com/openshift/machine-config-operator/pkg/controller/kubelet-config"
29-
"github.com/openshift/machine-config-operator/pkg/helpers"
3028
)
3129

3230
// syncVersion handles reporting the version to the clusteroperator
@@ -296,35 +294,6 @@ func (optr *Operator) syncUpgradeableStatus(co *configv1.ClusterOperator) error
296294
return nil
297295
}
298296

299-
func (optr *Operator) syncMetrics() error {
300-
pools, err := optr.mcpLister.List(labels.Everything())
301-
if err != nil {
302-
return err
303-
}
304-
// set metrics per pool, we need to get the latest condition to log for the state
305-
var latestTime metav1.Time
306-
latestTime.Time = time.Time{}
307-
var cond mcfgv1.MachineConfigPoolCondition
308-
for _, pool := range pools {
309-
for _, condition := range pool.Status.Conditions {
310-
if condition.Status == corev1.ConditionTrue && condition.LastTransitionTime.After(latestTime.Time) {
311-
cond = condition
312-
latestTime = cond.LastTransitionTime
313-
}
314-
}
315-
316-
nodes, _ := helpers.GetNodesForPool(optr.mcpLister, optr.nodeLister, pool)
317-
for _, node := range nodes {
318-
mcoState.WithLabelValues(node.Name, pool.Name, string(cond.Type), cond.Reason).SetToCurrentTime()
319-
}
320-
mcoMachineCount.WithLabelValues(pool.Name).Set(float64(pool.Status.MachineCount))
321-
mcoUpdatedMachineCount.WithLabelValues(pool.Name).Set(float64(pool.Status.UpdatedMachineCount))
322-
mcoDegradedMachineCount.WithLabelValues(pool.Name).Set(float64(pool.Status.DegradedMachineCount))
323-
mcoUnavailableMachineCount.WithLabelValues(pool.Name).Set(float64(pool.Status.UnavailableMachineCount))
324-
}
325-
return nil
326-
}
327-
328297
func (optr *Operator) syncClusterFleetEvaluation(co *configv1.ClusterOperator) error {
329298

330299
unexpectedEvaluations, err := optr.generateClusterFleetEvaluations()

pkg/operator/sync.go

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -231,10 +231,6 @@ func (optr *Operator) syncAll(syncFuncs []syncFunc) error {
231231
return fmt.Errorf("error updating cluster operator status: %w", syncClusterFleetEvaluationErr)
232232
}
233233

234-
if err := optr.syncMetrics(); err != nil {
235-
return fmt.Errorf("error syncing metrics: %w", err)
236-
}
237-
238234
if optr.inClusterBringup && syncErr.err == nil {
239235
klog.Infof("Initialization complete")
240236
optr.inClusterBringup = false
@@ -1673,10 +1669,6 @@ func (optr *Operator) syncRequiredMachineConfigPools(config *renderConfig, co *c
16731669

16741670
// Let's start with a 10 minute timeout per "required" node.
16751671
if err := wait.PollUntilContextTimeout(ctx, time.Second, time.Duration(requiredMachineCount*10)*time.Minute, false, func(_ context.Context) (bool, error) {
1676-
if err := optr.syncMetrics(); err != nil {
1677-
return false, err
1678-
}
1679-
16801672
if lastErr != nil {
16811673
// In this case, only the status extension field is updated.
16821674
newCOStatus := co.Status.DeepCopy()

0 commit comments

Comments
 (0)