openshift · hongkailiu · Oct 29, 2025 · Oct 29, 2025
diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go
@@ -45,6 +45,7 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
 	isUpgrade := platformidentification.DidUpgradeHappenDuringCollection(finalIntervals, time.Time{}, time.Time{})
 	if isUpgrade {
 		junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
+		junits = append(junits, clusterOperatorIsNotProgressingWhenMachineConfigIs(finalIntervals)...)
 	} else {
 		junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
 	}

diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
@@ -6,21 +6,21 @@ import (
 	"strings"
 	"time"
 
-	"github.com/openshift/origin/pkg/monitortestlibrary/utility"
+	configv1 "github.com/openshift/api/config/v1"
+	clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
+	"github.com/sirupsen/logrus"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
 
-	"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer"
-	"github.com/sirupsen/logrus"
-
-	configv1 "github.com/openshift/api/config/v1"
-	clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
 	"github.com/openshift/origin/pkg/monitor/monitorapi"
 	"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
 	platformidentification2 "github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
+	"github.com/openshift/origin/pkg/monitortestlibrary/utility"
+	"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer"
 	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
 	exutil "github.com/openshift/origin/test/extended/util"
-	"k8s.io/client-go/rest"
 )
 
 // exceptionCallback consumes a suspicious condition and returns an
@@ -516,9 +516,6 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
 	for _, conditionType := range conditionTypes {
 		for _, operatorName := range platformidentification.KnownOperators.List() {
 			bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName)
-			if bzComponent == "Unknown" {
-				bzComponent = operatorName
-			}
 			testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should not change condition/%v", bzComponent, operatorName, conditionType)
 			operatorEvents := eventsByOperator[operatorName]
 			if len(operatorEvents) == 0 {
@@ -586,6 +583,9 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
 			}
 
 			if len(fatal) > 0 || len(excepted) > 0 {
+				// add a failure so we
+				// either flake (or pass) in case len(fatal) == 0 by adding a success to the same test
+				// or fail in case len(fatal) > 0 by leaving the failure as the only output for the test
 				ret = append(ret, &junitapi.JUnitTestCase{
 					Name:      testName,
 					Duration:  duration,
@@ -597,7 +597,204 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
 			}
 
 			if len(fatal) == 0 {
+				if len(excepted) > 0 {
+					// add a success so we flake (or pass) and don't fail
+					ret = append(ret, &junitapi.JUnitTestCase{Name: testName, SystemOut: "Passing the case to make the overall test case flake as the previous failure is expected"})
+				} else {
+					ret = append(ret, &junitapi.JUnitTestCase{Name: testName})
+				}
+			}
+		}
+	}
+
+	return ret
+}
+
+func clusterOperatorIsNotProgressingWhenMachineConfigIs(events monitorapi.Intervals) []*junitapi.JUnitTestCase {
+	var ret []*junitapi.JUnitTestCase
+	upgradeWindows := getUpgradeWindows(events)
+
+	var machineConfigProgressingStart time.Time
+	var eventsInUpgradeWindows monitorapi.Intervals
+
+	var start, stop time.Time
+	for _, event := range events {
+		if !isInUpgradeWindow(upgradeWindows, event) {
+			continue
+		}
+		eventsInUpgradeWindows = append(eventsInUpgradeWindows, event)
+		if start.IsZero() || event.From.Before(start) {
+			start = event.From
+		}
+		if stop.IsZero() || event.To.After(stop) {
+			stop = event.To
+		}
+	}
+	duration := stop.Sub(start).Seconds()
+
+	eventsByOperator := getEventsByOperator(eventsInUpgradeWindows)
+	for _, mcEvent := range eventsByOperator["machine-config"] {
+		condition := monitorapi.GetOperatorConditionStatus(mcEvent)
+		if condition == nil {
+			continue // ignore non-condition intervals
+		}
+		if condition.Type == configv1.OperatorProgressing && condition.Status == configv1.ConditionTrue {
+			machineConfigProgressingStart = mcEvent.To
+			break
+		}
+	}
+
+	mcTestCase := &junitapi.JUnitTestCase{
+		Name:     fmt.Sprintf("[bz-Machine Config Operator] clusteroperator/machine-config must go Progressing=True during an upgrade test"),
+		Duration: duration,
+	}
+	if machineConfigProgressingStart.IsZero() {
+		mcTestCase.FailureOutput = &junitapi.FailureOutput{
+			Output: fmt.Sprintf("machine-config was never Progressing=True during the upgrade window from %s to %s", start.Format(time.RFC3339), stop.Format(time.RFC3339)),
+		}
+		return []*junitapi.JUnitTestCase{mcTestCase}
+	} else {
+		mcTestCase.SystemOut = fmt.Sprintf("machine-config became Progressing=True at %s during the upgrade window from %s to %s", machineConfigProgressingStart.Format(time.RFC3339), start.Format(time.RFC3339), stop.Format(time.RFC3339))
+	}
+	ret = append(ret, mcTestCase)
+
+	for _, operatorName := range platformidentification.KnownOperators.Difference(sets.NewString("machine-config")).List() {
+		bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName)
+		testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should stay Progressing=False while MCO is Progressing=True", bzComponent, operatorName)
+		operatorEvents := eventsByOperator[operatorName]
+		if len(operatorEvents) == 0 {
+			ret = append(ret, &junitapi.JUnitTestCase{
+				Name:     testName,
+				Duration: duration,
+			})
+			continue
+		}
+
+		except := func(co string, reason string) string {
+			switch co {
+			case "csi-snapshot-controller":
+				if reason == "CSISnapshotController_Deploying" {
+					return "https://issues.redhat.com/browse/OCPBUGS-62624"
+				}
+			case "dns":
+				if reason == "DNSReportsProgressingIsTrue" {
+					return "https://issues.redhat.com/browse/OCPBUGS-62623"
+				}
+			case "image-registry":
+				if reason == "NodeCADaemonUnavailable::Ready" || reason == "DeploymentNotCompleted" {
+					return "https://issues.redhat.com/browse/OCPBUGS-62626"
+				}
+			case "ingress":
+				if reason == "Reconciling" {
+					return "https://issues.redhat.com/browse/OCPBUGS-62627"
+				}
+			case "kube-storage-version-migrator":
+				if reason == "KubeStorageVersionMigrator_Deploying" {
+					return "https://issues.redhat.com/browse/OCPBUGS-62629"
+				}
+			case "network":
+				if reason == "Deploying" {
+					return "https://issues.redhat.com/browse/OCPBUGS-62630"
+				}
+			case "node-tuning":
+				if reason == "Reconciling" || reason == "ProfileProgressing" {
+					return "https://issues.redhat.com/browse/OCPBUGS-62632"
+				}
+			case "openshift-controller-manager":
+				// _DesiredStateNotYetAchieved
+				// RouteControllerManager_DesiredStateNotYetAchieved
+				if strings.HasSuffix(reason, "_DesiredStateNotYetAchieved") {
+					return "https://issues.redhat.com/browse/OCPBUGS-63116"
+				}
+			case "service-ca":
+				if reason == "_ManagedDeploymentsAvailable" {
+					return "https://issues.redhat.com/browse/OCPBUGS-62633"
+				}
+			case "storage":
+				// GCPPDCSIDriverOperatorCR_GCPPDDriverControllerServiceController_Deploying
+				// GCPPDCSIDriverOperatorCR_GCPPDDriverNodeServiceController_Deploying
+				// AWSEBSCSIDriverOperatorCR_AWSEBSDriverNodeServiceController_Deploying
+				// VolumeDataSourceValidatorDeploymentController_Deploying
+				// GCPPD_Deploying
+				// AWSEBS_Deploying
+				if strings.HasSuffix(reason, "_Deploying") {
+					return "https://issues.redhat.com/browse/OCPBUGS-62634"
+				}
+			case "olm":
+				// CatalogdDeploymentCatalogdControllerManager_Deploying
+				// OperatorcontrollerDeploymentOperatorControllerControllerManager_Deploying
+				if strings.HasSuffix(reason, "ControllerManager_Deploying") {
+					return "https://issues.redhat.com/browse/OCPBUGS-62635"
+				}
+			case "operator-lifecycle-manager-packageserver":
+				if reason == "" {
+					return "https://issues.redhat.com/browse/OCPBUGS-63672"
+				}
+			}
+			return ""
+		}
+
+		var excepted, fatal []string
+		for _, operatorEvent := range operatorEvents {
+			if operatorEvent.From.Before(machineConfigProgressingStart) {
+				continue
+			}
+			condition := monitorapi.GetOperatorConditionStatus(operatorEvent)
+			if condition == nil {
+				continue // ignore non-condition intervals
+			}
+			if condition.Type == "" {
+				fatal = append(fatal, fmt.Sprintf("failed to convert %v into a condition with a type", operatorEvent))
+				continue
+			}
+
+			if condition.Type != configv1.OperatorProgressing || condition.Status == configv1.ConditionFalse {
+				continue
+			}
+
+			// if there was any switch, it was wrong/unexpected at some point
+			failure := fmt.Sprintf("%v", operatorEvent)
+
+			exception := except(operatorName, condition.Reason)
+			if exception == "" {
+				fatal = append(fatal, failure)
+			} else {
+				excepted = append(excepted, fmt.Sprintf("%s (exception: %s)", failure, exception))
+			}
+		}
+
+		output := fmt.Sprintf("%d (out of %d) unexpected clusteroperator state transitions while machine-config is progressing during the upgrade window from %s to %s", len(fatal), len(operatorEvents), start.Format(time.RFC3339), stop.Format(time.RFC3339))
+		if len(fatal) > 0 {
+			output = fmt.Sprintf("%s.  These did not match any known exceptions, so they cause this test-case to fail:\n\n%v\n", output, strings.Join(fatal, "\n"))
+		} else {
+			output = fmt.Sprintf("%s, as desired.", output)
+		}
+		output = fmt.Sprintf("%s\n%d unwelcome but acceptable clusteroperator state transitions while machine-config is progressing during the upgrade window from %s to %s", output, len(excepted), start.Format(time.RFC3339), stop.Format(time.RFC3339))
+		if len(excepted) > 0 {
+			output = fmt.Sprintf("%s.  These should not happen, but because they are tied to exceptions, the fact that they did happen is not sufficient to cause this test-case to fail:\n\n%v\n", output, strings.Join(excepted, "\n"))
+		} else {
+			output = fmt.Sprintf("%s, as desired.", output)
+		}
+
+		if len(fatal) > 0 || len(excepted) > 0 {
+			// add a failure so we
+			// either flake (or pass) in case len(fatal) == 0 by adding a success to the same test
+			// or fail in case len(fatal) > 0 by leaving the failure as the only output for the test
+			ret = append(ret, &junitapi.JUnitTestCase{
+				Name:      testName,
+				Duration:  duration,
+				SystemOut: output,
+				FailureOutput: &junitapi.FailureOutput{
+					Output: output,
+				},
+			})
+		}
+
+		if len(fatal) == 0 {
+			if len(excepted) > 0 {
 				// add a success so we flake (or pass) and don't fail
+				ret = append(ret, &junitapi.JUnitTestCase{Name: testName, SystemOut: "Passing the case to make the overall test case flake as the previous failure is expected"})
+			} else {
 				ret = append(ret, &junitapi.JUnitTestCase{Name: testName})
 			}
 		}