Uploaded image for project: 'OpenShift Bugs'
  1. OpenShift Bugs
  2. OCPBUGS-9070

CVO hotloops on CronJob openshift-operator-lifecycle-manager/collect-profiles

XMLWordPrintable

    • Moderate
    • None
    • 3
    • OTA 233, OTA 234, OTA 235
    • 3
    • Unspecified

      Description of problem:

      In a freshly installed cluster, we can see hot-looping on CronJob openshift-operator-lifecycle-manager/collect-profiles (and also on Service openshift-monitoring/cluster-monitoring-operator).

      # grep -o 'Updating .*due to diff' cvo2.log | sort | uniq -c
        18 Updating CronJob openshift-operator-lifecycle-manager/collect-profiles due to diff
        12 Updating Service openshift-monitoring/cluster-monitoring-operator due to diff

      Looking at the CronJob hot-looping

      # grep -A60 'Updating CronJob openshift-operator-lifecycle-manager/collect-profiles due to diff' cvo2.log | tail -n61
      I0110 06:32:44.489277       1 generic.go:109] Updating CronJob openshift-operator-lifecycle-manager/collect-profiles due to diff:   &unstructured.Unstructured{
        	Object: map[string]interface{}{
        		"apiVersion": string("batch/v1"),
        		"kind":       string("CronJob"),
        		"metadata":   map[string]interface{}{"annotations": map[string]interface{}{"include.release.openshift.io/ibm-cloud-managed": string("true"), "include.release.openshift.io/self-managed-high-availability": string("true")}, "creationTimestamp": string("2022-01-10T04:35:19Z"), "generation": int64(1), "managedFields": []interface{}{map[string]interface{}{"apiVersion": string("batch/v1"), "fieldsType": string("FieldsV1"), "fieldsV1": map[string]interface{}{"f:metadata": map[string]interface{}{"f:annotations": map[string]interface{}{".": map[string]interface{}{}, "f:include.release.openshift.io/ibm-cloud-managed": map[string]interface{}{}, "f:include.release.openshift.io/self-managed-high-availability": map[string]interface{}{}}, "f:ownerReferences": map[string]interface{}{".": map[string]interface{}{}, `k:{"uid":"334d6c04-126d-4271-96ec-d303e93b7d1c"}`: map[string]interface{}{}}}, "f:spec": map[string]interface{}{"f:concurrencyPolicy": map[string]interface{}{}, "f:failedJobsHistoryLimit": map[string]interface{}{}, "f:jobTemplate": map[string]interface{}{"f:spec": map[string]interface{}{"f:template": map[string]interface{}{"f:spec": map[string]interface{}{"f:containers": map[string]interface{}{`k:{"name":"collect-profiles"}`: map[string]interface{}{".": map[string]interface{}{}, "f:args": map[string]interface{}{}, "f:command": map[string]interface{}{}, "f:image": map[string]interface{}{}, ...}}, "f:dnsPolicy": map[string]interface{}{}, "f:priorityClassName": map[string]interface{}{}, "f:restartPolicy": map[string]interface{}{}, ...}}}}, "f:schedule": map[string]interface{}{}, ...}}, "manager": string("cluster-version-operator"), ...}, map[string]interface{}{"apiVersion": string("batch/v1"), "fieldsType": string("FieldsV1"), "fieldsV1": map[string]interface{}{"f:status": map[string]interface{}{"f:lastScheduleTime": map[string]interface{}{}, "f:lastSuccessfulTime": map[string]interface{}{}}}, "manager": string("kube-controller-manager"), ...}}, ...},
        		"spec": map[string]interface{}{
      + 			"concurrencyPolicy":      string("Allow"),
      + 			"failedJobsHistoryLimit": int64(1),
        			"jobTemplate": map[string]interface{}{
      + 				"metadata": map[string]interface{}{"creationTimestamp": nil},
        				"spec": map[string]interface{}{
        					"template": map[string]interface{}{
      + 						"metadata": map[string]interface{}{"creationTimestamp": nil},
        						"spec": map[string]interface{}{
        							"containers": []interface{}{
        								map[string]interface{}{
        									... // 4 identical entries
        									"name":                     string("collect-profiles"),
        									"resources":                map[string]interface{}{"requests": map[string]interface{}{"cpu": string("10m"), "memory": string("80Mi")}},
      + 									"terminationMessagePath":   string("/dev/termination-log"),
      + 									"terminationMessagePolicy": string("File"),
        									"volumeMounts":             []interface{}{map[string]interface{}{"mountPath": string("/etc/config"), "name": string("config-volume")}, map[string]interface{}{"mountPath": string("/var/run/secrets/serving-cert"), "name": string("secret-volume")}},
        								},
        							},
      + 							"dnsPolicy":                     string("ClusterFirst"),
        							"priorityClassName":             string("openshift-user-critical"),
        							"restartPolicy":                 string("Never"),
      + 							"schedulerName":                 string("default-scheduler"),
      + 							"securityContext":               map[string]interface{}{},
      + 							"serviceAccount":                string("collect-profiles"),
        							"serviceAccountName":            string("collect-profiles"),
      + 							"terminationGracePeriodSeconds": int64(30),
        							"volumes": []interface{}{
        								map[string]interface{}{
        									"configMap": map[string]interface{}{
      + 										"defaultMode": int64(420),
        										"name":        string("collect-profiles-config"),
        									},
        									"name": string("config-volume"),
        								},
        								map[string]interface{}{
        									"name": string("secret-volume"),
        									"secret": map[string]interface{}{
      + 										"defaultMode": int64(420),
        										"secretName":  string("pprof-cert"),
        									},
        								},
        							},
        						},
        					},
        				},
        			},
        			"schedule":                   string("*/15 * * * *"),
      + 			"successfulJobsHistoryLimit": int64(3),
      + 			"suspend":                    bool(false),
        		},
        		"status": map[string]interface{}{"lastScheduleTime": string("2022-01-10T06:30:00Z"), "lastSuccessfulTime": string("2022-01-10T06:30:11Z")},
        	},
        }
      I0110 06:32:44.499764       1 sync_worker.go:771] Done syncing for cronjob "openshift-operator-lifecycle-manager/collect-profiles" (574 of 765)
      I0110 06:32:44.499814       1 sync_worker.go:759] Running sync for deployment "openshift-operator-lifecycle-manager/olm-operator" (575 of 765)
      

      Extract the manifest:

      # cat 0000_50_olm_07-collect-profiles.cronjob.yaml
      apiVersion: batch/v1
      kind: CronJob
      metadata:
        annotations:
          include.release.openshift.io/ibm-cloud-managed: "true"
          include.release.openshift.io/self-managed-high-availability: "true"
        name: collect-profiles
        namespace: openshift-operator-lifecycle-manager
      spec:
        schedule: "*/15 * * * *"
        jobTemplate:
          spec:
            template:
              spec:
                serviceAccountName: collect-profiles
                priorityClassName: openshift-user-critical
                containers:
                  - name: collect-profiles
                    image: quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:2a8d116943a7c1eb32cd161a0de5cb173713724ff419a03abe0382a2d5d9c9a7
                    imagePullPolicy: IfNotPresent
                    command:
                      - bin/collect-profiles
                    args:
                      - -n
                      - openshift-operator-lifecycle-manager
                      - --config-mount-path
                      - /etc/config
                      - --cert-mount-path
                      - /var/run/secrets/serving-cert
                      - olm-operator-heap-:https://olm-operator-metrics:8443/debug/pprof/heap
                      - catalog-operator-heap-:https://catalog-operator-metrics:8443/debug/pprof/heap
                    volumeMounts:
                      - mountPath: /etc/config
                        name: config-volume
                      - mountPath: /var/run/secrets/serving-cert
                        name: secret-volume
                    resources:
                      requests:
                        cpu: 10m
                        memory: 80Mi
                volumes:
                  - name: config-volume
                    configMap:
                      name: collect-profiles-config
                  - name: secret-volume
                    secret:
                      secretName: pprof-cert
                restartPolicy: Never
      

      Looking at the in-cluster object:

      # oc get cronjob.batch/collect-profiles -oyaml -n openshift-operator-lifecycle-manager
      apiVersion: batch/v1
      kind: CronJob
      metadata:
        annotations:
          include.release.openshift.io/ibm-cloud-managed: "true"
          include.release.openshift.io/self-managed-high-availability: "true"
        creationTimestamp: "2022-01-10T04:35:19Z"
        generation: 1
        name: collect-profiles
        namespace: openshift-operator-lifecycle-manager
        ownerReferences:
        - apiVersion: config.openshift.io/v1
          kind: ClusterVersion
          name: version
          uid: 334d6c04-126d-4271-96ec-d303e93b7d1c
        resourceVersion: "450801"
        uid: d0b92cd3-3213-466c-921c-d4c4c77f7a6b
      spec:
        concurrencyPolicy: Allow
        failedJobsHistoryLimit: 1
        jobTemplate:
          metadata:
            creationTimestamp: null
          spec:
            template:
              metadata:
                creationTimestamp: null
              spec:
                containers:
                - args:
                  - -n
                  - openshift-operator-lifecycle-manager
                  - --config-mount-path
                  - /etc/config
                  - --cert-mount-path
                  - /var/run/secrets/serving-cert
                  - olm-operator-heap-:https://olm-operator-metrics:8443/debug/pprof/heap
                  - catalog-operator-heap-:https://catalog-operator-metrics:8443/debug/pprof/heap
                  command:
                  - bin/collect-profiles
                  image: quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:2a8d116943a7c1eb32cd161a0de5cb173713724ff419a03abe0382a2d5d9c9a7
                  imagePullPolicy: IfNotPresent
                  name: collect-profiles
                  resources:
                    requests:
                      cpu: 10m
                      memory: 80Mi
                  terminationMessagePath: /dev/termination-log
                  terminationMessagePolicy: File
                  volumeMounts:
                  - mountPath: /etc/config
                    name: config-volume
                  - mountPath: /var/run/secrets/serving-cert
                    name: secret-volume
                dnsPolicy: ClusterFirst
                priorityClassName: openshift-user-critical
                restartPolicy: Never
                schedulerName: default-scheduler
                securityContext: {}
                serviceAccount: collect-profiles
                serviceAccountName: collect-profiles
                terminationGracePeriodSeconds: 30
                volumes:
                - configMap:
                    defaultMode: 420
                    name: collect-profiles-config
                  name: config-volume
                - name: secret-volume
                  secret:
                    defaultMode: 420
                    secretName: pprof-cert
        schedule: '*/15 * * * *'
        successfulJobsHistoryLimit: 3
        suspend: false
      status:
        lastScheduleTime: "2022-01-11T03:00:00Z"
        lastSuccessfulTime: "2022-01-11T03:00:07Z"
      

      Version-Release number of the following components:
      4.10.0-0.nightly-2022-01-09-195852

      How reproducible:
      1/1

      Steps to Reproduce:
      1. Install a 4.10 cluster
      2. Grep 'Updating .*due to diff' in the CVO log to check for hot-looping
      3.

      Actual results:
      CVO hotloops on CronJob openshift-operator-lifecycle-manager/collect-profiles

      Expected results:
      CVO should not hotloop on it in a freshly installed cluster

      Additional info:
      attachment 1850058 CVO log file

              dhurta@redhat.com David Hurta
              yanyang@redhat.com Yang Yang
              Evgeni Vakhonin Evgeni Vakhonin
              Red Hat Employee
              Votes:
              0 Vote for this issue
              Watchers:
              12 Start watching this issue

                Created:
                Updated:
                Resolved: