Uploaded image for project: 'Machine Config Operator'
  1. Machine Config Operator
  2. MCO-778

OCP 4.14 vSphere cluster node stuck in Provisioned state

XMLWordPrintable

    • Icon: Bug Bug
    • Resolution: Done
    • Icon: Critical Critical
    • None
    • None
    • False
    • None
    • False
    • Critical
    • 10
    • 0

      Description of problem:

      After deploying an OCP 4.14.0-rc.0 cluster on the vSphere platform with a standard 3-worker-node configuration, only 2 worker nodes come up, while the last one is stuck in the Provisioned state.

      oc get machinepools.hive.openshift.io -n qe4-vmware-ibm
      NAME                    POOLNAME   CLUSTERDEPLOYMENT   REPLICAS
      qe4-vmware-ibm-worker   worker     qe4-vmware-ibm      3 
      
      

       

      Version-Release number of selected component (if applicable):

      2.9.0-DOWNSTREAM-2023-09-11-15-47-23

      How reproducible:

      always

      Steps to Reproduce:

      1. Provision a vSphere cluster on OCP 4.14.0-rc.0
      2.  
      3. ...

      Actual results:

      Expected results:

      Additional info:

      MachinePool CR (from ACM hub)

      apiVersion: v1
      items:
      - apiVersion: hive.openshift.io/v1
        kind: MachinePool
        metadata:
          creationTimestamp: "2023-09-12T16:01:02Z"
          finalizers:
          - hive.openshift.io/remotemachineset
          generation: 1
          name: qe4-vmware-ibm-worker
          namespace: qe4-vmware-ibm
          resourceVersion: "1120426"
          uid: a99bcbcc-e5c7-4fff-b4a9-dc84bab93630
        spec:
          clusterDeploymentRef:
            name: qe4-vmware-ibm
          name: worker
          platform:
            vsphere:
              coresPerSocket: 2
              cpus: 4
              memoryMB: 16384
              osDisk:
                diskSizeGB: 120
          replicas: 3
        status:
          conditions:
          - lastProbeTime: "2023-09-12T16:01:02Z"
            lastTransitionTime: "2023-09-12T16:01:02Z"
            message: Condition Initialized
            reason: Initialized
            status: Unknown
            type: NotEnoughReplicas
          - lastProbeTime: "2023-09-12T16:01:02Z"
            lastTransitionTime: "2023-09-12T16:01:02Z"
            message: Condition Initialized
            reason: Initialized
            status: Unknown
            type: NoMachinePoolNameLeasesAvailable
          - lastProbeTime: "2023-09-12T16:01:02Z"
            lastTransitionTime: "2023-09-12T16:01:02Z"
            message: Condition Initialized
            reason: Initialized
            status: Unknown
            type: InvalidSubnets
          - lastProbeTime: "2023-09-12T16:01:02Z"
            lastTransitionTime: "2023-09-12T16:01:02Z"
            message: Condition Initialized
            reason: Initialized
            status: Unknown
            type: UnsupportedConfiguration
          machineSets:
          - errorMessage: ""
            errorReason: ""
            maxReplicas: 3
            minReplicas: 3
            name: qe4-vmware-ibm-z4fpw-worker-0
            readyReplicas: 2
            replicas: 3
          replicas: 3
      kind: List
      metadata:
        resourceVersion: "" 

      MachineSet CR (from provisioned cluster)

      apiVersion: v1
      items:
      - apiVersion: machine.openshift.io/v1beta1
        kind: MachineSet
        metadata:
          annotations:
            machine.openshift.io/memoryMb: "16384"
            machine.openshift.io/vCPU: "4"
          creationTimestamp: "2023-09-12T16:06:57Z"
          generation: 1
          labels:
            hive.openshift.io/machine-pool: worker
            hive.openshift.io/managed: "true"
            machine.openshift.io/cluster-api-cluster: qe4-vmware-ibm-z4fpw
          name: qe4-vmware-ibm-z4fpw-worker-0
          namespace: openshift-machine-api
          resourceVersion: "31757"
          uid: bade5076-e1e2-4f5c-a6f0-f04f7f7c28c4
        spec:
          replicas: 3
          selector:
            matchLabels:
              machine.openshift.io/cluster-api-cluster: qe4-vmware-ibm-z4fpw
              machine.openshift.io/cluster-api-machineset: qe4-vmware-ibm-z4fpw-worker-0
          template:
            metadata:
              labels:
                machine.openshift.io/cluster-api-cluster: qe4-vmware-ibm-z4fpw
                machine.openshift.io/cluster-api-machine-role: worker
                machine.openshift.io/cluster-api-machine-type: worker
                machine.openshift.io/cluster-api-machineset: qe4-vmware-ibm-z4fpw-worker-0
            spec:
              lifecycleHooks: {}
              metadata: {}
              providerSpec:
                value:
                  apiVersion: machine.openshift.io/v1beta1
                  credentialsSecret:
                    name: vsphere-cloud-credentials
                  diskGiB: 120
                  kind: VSphereMachineProviderSpec
                  memoryMiB: 16384
                  metadata:
                    creationTimestamp: null
                  network:
                    devices:
                    - networkName: Public Network
                  numCPUs: 4
                  numCoresPerSocket: 2
                  snapshot: ""
                  template: qe4-vmware-ibm-z4fpw-rhcos-generated-region-generated-zone
                  userDataSecret:
                    name: worker-user-data
                  workspace:
                    datacenter: Workload Datacenter
                    datastore: /Workload Datacenter/datastore/WORKLOAD-DS-Folder/WORKLOAD-DS
                    folder: /Workload Datacenter/vm/ACM-QE
                    resourcePool: /Workload Datacenter/host/Workload Cluster//Resources
                    server: acmcicd-vcsa-01.cicd.red-chesterfield.com
        status:
          availableReplicas: 2
          fullyLabeledReplicas: 3
          observedGeneration: 1
          readyReplicas: 2
          replicas: 3
      kind: List
      metadata:
        resourceVersion: "" 

      Machines (from cluster)

      oc get machines.machine.openshift.io -n openshift-machine-api
      NAME                                  PHASE         TYPE   REGION   ZONE   AGE
      qe4-vmware-ibm-z4fpw-master-0         Running                              156m
      qe4-vmware-ibm-z4fpw-master-1         Running                              156m
      qe4-vmware-ibm-z4fpw-master-2         Running                              156m
      qe4-vmware-ibm-z4fpw-worker-0-4c6p6   Running                              146m
      qe4-vmware-ibm-z4fpw-worker-0-rtbj7   Running                              146m
      qe4-vmware-ibm-z4fpw-worker-0-svxdj   Provisioned                          146m
      
      apiVersion: machine.openshift.io/v1beta1
      kind: Machine
      metadata:
        annotations:
          machine.openshift.io/instance-state: poweredOn
        creationTimestamp: "2023-09-12T16:16:55Z"
        finalizers:
        - machine.machine.openshift.io
        generateName: qe4-vmware-ibm-z4fpw-worker-0-
        generation: 2
        labels:
          machine.openshift.io/cluster-api-cluster: qe4-vmware-ibm-z4fpw
          machine.openshift.io/cluster-api-machine-role: worker
          machine.openshift.io/cluster-api-machine-type: worker
          machine.openshift.io/cluster-api-machineset: qe4-vmware-ibm-z4fpw-worker-0
          machine.openshift.io/region: ""
          machine.openshift.io/zone: ""
        name: qe4-vmware-ibm-z4fpw-worker-0-svxdj
        namespace: openshift-machine-api
        ownerReferences:
        - apiVersion: machine.openshift.io/v1beta1
          blockOwnerDeletion: true
          controller: true
          kind: MachineSet
          name: qe4-vmware-ibm-z4fpw-worker-0
          uid: bade5076-e1e2-4f5c-a6f0-f04f7f7c28c4
        resourceVersion: "15508"
        uid: bf52b904-36bc-4de7-8c29-6c1f5504cd4c
      spec:
        lifecycleHooks: {}
        metadata: {}
        providerID: vsphere://4229df5b-0c8e-e9fe-512e-aa2d8b21a7ac
        providerSpec:
          value:
            apiVersion: machine.openshift.io/v1beta1
            credentialsSecret:
              name: vsphere-cloud-credentials
            diskGiB: 120
            kind: VSphereMachineProviderSpec
            memoryMiB: 16384
            metadata:
              creationTimestamp: null
            network:
              devices:
              - networkName: Public Network
            numCPUs: 4
            numCoresPerSocket: 2
            snapshot: ""
            template: qe4-vmware-ibm-z4fpw-rhcos-generated-region-generated-zone
            userDataSecret:
              name: worker-user-data
            workspace:
              datacenter: Workload Datacenter
              datastore: /Workload Datacenter/datastore/WORKLOAD-DS-Folder/WORKLOAD-DS
              folder: /Workload Datacenter/vm/ACM-QE
              resourcePool: /Workload Datacenter/host/Workload Cluster//Resources
              server: acmcicd-vcsa-01.cicd.red-chesterfield.com
      status:
        addresses:
        - address: qe4-vmware-ibm-z4fpw-worker-0-svxdj
          type: InternalDNS
        conditions:
        - lastTransitionTime: "2023-09-12T16:17:08Z"
          status: "True"
          type: Drainable
        - lastTransitionTime: "2023-09-12T16:20:08Z"
          status: "True"
          type: InstanceExists
        - lastTransitionTime: "2023-09-12T16:17:08Z"
          status: "True"
          type: Terminable
        lastUpdated: "2023-09-12T16:20:08Z"
        phase: Provisioned
        providerStatus:
          conditions:
          - lastTransitionTime: "2023-09-12T16:18:11Z"
            message: Machine successfully created
            reason: MachineCreationSucceeded
            status: "True"
            type: MachineCreation
          instanceId: 4229df5b-0c8e-e9fe-512e-aa2d8b21a7ac
          instanceState: poweredOn
          taskRef: task-5267166
       

      ClusterDeployment (CD) (from ACM hub)

      apiVersion: hive.openshift.io/v1
      kind: ClusterDeployment
      metadata:
        annotations:
          open-cluster-management.io/user-group: c3lzdGVtOmNsdXN0ZXItYWRtaW5zLHN5c3RlbTphdXRoZW50aWNhdGVk
          open-cluster-management.io/user-identity: a3ViZTphZG1pbg==
        creationTimestamp: "2023-09-12T16:01:03Z"
        finalizers:
        - hive.openshift.io/deprovision
        generation: 3
        labels:
          cloud: vSphere
          cluster.open-cluster-management.io/clusterset: default
          hive.openshift.io/cluster-platform: vsphere
          hive.openshift.io/cluster-region: unknown
          hive.openshift.io/version: 4.14.0-rc.0
          hive.openshift.io/version-major: "4"
          hive.openshift.io/version-major-minor: "4.14"
          hive.openshift.io/version-major-minor-patch: 4.14.0
          vendor: OpenShift
        name: qe4-vmware-ibm
        namespace: qe4-vmware-ibm
        resourceVersion: "1229449"
        uid: 9bef6ebb-0d4e-4dad-b2d6-a7214bd344d4
      spec:
        baseDomain: dev09.red-chesterfield.com
        clusterMetadata:
          adminKubeconfigSecretRef:
            name: qe4-vmware-ibm-0-q95kb-admin-kubeconfig
          adminPasswordSecretRef:
            name: qe4-vmware-ibm-0-q95kb-admin-password
          clusterID: 2068fb64-2c02-428b-b833-097da48c1fbb
          infraID: qe4-vmware-ibm-z4fpw
        clusterName: qe4-vmware-ibm
        controlPlaneConfig:
          servingCertificates: {}
        installAttemptsLimit: 1
        installed: true
        platform:
          vsphere:
            certificatesSecretRef:
              name: qe4-vmware-ibm-vsphere-certs
            cluster: /Workload Datacenter/host/Workload Cluster
            credentialsSecretRef:
              name: qe4-vmware-ibm-vsphere-creds
            datacenter: Workload Datacenter
            defaultDatastore: /Workload Datacenter/datastore/WORKLOAD-DS-Folder/WORKLOAD-DS
            folder: /Workload Datacenter/vm/ACM-QE
            network: Public Network
            vCenter: acmcicd-vcsa-01.cicd.red-chesterfield.com
        provisioning:
          imageSetRef:
            name: img4.14.0-rc.0-multi
          installConfigSecretRef:
            name: qe4-vmware-ibm-install-config
          sshPrivateKeySecretRef:
            name: qe4-vmware-ibm-ssh-private-key
        pullSecretRef:
          name: qe4-vmware-ibm-pull-secret
      status:
        apiURL: https://api.qe4-vmware-ibm.dev09.red-chesterfield.com:6443
        cliImage: quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:c6fde16873a3def595063f2ae2a7ea786207d548fae3f4a174aab181cfd8207c
        conditions:
        - lastProbeTime: "2023-09-12T16:37:36Z"
          lastTransitionTime: "2023-09-12T16:37:36Z"
          message: 'Unsupported platform: no actuator to handle it'
          reason: Unsupported
          status: "False"
          type: Hibernating
        - lastProbeTime: "2023-09-12T16:01:04Z"
          lastTransitionTime: "2023-09-12T16:01:04Z"
          message: Platform credentials passed authentication check
          reason: PlatformAuthSuccess
          status: "False"
          type: AuthenticationFailure
        - lastProbeTime: "2023-09-12T16:37:36Z"
          lastTransitionTime: "2023-09-12T16:37:36Z"
          message: Control plane certificates are present
          reason: ControlPlaneCertificatesFound
          status: "False"
          type: ControlPlaneCertificateNotFound
        - lastProbeTime: "2023-09-12T16:01:11Z"
          lastTransitionTime: "2023-09-12T16:01:11Z"
          message: Images required for cluster deployment installations are resolved
          reason: ImagesResolved
          status: "False"
          type: InstallImagesNotResolved
        - lastProbeTime: "2023-09-12T16:01:26Z"
          lastTransitionTime: "2023-09-12T16:01:26Z"
          message: Successfully launched install pod
          reason: InstallLaunchSuccessful
          status: "False"
          type: InstallLaunchError
        - lastProbeTime: "2023-09-12T16:01:08Z"
          lastTransitionTime: "2023-09-12T16:01:08Z"
          message: InstallerImage is resolved.
          reason: InstallerImageResolved
          status: "False"
          type: InstallerImageResolutionFailed
        - lastProbeTime: "2023-09-12T16:37:36Z"
          lastTransitionTime: "2023-09-12T16:37:36Z"
          message: Provision qe4-vmware-ibm-0-q95kb succeeded.
          reason: ProvisionSucceeded
          status: "False"
          type: ProvisionFailed
        - lastProbeTime: "2023-09-12T16:01:11Z"
          lastTransitionTime: "2023-09-12T16:01:11Z"
          message: Provision is not stopped
          reason: ProvisionNotStopped
          status: "False"
          type: ProvisionStopped
        - lastProbeTime: "2023-09-12T16:37:36Z"
          lastTransitionTime: "2023-09-12T16:37:36Z"
          message: Cluster is provisioned
          reason: Provisioned
          status: "True"
          type: Provisioned
        - lastProbeTime: "2023-09-12T16:37:36Z"
          lastTransitionTime: "2023-09-12T16:37:36Z"
          message: No power state actuator -- assuming running
          reason: Running
          status: "True"
          type: Ready
        - lastProbeTime: "2023-09-12T16:01:03Z"
          lastTransitionTime: "2023-09-12T16:01:03Z"
          message: no ClusterRelocates match
          reason: NoMatchingRelocates
          status: "False"
          type: RelocationFailed
        - lastProbeTime: "2023-09-12T16:01:11Z"
          lastTransitionTime: "2023-09-12T16:01:11Z"
          message: All pre-provision requirements met
          reason: AllRequirementsMet
          status: "True"
          type: RequirementsMet
        - lastProbeTime: "2023-09-12T16:37:39Z"
          lastTransitionTime: "2023-09-12T16:37:39Z"
          message: SyncSet apply is successful
          reason: SyncSetApplySuccess
          status: "False"
          type: SyncSetFailed
        - lastProbeTime: "2023-09-12T18:37:37Z"
          lastTransitionTime: "2023-09-12T16:37:37Z"
          message: cluster is reachable
          reason: ClusterReachable
          status: "False"
          type: Unreachable
        - lastProbeTime: "2023-09-12T16:01:03Z"
          lastTransitionTime: "2023-09-12T16:01:03Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: AWSPrivateLinkFailed
        - lastProbeTime: "2023-09-12T16:01:03Z"
          lastTransitionTime: "2023-09-12T16:01:03Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: AWSPrivateLinkReady
        - lastProbeTime: "2023-09-12T16:01:03Z"
          lastTransitionTime: "2023-09-12T16:01:03Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: ActiveAPIURLOverride
        - lastProbeTime: "2023-09-12T16:01:03Z"
          lastTransitionTime: "2023-09-12T16:01:03Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: ClusterInstallCompleted
        - lastProbeTime: "2023-09-12T16:01:03Z"
          lastTransitionTime: "2023-09-12T16:01:03Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: ClusterInstallFailed
        - lastProbeTime: "2023-09-12T16:01:03Z"
          lastTransitionTime: "2023-09-12T16:01:03Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: ClusterInstallRequirementsMet
        - lastProbeTime: "2023-09-12T16:01:03Z"
          lastTransitionTime: "2023-09-12T16:01:03Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: ClusterInstallStopped
        - lastProbeTime: "2023-09-12T16:01:03Z"
          lastTransitionTime: "2023-09-12T16:01:03Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: DNSNotReady
        - lastProbeTime: "2023-09-12T16:01:03Z"
          lastTransitionTime: "2023-09-12T16:01:03Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: DeprovisionLaunchError
        - lastProbeTime: "2023-09-12T16:01:03Z"
          lastTransitionTime: "2023-09-12T16:01:03Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: IngressCertificateNotFound
        installStartedTimestamp: "2023-09-12T16:01:12Z"
        installVersion: 4.14.0-rc.0
        installedTimestamp: "2023-09-12T16:37:36Z"
        installerImage: quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:e4aa8f7b1caf1a4674d463e5d96987711cd77d9e83f33b912b02441b2cc15d13
        powerState: Running
        provisionRef:
          name: qe4-vmware-ibm-0-q95kb
        webConsoleURL: https://console-openshift-console.apps.qe4-vmware-ibm.dev09.red-chesterfield.com 

            team-mco Team MCO
            rhn-support-dhuynh David Huynh
            David Huynh David Huynh
            ACM QE Team
            Votes:
            0 Vote for this issue
            Watchers:
            5 Start watching this issue

              Created:
              Updated:
              Resolved: