Uploaded image for project: 'OpenShift Hive'
  1. OpenShift Hive
  2. HIVE-2518

Unable to scale vSphere Machine Pool and machine set replicas for OCP 4.14, 4.15

XMLWordPrintable

    • False
    • None
    • False
    • Critical

      Description of problem:

      When a cluster is deployed on OCP 4.14 or 4.15 via Hive, the MachinePool is not properly connected to the MachineSet replicas on the cluster:

      Though we can see the nodes:

      Works with OCP 4.16.0-rc.2

      Version-Release number of selected component (if applicable):

      ACM 2.11.0-DOWNSTREAM-2024-05-20-22-50-29

      OCP 4.14.25
      OCP 4.15.14

      How reproducible:

      always

      Steps to Reproduce:

      1. provision a vSphere Hive cluster on either 4.14 or 4.15
      2. check the machine pools for the cluster
      3. attempt to scale the machine pool and notice that nothing happens

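      Actual results:

      The MachineSet on the cluster stays at 3 replicas even though the MachinePool spec requests 4 replicas (see the MachinePool and MachineSet dumps under "Additional info").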

      Expected results:

      machine pool works as expected

      Additional info:

      On the hub:

      install-config:

      apiVersion: v1
      metadata:
        name: 'qe5-vmware-ibm'
      baseDomain: dev09.red-chesterfield.com
      controlPlane:
        hyperthreading: Enabled
        name: master
        replicas: 3
        platform:
          vsphere:
            cpus:  4
            coresPerSocket:  2
            memoryMB:  16384
            osDisk:
              diskSizeGB: 120
      compute:
      - hyperthreading: Enabled
        name: 'worker'
        replicas: 3
        platform:
          vsphere:
            cpus:  4
            coresPerSocket:  2
            memoryMB:  16384
            osDisk:
              diskSizeGB: 120
      networking:
        networkType: OVNKubernetes
        clusterNetwork:
        - cidr: 10.128.0.0/14
          hostPrefix: 23
        machineNetwork:
        - cidr: 150.240.0.0/16
        serviceNetwork:
        - 172.30.0.0/16
      platform:
        vsphere:
          vCenter: xxx.cicd.red-chesterfield.com
          username: xxx
          password: yyy
          datacenter: Workload Datacenter
          defaultDatastore: /Workload Datacenter/datastore/WORKLOAD-DS-Folder/WORKLOAD-DS
          cluster: /Workload Datacenter/host/Workload Cluster
          apiVIP: 150.240.xx.xxx
          ingressVIP: 150.240.xx.xxx
          network: 'Public Network'
          diskType: 'thin'
          folder: '/Workload Datacenter/vm/ACM-QE'
      fips: true
      pullSecret: "" # skip, hive will inject based on its secrets
      sshKey: |-

      ClusterDeployment:

      apiVersion: hive.openshift.io/v1
      kind: ClusterDeployment
      metadata:
        annotations:
          open-cluster-management.io/user-group: c3lzdGVtOmNsdXN0ZXItYWRtaW5zLHN5c3RlbTphdXRoZW50aWNhdGVk
          open-cluster-management.io/user-identity: a3ViZTphZG1pbg==
        creationTimestamp: "2024-05-22T07:15:35Z"
        finalizers:
        - hive.openshift.io/deprovision
        generation: 3
        labels:
          cloud: vSphere
          cluster.open-cluster-management.io/clusterset: default
          hive.openshift.io/cluster-platform: vsphere
          hive.openshift.io/cluster-region: unknown
          hive.openshift.io/version: 4.15.14
          hive.openshift.io/version-major: "4"
          hive.openshift.io/version-major-minor: "4.15"
          hive.openshift.io/version-major-minor-patch: 4.15.14
          vendor: OpenShift
        name: qe5-vmware-ibm
        namespace: qe5-vmware-ibm
        resourceVersion: "1609565"
        uid: 16c87e0d-4608-41ae-8e4a-667bbc3396f4
      spec:
        baseDomain: dev09.red-chesterfield.com
        clusterMetadata:
          adminKubeconfigSecretRef:
            name: qe5-vmware-ibm-0-g9rj8-admin-kubeconfig
          adminPasswordSecretRef:
            name: qe5-vmware-ibm-0-g9rj8-admin-password
          clusterID: 87c9391b-d864-46ac-9609-906ae1f8ab5e
          infraID: qe5-vmware-ibm-9n4bx
        clusterName: qe5-vmware-ibm
        controlPlaneConfig:
          servingCertificates: {}
        installAttemptsLimit: 1
        installed: true
        platform:
          vsphere:
            certificatesSecretRef:
              name: qe5-vmware-ibm-vsphere-certs
            cluster: /Workload Datacenter/host/Workload Cluster
            credentialsSecretRef:
              name: qe5-vmware-ibm-vsphere-creds
            datacenter: Workload Datacenter
            defaultDatastore: /Workload Datacenter/datastore/WORKLOAD-DS-Folder/WORKLOAD-DS
            folder: /Workload Datacenter/vm/ACM-QE
            network: Public Network
            vCenter: acmcicd-vcsa-01.cicd.red-chesterfield.com
        provisioning:
          imageSetRef:
            name: img4.15.14-multi-appsub
          installConfigSecretRef:
            name: qe5-vmware-ibm-install-config
          sshPrivateKeySecretRef:
            name: qe5-vmware-ibm-ssh-private-key
        pullSecretRef:
          name: qe5-vmware-ibm-pull-secret
      status:
        apiURL: https://api.qe5-vmware-ibm.dev09.red-chesterfield.com:6443
        cliImage: quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:7aa4524899d8e82ce2f144b156125f53ef8ab6a6034cef9c44633a69bd9f59a6
        conditions:
        - lastProbeTime: "2024-05-22T07:47:05Z"
          lastTransitionTime: "2024-05-22T07:47:05Z"
          message: 'Unsupported platform: no actuator to handle it'
          reason: Unsupported
          status: "False"
          type: Hibernating
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Platform credentials passed authentication check
          reason: PlatformAuthSuccess
          status: "False"
          type: AuthenticationFailure
        - lastProbeTime: "2024-05-22T07:47:05Z"
          lastTransitionTime: "2024-05-22T07:47:05Z"
          message: Control plane certificates are present
          reason: ControlPlaneCertificatesFound
          status: "False"
          type: ControlPlaneCertificateNotFound
        - lastProbeTime: "2024-05-22T07:15:46Z"
          lastTransitionTime: "2024-05-22T07:15:46Z"
          message: Images required for cluster deployment installations are resolved
          reason: ImagesResolved
          status: "False"
          type: InstallImagesNotResolved
        - lastProbeTime: "2024-05-22T07:16:15Z"
          lastTransitionTime: "2024-05-22T07:16:15Z"
          message: Successfully launched install pod
          reason: InstallLaunchSuccessful
          status: "False"
          type: InstallLaunchError
        - lastProbeTime: "2024-05-22T07:15:43Z"
          lastTransitionTime: "2024-05-22T07:15:43Z"
          message: InstallerImage is resolved.
          reason: InstallerImageResolved
          status: "False"
          type: InstallerImageResolutionFailed
        - lastProbeTime: "2024-05-22T07:47:05Z"
          lastTransitionTime: "2024-05-22T07:47:05Z"
          message: Provision qe5-vmware-ibm-0-g9rj8 succeeded.
          reason: ProvisionSucceeded
          status: "False"
          type: ProvisionFailed
        - lastProbeTime: "2024-05-22T07:15:46Z"
          lastTransitionTime: "2024-05-22T07:15:46Z"
          message: Provision is not stopped
          reason: ProvisionNotStopped
          status: "False"
          type: ProvisionStopped
        - lastProbeTime: "2024-05-22T07:47:05Z"
          lastTransitionTime: "2024-05-22T07:47:05Z"
          message: Cluster is provisioned
          reason: Provisioned
          status: "True"
          type: Provisioned
        - lastProbeTime: "2024-05-22T07:47:05Z"
          lastTransitionTime: "2024-05-22T07:47:05Z"
          message: No power state actuator -- assuming running
          reason: Running
          status: "True"
          type: Ready
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: no ClusterRelocates match
          reason: NoMatchingRelocates
          status: "False"
          type: RelocationFailed
        - lastProbeTime: "2024-05-22T07:15:46Z"
          lastTransitionTime: "2024-05-22T07:15:46Z"
          message: All pre-provision requirements met
          reason: AllRequirementsMet
          status: "True"
          type: RequirementsMet
        - lastProbeTime: "2024-05-22T07:47:08Z"
          lastTransitionTime: "2024-05-22T07:47:08Z"
          message: SyncSet apply is successful
          reason: SyncSetApplySuccess
          status: "False"
          type: SyncSetFailed
        - lastProbeTime: "2024-05-22T07:47:05Z"
          lastTransitionTime: "2024-05-22T07:47:05Z"
          message: cluster is reachable
          reason: ClusterReachable
          status: "False"
          type: Unreachable
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: AWSPrivateLinkFailed
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: AWSPrivateLinkReady
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: ActiveAPIURLOverride
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: ClusterInstallCompleted
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: ClusterInstallFailed
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: ClusterInstallRequirementsMet
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: ClusterInstallStopped
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: DNSNotReady
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: DeprovisionLaunchError
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: IngressCertificateNotFound
        installStartedTimestamp: "2024-05-22T07:15:47Z"
        installVersion: 4.15.14
        installedTimestamp: "2024-05-22T07:47:05Z"
        installerImage: quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:485e570d51b0b27e9667044dcee48485ddfabe06f010aa01e44ec894c26a75b6
        powerState: Running
        provisionRef:
          name: qe5-vmware-ibm-0-g9rj8
        webConsoleURL: https://console-openshift-console.apps.qe5-vmware-ibm.dev09.red-chesterfield.com 

      MachinePool:

      apiVersion: hive.openshift.io/v1
      kind: MachinePool
      metadata:
        creationTimestamp: "2024-05-22T07:15:34Z"
        finalizers:
        - hive.openshift.io/remotemachineset
        generation: 2
        name: qe5-vmware-ibm-worker
        namespace: qe5-vmware-ibm
        resourceVersion: "1637460"
        uid: 218d9229-b042-46ce-9336-b18b366c4381
      spec:
        clusterDeploymentRef:
          name: qe5-vmware-ibm
        name: worker
        platform:
          vsphere:
            coresPerSocket: 2
            cpus: 4
            memoryMB: 16384
            osDisk:
              diskSizeGB: 120
        replicas: 4
      status:
        conditions:
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: NotEnoughReplicas
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: NoMachinePoolNameLeasesAvailable
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: InvalidSubnets
        - lastProbeTime: "2024-05-22T07:15:35Z"
          lastTransitionTime: "2024-05-22T07:15:35Z"
          message: Condition Initialized
          reason: Initialized
          status: Unknown
          type: UnsupportedConfiguration

      On the cluster:

      MachineSets

      apiVersion: machine.openshift.io/v1beta1
      kind: MachineSet
      metadata:
        annotations:
          machine.openshift.io/memoryMb: "16384"
          machine.openshift.io/vCPU: "4"
        creationTimestamp: "2024-05-22T07:20:50Z"
        generation: 1
        labels:
          hive.openshift.io/machine-pool: worker
          hive.openshift.io/managed: "true"
          machine.openshift.io/cluster-api-cluster: qe5-vmware-ibm-9n4bx
        name: qe5-vmware-ibm-9n4bx-worker-0
        namespace: openshift-machine-api
        resourceVersion: "42680"
        uid: 522db1e3-edd3-4cf5-9bf6-da41904343a0
      spec:
        replicas: 3
        selector:
          matchLabels:
            machine.openshift.io/cluster-api-cluster: qe5-vmware-ibm-9n4bx
            machine.openshift.io/cluster-api-machineset: qe5-vmware-ibm-9n4bx-worker-0
        template:
          metadata:
            labels:
              machine.openshift.io/cluster-api-cluster: qe5-vmware-ibm-9n4bx
              machine.openshift.io/cluster-api-machine-role: worker
              machine.openshift.io/cluster-api-machine-type: worker
              machine.openshift.io/cluster-api-machineset: qe5-vmware-ibm-9n4bx-worker-0
          spec:
            lifecycleHooks: {}
            metadata: {}
            providerSpec:
              value:
                apiVersion: machine.openshift.io/v1beta1
                credentialsSecret:
                  name: vsphere-cloud-credentials
                diskGiB: 120
                kind: VSphereMachineProviderSpec
                memoryMiB: 16384
                metadata:
                  creationTimestamp: null
                network:
                  devices:
                  - networkName: Public Network
                numCPUs: 4
                numCoresPerSocket: 2
                snapshot: ""
                template: qe5-vmware-ibm-9n4bx-rhcos-generated-region-generated-zone
                userDataSecret:
                  name: worker-user-data
                workspace:
                  datacenter: Workload Datacenter
                  datastore: /Workload Datacenter/datastore/WORKLOAD-DS-Folder/WORKLOAD-DS
                  folder: /Workload Datacenter/vm/ACM-QE
                  resourcePool: /Workload Datacenter/host/Workload Cluster//Resources
                  server: acmcicd-vcsa-01.cicd.red-chesterfield.com
      status:
        availableReplicas: 3
        fullyLabeledReplicas: 3
        observedGeneration: 1
        readyReplicas: 3
        replicas: 3 

      Machines/Nodes

      $ oc get machines -A
      NAMESPACE               NAME                                  PHASE     TYPE   REGION   ZONE   AGE
      openshift-machine-api   qe5-vmware-ibm-9n4bx-master-0         Running                          49m
      openshift-machine-api   qe5-vmware-ibm-9n4bx-master-1         Running                          49m
      openshift-machine-api   qe5-vmware-ibm-9n4bx-master-2         Running                          49m
      openshift-machine-api   qe5-vmware-ibm-9n4bx-worker-0-drwpw   Running                          43m
      openshift-machine-api   qe5-vmware-ibm-9n4bx-worker-0-m2bk8   Running                          43m
      openshift-machine-api   qe5-vmware-ibm-9n4bx-worker-0-z9xcf   Running                          43m
      
      $ oc get nodes
      NAME                                  STATUS   ROLES                  AGE   VERSION
      qe5-vmware-ibm-9n4bx-master-0         Ready    control-plane,master   48m   v1.28.9+416ecaf
      qe5-vmware-ibm-9n4bx-master-1         Ready    control-plane,master   48m   v1.28.9+416ecaf
      qe5-vmware-ibm-9n4bx-master-2         Ready    control-plane,master   48m   v1.28.9+416ecaf
      qe5-vmware-ibm-9n4bx-worker-0-drwpw   Ready    worker                 35m   v1.28.9+416ecaf
      qe5-vmware-ibm-9n4bx-worker-0-m2bk8   Ready    worker                 35m   v1.28.9+416ecaf
      qe5-vmware-ibm-9n4bx-worker-0-z9xcf   Ready    worker                 35m   v1.28.9+416ecaf

       

        1. image-2024-05-22-01-00-14-012.png
          115 kB
          David Huynh
        2. image-2024-05-22-01-01-59-349.png
          166 kB
          David Huynh
        3. image-2024-06-27-09-32-15-356.png
          25 kB
          Antoni Segura Puimedon

              leah_leshchinsky Leah Leshchinsky (Inactive)
              rhn-support-dhuynh David Huynh
              David Huynh David Huynh
              ACM QE Team
              Votes:
              0 Vote for this issue
              Watchers:
              10 Start watching this issue

                Created:
                Updated:
                Resolved: