Uploaded image for project: 'OpenShift Bugs'
  1. OpenShift Bugs
  2. OCPBUGS-22665

After applying Performance Profile, unable to apply cgroup configuration

XMLWordPrintable

    • No
    • Approved
    • Hide

      None

      Show
      None

      Description of problem:

      After applying the performance profile, the node is stuck in the NotReady state with the condition below
      
      KubeletNotReady              container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?
      
      On further inspection, I observed that the multus pods fail to be created with the error "runc create failed: unable to start container process: unable to apply cgroup configuration: failed to write \"0-3\": write /sys/fs/cgroup/cpuset/system.slice/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-podedf60dee_c378_4f74_9bff_74b3d2583824.slice/crio-cf7d301f54722b4e2a04eb18bcc0048c8c888b70018e2041bfda8a71a89e89c0.scope/cpuset.cpus: permission denied"
      
      
      % oc get clusterversion
      NAME      VERSION                              AVAILABLE   PROGRESSING   SINCE   STATUS
      version   4.15.0-0.nightly-2023-10-27-135451   True        False         57m     Cluster version is 4.15.0-0.nightly-2023-10-27-135451
      
      
      % oc get nodes
      NAME                                        STATUS                        ROLES                  AGE   VERSION
      ip-10-0-16-48.us-east-2.compute.internal    Ready                         worker                 14m   v1.28.3+fa9f909
      ip-10-0-27-117.us-east-2.compute.internal   Ready                         control-plane,master   85m   v1.28.3+fa9f909
      ip-10-0-29-237.us-east-2.compute.internal   NotReady,SchedulingDisabled   worker,worker-cnf      18m   v1.28.3+fa9f909
      ip-10-0-43-133.us-east-2.compute.internal   Ready                         worker                 76m   v1.28.3+fa9f909
      ip-10-0-57-76.us-east-2.compute.internal    Ready                         control-plane,master   85m   v1.28.3+fa9f909
      ip-10-0-87-235.us-east-2.compute.internal   Ready                         control-plane,master   85m   v1.28.3+fa9f909
      ip-10-0-89-143.us-east-2.compute.internal   Ready                         worker                 76m   v1.28.3+fa9f909
      
      
      % oc get mcp
      NAME         CONFIG                                                 UPDATED   UPDATING   DEGRADED   MACHINECOUNT   READYMACHINECOUNT   UPDATEDMACHINECOUNT   DEGRADEDMACHINECOUNT   AGE
      master       rendered-master-ce73ecf6d29713b46c30adf93426e19c       True      False      False      3              3                   3                     0                      82m
      worker       rendered-worker-2c1dc427483d1d27335642c81fcd1bd0       True      False      False      3              3                   3                     0                      82m
      worker-cnf   rendered-worker-cnf-2c1dc427483d1d27335642c81fcd1bd0   False     True       False      1              0                   0                     0                      14m
      
      
      % oc get performanceprofile performance -o yaml
      apiVersion: performance.openshift.io/v2
      kind: PerformanceProfile
      metadata:
        annotations:
          kubectl.kubernetes.io/last-applied-configuration: |
            {"apiVersion":"performance.openshift.io/v2","kind":"PerformanceProfile","metadata":{"annotations":{},"name":"performance"},"spec":{"cpu":{"isolated":"2","reserved":"0-1"},"machineConfigPoolSelector":{"machineconfiguration.openshift.io/role":"worker-cnf"},"nodeSelector":{"node-role.kubernetes.io/worker-cnf":""}}}
        creationTimestamp: "2023-10-30T08:52:33Z"
        finalizers:
        - foreground-deletion
        generation: 1
        name: performance
        resourceVersion: "60694"
        uid: dfc53cda-7836-4ec2-8ed8-578233e6c71c
      spec:
        cpu:
          isolated: "2"
          reserved: 0-1
        machineConfigPoolSelector:
          machineconfiguration.openshift.io/role: worker-cnf
        nodeSelector:
          node-role.kubernetes.io/worker-cnf: ""
      status:
        conditions:
        - lastHeartbeatTime: "2023-10-30T08:52:33Z"
          lastTransitionTime: "2023-10-30T08:52:33Z"
          status: "True"
          type: Available
        - lastHeartbeatTime: "2023-10-30T08:52:33Z"
          lastTransitionTime: "2023-10-30T08:52:33Z"
          status: "True"
          type: Upgradeable
        - lastHeartbeatTime: "2023-10-30T08:52:33Z"
          lastTransitionTime: "2023-10-30T08:52:33Z"
          status: "False"
          type: Progressing
        - lastHeartbeatTime: "2023-10-30T08:52:33Z"
          lastTransitionTime: "2023-10-30T08:52:33Z"
          status: "False"
          type: Degraded
        runtimeClass: performance-performance
        tuned: openshift-cluster-node-tuning-operator/openshift-node-performance-performance
      
      
      % oc describe node ip-10-0-29-237.us-east-2.compute.internal
      Name:               ip-10-0-29-237.us-east-2.compute.internal
      Roles:              worker,worker-cnf
      Labels:             beta.kubernetes.io/arch=amd64
                          beta.kubernetes.io/instance-type=m6i.xlarge
                          beta.kubernetes.io/os=linux
                          failure-domain.beta.kubernetes.io/region=us-east-2
                          failure-domain.beta.kubernetes.io/zone=us-east-2a
                          kubernetes.io/arch=amd64
                          kubernetes.io/hostname=ip-10-0-29-237.us-east-2.compute.internal
                          kubernetes.io/os=linux
                          machine.openshift.io/interruptible-instance=
                          node-role.kubernetes.io/worker=
                          node-role.kubernetes.io/worker-cnf=
                          node.kubernetes.io/instance-type=m6i.xlarge
                          node.openshift.io/os_id=rhcos
                          topology.ebs.csi.aws.com/zone=us-east-2a
                          topology.kubernetes.io/region=us-east-2
                          topology.kubernetes.io/zone=us-east-2a
      Annotations:        cloud.network.openshift.io/egress-ipconfig:
                            [{"interface":"eni-0faea0eb5c309c649","ifaddr":{"ipv4":"10.0.0.0/19"},"capacity":{"ipv4":14,"ipv6":15}}]
                          csi.volume.kubernetes.io/nodeid: {"ebs.csi.aws.com":"i-09ec614d98071a2a4"}
                          machine.openshift.io/machine: openshift-machine-api/sunilc415a-h2tb8-worker-us-east-2a-92l7d
                          machineconfiguration.openshift.io/controlPlaneTopology: HighlyAvailable
                          machineconfiguration.openshift.io/currentConfig: rendered-worker-cnf-2c1dc427483d1d27335642c81fcd1bd0
                          machineconfiguration.openshift.io/desiredConfig: rendered-worker-cnf-d88e259d19dfa70382a02afa9433dc4b
                          machineconfiguration.openshift.io/desiredDrain: drain-rendered-worker-cnf-d88e259d19dfa70382a02afa9433dc4b
                          machineconfiguration.openshift.io/lastAppliedDrain: drain-rendered-worker-cnf-d88e259d19dfa70382a02afa9433dc4b
                          machineconfiguration.openshift.io/lastSyncedControllerConfigResourceVersion: 23231
                          machineconfiguration.openshift.io/reason: 
                          machineconfiguration.openshift.io/state: Working
                          tuned.openshift.io/bootcmdline:
                            skew_tick=1 tsc=reliable rcupdate.rcu_normal_after_boot=1 nohz=on rcu_nocbs=2 tuned.non_isolcpus=0000000b systemd.cpu_affinity=0,1,3 intel...
                          volumes.kubernetes.io/controller-managed-attach-detach: true
      CreationTimestamp:  Mon, 30 Oct 2023 14:13:46 +0530
      Taints:             node.kubernetes.io/not-ready:NoExecute
                          node.kubernetes.io/not-ready:NoSchedule
                          node.kubernetes.io/unschedulable:NoSchedule
      Unschedulable:      true
      Lease:
        HolderIdentity:  ip-10-0-29-237.us-east-2.compute.internal
        AcquireTime:     <unset>
        RenewTime:       Mon, 30 Oct 2023 14:32:31 +0530
      Conditions:
        Type             Status  LastHeartbeatTime                 LastTransitionTime                Reason                       Message
        ----             ------  -----------------                 ------------------                ------                       -------
        MemoryPressure   False   Mon, 30 Oct 2023 14:32:31 +0530   Mon, 30 Oct 2023 14:24:29 +0530   KubeletHasSufficientMemory   kubelet has sufficient memory available
        DiskPressure     False   Mon, 30 Oct 2023 14:32:31 +0530   Mon, 30 Oct 2023 14:24:29 +0530   KubeletHasNoDiskPressure     kubelet has no disk pressure
        PIDPressure      False   Mon, 30 Oct 2023 14:32:31 +0530   Mon, 30 Oct 2023 14:24:29 +0530   KubeletHasSufficientPID      kubelet has sufficient PID available
        Ready            False   Mon, 30 Oct 2023 14:32:31 +0530   Mon, 30 Oct 2023 14:24:29 +0530   KubeletNotReady              container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?
      Addresses:
        InternalIP:   10.0.29.237
        InternalDNS:  ip-10-0-29-237.us-east-2.compute.internal
        Hostname:     ip-10-0-29-237.us-east-2.compute.internal
      Capacity:
        cpu:                4
        ephemeral-storage:  125238252Ki
        hugepages-1Gi:      0
        hugepages-2Mi:      0
        memory:             16092968Ki
        pods:               250
      Allocatable:
        cpu:                2
        ephemeral-storage:  114345831029
        hugepages-1Gi:      0
        hugepages-2Mi:      0
        memory:             14966568Ki
        pods:               250
      System Info:
        Machine ID:                             ec2fc1d328898394686e2560a520cbc0
        System UUID:                            ec2fc1d3-2889-8394-686e-2560a520cbc0
        Boot ID:                                ce610e94-2d76-4705-82f8-42031a526f82
        Kernel Version:                         5.14.0-284.38.1.el9_2.x86_64
        OS Image:                               Red Hat Enterprise Linux CoreOS 415.92.202310270236-0 (Plow)
        Operating System:                       linux
        Architecture:                           amd64
        Container Runtime Version:              cri-o://1.28.1-9.rhaos4.15.git664b9cf.el9
        Kubelet Version:                        v1.28.3+fa9f909
        Kube-Proxy Version:                     v1.28.3+fa9f909
      ProviderID:                               aws:///us-east-2a/i-09ec614d98071a2a4
      Non-terminated Pods:                      (14 in total)
        Namespace                               Name                                     CPU Requests  CPU Limits  Memory Requests  Memory Limits  Age
        ---------                               ----                                     ------------  ----------  ---------------  -------------  ---
        openshift-cluster-csi-drivers           aws-ebs-csi-driver-node-r5bcm            30m (1%)      0 (0%)      150Mi (1%)       0 (0%)         18m
        openshift-cluster-node-tuning-operator  tuned-q6jql                              10m (0%)      0 (0%)      50Mi (0%)        0 (0%)         18m
        openshift-dns                           dns-default-9929b                        60m (3%)      0 (0%)      110Mi (0%)       0 (0%)         17m
        openshift-dns                           node-resolver-jwfbt                      5m (0%)       0 (0%)      21Mi (0%)        0 (0%)         18m
        openshift-image-registry                node-ca-fgjkl                            10m (0%)      0 (0%)      10Mi (0%)        0 (0%)         18m
        openshift-ingress-canary                ingress-canary-t5c4n                     10m (0%)      0 (0%)      20Mi (0%)        0 (0%)         17m
        openshift-machine-api                   machine-api-termination-handler-fmtj4    10m (0%)      0 (0%)      20Mi (0%)        0 (0%)         18m
        openshift-machine-config-operator       machine-config-daemon-l6c24              40m (2%)      0 (0%)      100Mi (0%)       0 (0%)         18m
        openshift-monitoring                    node-exporter-s5l72                      9m (0%)       0 (0%)      47Mi (0%)        0 (0%)         18m
        openshift-multus                        multus-additional-cni-plugins-fgltj      10m (0%)      0 (0%)      10Mi (0%)        0 (0%)         18m
        openshift-multus                        multus-jtcjc                             10m (0%)      0 (0%)      65Mi (0%)        0 (0%)         18m
        openshift-multus                        network-metrics-daemon-hntjt             20m (1%)      0 (0%)      120Mi (0%)       0 (0%)         18m
        openshift-network-diagnostics           network-check-target-cdcdz               10m (0%)      0 (0%)      15Mi (0%)        0 (0%)         18m
        openshift-sdn                           sdn-k9vjj                                110m (5%)     0 (0%)      220Mi (1%)       0 (0%)         18m
      Allocated resources:
        (Total limits may be over 100 percent, i.e., overcommitted.)
        Resource           Requests    Limits
        --------           --------    ------
        cpu                344m (17%)  0 (0%)
        memory             958Mi (6%)  0 (0%)
        ephemeral-storage  0 (0%)      0 (0%)
        hugepages-1Gi      0 (0%)      0 (0%)
        hugepages-2Mi      0 (0%)      0 (0%)
      Events:
        Type     Reason                   Age                    From                   Message
        ----     ------                   ----                   ----                   -------
        Normal   NodeHasSufficientMemory  18m (x2 over 18m)      kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeHasSufficientMemory
        Normal   NodeHasSufficientPID     18m (x2 over 18m)      kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeHasSufficientPID
        Normal   NodeHasNoDiskPressure    18m (x2 over 18m)      kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeHasNoDiskPressure
        Normal   Starting                 18m                    kubelet                Starting kubelet.
        Normal   Synced                   18m                    cloud-node-controller  Node synced successfully
        Normal   RegisteredNode           18m                    node-controller        Node ip-10-0-29-237.us-east-2.compute.internal event: Registered Node ip-10-0-29-237.us-east-2.compute.internal in Controller
        Normal   NodeHasSufficientPID     18m                    kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeHasSufficientPID
        Normal   NodeHasSufficientMemory  18m                    kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeHasSufficientMemory
        Normal   NodeHasNoDiskPressure    18m                    kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeHasNoDiskPressure
        Normal   Starting                 18m                    kubelet                Starting kubelet.
        Normal   NodeAllocatableEnforced  18m                    kubelet                Updated Node Allocatable limit across pods
        Normal   NodeReady                17m                    kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeReady
        Normal   NodeNotSchedulable       17m                    kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeNotSchedulable
        Normal   OSUpdateStaged           17m                    machineconfigdaemon    Changes to OS staged
        Warning  Rebooted                 16m                    kubelet                Node ip-10-0-29-237.us-east-2.compute.internal has been rebooted, boot id: 0d7a6616-d629-44fe-8b47-fe6d64a9b094
        Normal   NodeNotReady             16m                    kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeNotReady
        Normal   Starting                 16m                    kubelet                Starting kubelet.
        Normal   NodeAllocatableEnforced  16m                    kubelet                Updated Node Allocatable limit across pods
        Normal   NodeHasNoDiskPressure    16m (x2 over 16m)      kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeHasNoDiskPressure
        Normal   NodeHasSufficientPID     16m (x2 over 16m)      kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeHasSufficientPID
        Normal   NodeHasSufficientMemory  16m (x2 over 16m)      kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeHasSufficientMemory
        Normal   NodeReady                16m                    kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeReady
        Normal   NodeSchedulable          16m                    kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeSchedulable
        Normal   NodeNotSchedulable       9m53s (x2 over 16m)    kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeNotSchedulable
        Normal   OSUpdateStaged           8m55s                  machineconfigdaemon    Changes to OS staged
        Normal   NodeNotReady             8m18s (x2 over 16m)    node-controller        Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeNotReady
        Normal   NodeHasSufficientMemory  8m12s (x2 over 8m12s)  kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeHasSufficientMemory
        Normal   Starting                 8m12s                  kubelet                Starting kubelet.
        Normal   NodeHasNoDiskPressure    8m12s (x2 over 8m12s)  kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeHasNoDiskPressure
        Normal   NodeHasSufficientPID     8m12s (x2 over 8m12s)  kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeHasSufficientPID
        Normal   NodeAllocatableEnforced  8m12s                  kubelet                Updated Node Allocatable limit across pods
        Warning  Rebooted                 8m12s                  kubelet                Node ip-10-0-29-237.us-east-2.compute.internal has been rebooted, boot id: ce610e94-2d76-4705-82f8-42031a526f82
        Normal   NodeNotReady             8m12s                  kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeNotReady
        Normal   NodeNotSchedulable       8m12s                  kubelet                Node ip-10-0-29-237.us-east-2.compute.internal status is now: NodeNotSchedulable
      
      
      % oc get pod -n openshift-multus -o wide
      NAME                                           READY   STATUS                      RESTARTS      AGE   IP            NODE                                        NOMINATED NODE   READINESS GATES
      multus-7h7wv                                   1/1     Running                     1             77m   10.0.43.133   ip-10-0-43-133.us-east-2.compute.internal   <none>           <none>
      multus-additional-cni-plugins-4gqlq            1/1     Running                     1             77m   10.0.43.133   ip-10-0-43-133.us-east-2.compute.internal   <none>           <none>
      multus-additional-cni-plugins-4m8p8            1/1     Running                     0             15m   10.0.16.48    ip-10-0-16-48.us-east-2.compute.internal    <none>           <none>
      multus-additional-cni-plugins-d48gz            1/1     Running                     1             85m   10.0.57.76    ip-10-0-57-76.us-east-2.compute.internal    <none>           <none>
      multus-additional-cni-plugins-fgltj            0/1     Init:CreateContainerError   1 (16m ago)   19m   10.0.29.237   ip-10-0-29-237.us-east-2.compute.internal   <none>           <none>
      multus-additional-cni-plugins-hdhgq            1/1     Running                     1             85m   10.0.27.117   ip-10-0-27-117.us-east-2.compute.internal   <none>           <none>
      multus-additional-cni-plugins-hk947            1/1     Running                     1             85m   10.0.87.235   ip-10-0-87-235.us-east-2.compute.internal   <none>           <none>
      multus-additional-cni-plugins-p9pn6            1/1     Running                     1             77m   10.0.89.143   ip-10-0-89-143.us-east-2.compute.internal   <none>           <none>
      multus-admission-controller-5b87b9b756-47ttt   2/2     Running                     0             48m   10.128.0.17   ip-10-0-27-117.us-east-2.compute.internal   <none>           <none>
      multus-admission-controller-5b87b9b756-nn9c7   2/2     Running                     0             43m   10.129.0.37   ip-10-0-57-76.us-east-2.compute.internal    <none>           <none>
      multus-jtcjc                                   0/1     CreateContainerError        2             19m   10.0.29.237   ip-10-0-29-237.us-east-2.compute.internal   <none>           <none>
      multus-l2tbg                                   1/1     Running                     6             77m   10.0.89.143   ip-10-0-89-143.us-east-2.compute.internal   <none>           <none>
      multus-pr68t                                   1/1     Running                     1             85m   10.0.57.76    ip-10-0-57-76.us-east-2.compute.internal    <none>           <none>
      multus-r78jk                                   1/1     Running                     1             85m   10.0.27.117   ip-10-0-27-117.us-east-2.compute.internal   <none>           <none>
      multus-vgs9c                                   1/1     Running                     0             15m   10.0.16.48    ip-10-0-16-48.us-east-2.compute.internal    <none>           <none>
      multus-zcc84                                   1/1     Running                     1             85m   10.0.87.235   ip-10-0-87-235.us-east-2.compute.internal   <none>           <none>
      network-metrics-daemon-5wwf9                   2/2     Running                     2             85m   10.129.0.7    ip-10-0-57-76.us-east-2.compute.internal    <none>           <none>
      network-metrics-daemon-7gn2w                   2/2     Running                     0             15m   10.130.2.5    ip-10-0-16-48.us-east-2.compute.internal    <none>           <none>
      network-metrics-daemon-9fsss                   2/2     Running                     2             77m   10.129.2.4    ip-10-0-43-133.us-east-2.compute.internal   <none>           <none>
      network-metrics-daemon-hc4ks                   2/2     Running                     2             85m   10.130.0.4    ip-10-0-87-235.us-east-2.compute.internal   <none>           <none>
      network-metrics-daemon-hntjt                   0/2     ContainerCreating           4             19m   <none>        ip-10-0-29-237.us-east-2.compute.internal   <none>           <none>
      network-metrics-daemon-p5zr8                   2/2     Running                     2             85m   10.128.0.7    ip-10-0-27-117.us-east-2.compute.internal   <none>           <none>
      network-metrics-daemon-rnr49                   2/2     Running                     2             77m   10.128.2.2    ip-10-0-89-143.us-east-2.compute.internal   <none>           <none>  
      
      Checking the pod events shows the error below
      
      Warning  Failed        7m26s                  kubelet            Error: container create failed: time="2023-10-30T08:56:08Z" level=error msg="runc create failed: unable to start container process: unable to apply cgroup configuration: failed to write \"0-3\": write /sys/fs/cgroup/cpuset/system.slice/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-podedf60dee_c378_4f74_9bff_74b3d2583824.slice/crio-cf7d301f54722b4e2a04eb18bcc0048c8c888b70018e2041bfda8a71a89e89c0.scope/cpuset.cpus: permission denied"
        Warning  Failed        6m50s (x3 over 7m14s)  kubelet            (combined from similar events): Error: container create failed: time="2023-10-30T08:56:44Z" level=error msg="runc create failed: unable to start container process: unable to apply cgroup configuration: failed to write \"0-3\": write /sys/fs/cgroup/cpuset/system.slice/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-podedf60dee_c378_4f74_9bff_74b3d2583824.slice/crio-4574e21f7cf86aef9192a16bb37688b78f0582ab66709bfe9dff3bfcbc45617f.scope/cpuset.cpus: permission denied"

      Version-Release number of selected component (if applicable):

      4.15.0-0.nightly-2023-10-27-135451

      How reproducible:

       

      Steps to Reproduce:

      1. Deploy 4.15 cluster
      2. Create below performance profile
      
      apiVersion: performance.openshift.io/v2
      kind: PerformanceProfile
      metadata:
        name: performance
      spec:
        cpu:
          isolated: "2"
          reserved: 0-1
        nodeSelector:
          node-role.kubernetes.io/worker: ""
      

      Actual results:

      Worker nodes get stuck in the NotReady state

      Expected results:

      Performance Profile is applied successfully

      Additional info:

      The same performance profile applies successfully on a 4.14 cluster

            pehunt@redhat.com Peter Hunt
            schoudha Sunil Choudhary
            Sunil Choudhary Sunil Choudhary
            Votes:
            0 Vote for this issue
            Watchers:
            12 Start watching this issue

              Created:
              Updated:
              Resolved: