Openshift sandboxed containers / KATA-2146

Pod does not start on 4.13.0-rc.5 with --thread-pool-size=N annotation


    • Type: Bug
    • Resolution: Done
    • Priority: High
    • Affects Version/s: OCP 4.13, OCP 4.14
    • Fix Version/s: OCP 4.13, OCP 4.14
    • Component/s: kata-containers
    • Release Note Text:

      .Pod with `io.katacontainers.config.hypervisor.virtio_fs_extra_args` annotation does not start

      `virtiofsd-1.5.0-1.el9_2.1` is available in {openshift} 4.13.24 and 4.14.4.

      `virtiofsd` now accepts the extra `--thread-pool-size=16` option. The number of threads increases from 1 to 16 as expected.

    • Release Note Type: Bug Fix
    • Release Note Status: Done
    • Sprint: Kata Sprint #245

      Description

      Start a specific pod on OCP 4.13.0-rc.5 (based on RHCOS 9); it stays in ContainerCreating seemingly indefinitely. Other pods have worked, and this pod worked on 4.12 and on earlier builds of OCP 4.13 that were based on RHCOS 8 (4.13.0-ec).

      Note that RHEL/RHCOS 9 detects disks asynchronously, so the /dev/sdX names cannot be relied on; the stable names under e.g. /dev/disk/by-id must be used instead.
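      For illustration, a hedged sketch of what that looks like with the Local Storage Operator; the storage class name and the device IDs below are placeholders and must be replaced with the actual entries from /dev/disk/by-id on the node:

      apiVersion: local.storage.openshift.io/v1
      kind: LocalVolume
      metadata:
        name: local-disks                # illustrative name
        namespace: openshift-local-storage
      spec:
        storageClassDevices:
        - storageClassName: local-sc     # illustrative storage class
          volumeMode: Block
          devicePaths:
          # Stable by-id paths instead of /dev/sdX, which can change across boots on RHCOS 9
          - /dev/disk/by-id/wwn-0xEXAMPLE1
          - /dev/disk/by-id/wwn-0xEXAMPLE2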

      Steps to reproduce

      1. Ensure the node has at least two disks and LSO/ODF installed.
      2. `oc apply` the attached pod manifest (a trimmed-down sketch follows below).
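
      A trimmed-down sketch of the relevant parts of the attached pod (namespace, image, and PVC name are taken from this report; the pod name is illustrative, the PVC is assumed to already exist from the LSO/ODF setup, and the workload command is simplified to a sleep since the pod never reaches Running anyway):

      apiVersion: v1
      kind: Pod
      metadata:
        name: vdbench-kata-reproducer    # illustrative name
        namespace: benchmark-runner
        annotations:
          # Extra arguments handed to virtiofsd by the Kata runtime; this is what triggers the failure
          io.katacontainers.config.hypervisor.virtio_fs_extra_args: '["-o","allow_direct_io","--thread-pool-size=16"]'
      spec:
        runtimeClassName: kata
        restartPolicy: Never
        containers:
        - name: vdbench-pod
          image: quay.io/ebattat/centos-stream8-vdbench5.04.07-pod:v1.0.13
          # Simplified stand-in for the vdbench workload command
          command: ["sleep", "infinity"]
          volumeMounts:
          - mountPath: /workload
            name: vdbench-pod-pvc-claim
        volumes:
        - name: vdbench-pod-pvc-claim
          persistentVolumeClaim:
            claimName: vdbench-pod-pvc-claim

      Apply it with `oc apply -f <file>` and watch with `oc get pod -n benchmark-runner -w`; on the affected build the pod stays in ContainerCreating.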

      Expected result

      Pod should start and run correctly.
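
      Once the pod does run (with the fixed virtiofsd), a quick way to confirm the thread-pool option took effect is a sketch like the following, assuming access to the worker node hosting the sandbox (worker-1 in this report):

      # List virtiofsd threads on the node; with --thread-pool-size=16 roughly 16 pool threads should show up (previously only 1)
      oc debug node/worker-1 -- chroot /host ps -T -C virtiofsd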

      Actual result

      Pod is created but never transitions from ContainerCreating to Running.
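
      Typical places to look when it hangs like this (node, pod, and namespace names are the ones from this report; the journal filter is only a rough sketch):

      # Pod events often name the failing step (sandbox creation, volume mount, etc.)
      oc describe pod vdbench-kata-2ac4cc43 -n benchmark-runner

      # Kata runtime and virtiofsd messages land in the node journal
      oc debug node/worker-1 -- chroot /host sh -c "journalctl --no-pager | grep -iE 'kata|virtiofsd' | tail -n 100"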

      Impact

      Unable to run the pod in question.

      Env

      OCP 4.13.0-rc.5, kata-containers-3.0.2-5.el9.x86_64, 3 masters + 3 workers in IBM Cloud (x86) with two attached disks on the worker, ODF operator 4.12.2-rhodf with LSO and/or ODF in use. Fails with both Kata operator 1.3 and 1.4.

      Additional helpful info

      A must-gather is available, but it is too large to attach.
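
      A must-gather of this kind is typically collected with something like the following (the destination directory is illustrative; an operator-specific gather image, if one was used, is omitted):

      oc adm must-gather --dest-dir=./must-gather-kata-2146
      tar czf must-gather-kata-2146.tar.gz ./must-gather-kata-2146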

      Pod:

      [root@ebattat-perf-ci-server0 ~]# oc get pod -n benchmark-runner
      NAME                    READY   STATUS              RESTARTS   AGE
      vdbench-kata-2ac4cc43   0/1     ContainerCreating   0          3h55m
      [root@ebattat-perf-ci-server0 ~]# oc get pod -n benchmark-runner -oyaml
      apiVersion: v1
      items:

        - apiVersion: v1
          kind: Pod
          metadata:
            annotations:
              io.katacontainers.config.hypervisor.virtio_fs_extra_args: '["-o","allow_direct_io","--thread-pool-size=16"]'
              k8s.ovn.org/pod-networks: '{"default":{"ip_addresses":["10.128.3.22/23"],"mac_address":"0a:58:0a:80:03:16","gateway_ips":["10.128.2.1"],"ip_address":"10.128.3.22/23","gateway_ip":"10.128.2.1"}}'
              k8s.v1.cni.cncf.io/network-status: |-
                [{
                    "name": "ovn-kubernetes",
                    "interface": "eth0",
                    "ips": [
                        "10.128.3.22"
                    ],
                    "mac": "0a:58:0a:80:03:16",
                    "default": true,
                    "dns": {}
                }]
              openshift.io/scc: privileged
            creationTimestamp: "2023-04-27T15:02:42Z"
            labels:
              app: vdbench-2ac4cc43
              benchmark-runner-workload: vdbench
              benchmark-uuid: 2ac4cc43-3aaa-4dd6-8599-b387ee14005b
              type: vdbench-kata-2ac4cc43
            name: vdbench-kata-2ac4cc43
            namespace: benchmark-runner
            resourceVersion: "8417690"
            uid: 471beea6-30ae-4533-835d-08f6a2970d3b
          spec:
            containers:
            - args:
              - -c
              - $WORKLOAD_METHOD
              command:
              - /bin/bash
              env:
              - name: BLOCK_SIZES
                value: 64,oltp1
              - name: IO_OPERATION
                value: write,oltp1
              - name: IO_THREADS
                value: 16,3
              - name: FILES_IO
                value: random,oltp1
              - name: IO_RATE
                value: max,max
              - name: MIX_PRECENTAGE
              - name: DURATION
                value: "20"
              - name: PAUSE
                value: "0"
              - name: WARMUP
                value: "20"
              - name: FILES_SELECTION
                value: random
              - name: COMPRESSION_RATIO
                value: "2"
              - name: RUN_FILLUP
                value: "yes"
              - name: LOGS_DIR
                value: /workload/
              - name: DIRECTORIES
                value: "100"
              - name: FILES_PER_DIRECTORY
                value: "10"
              - name: SIZE_PER_FILE
                value: "5"
              - name: REDIS_HOST
                value: redis-deployment.benchmark-runner.svc.cluster.local
              - name: WORKLOAD_METHOD
                value: /vdbench/vdbench_runner.sh
              - name: TIMEOUT
                value: "3600"
              image: quay.io/ebattat/centos-stream8-vdbench5.04.07-pod:v1.0.13
              imagePullPolicy: IfNotPresent
              name: vdbench-pod
              resources:
                limits:
                  cpu: "2"
                requests:
                  cpu: 10m
                  memory: 4Gi
              securityContext:
                capabilities:
                  drop:
                  - MKNOD
              terminationMessagePath: /dev/termination-log
              terminationMessagePolicy: File
              volumeMounts:
              - mountPath: /workload
                name: vdbench-pod-pvc-claim
              - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
                name: kube-api-access-f5thb
                readOnly: true
            dnsPolicy: ClusterFirst
            enableServiceLinks: true
            imagePullSecrets:
            - name: default-dockercfg-92zhb
            nodeName: worker-1
            nodeSelector:
              kubernetes.io/hostname: worker-1
              node-role.kubernetes.io/kata-oc: ""
            overhead:
              cpu: 250m
              memory: 350Mi
            preemptionPolicy: PreemptLowerPriority
            priority: 0
            restartPolicy: Never
            runtimeClassName: kata
            schedulerName: default-scheduler
            securityContext:
              seLinuxOptions:
                level: s0:c47,c9
            serviceAccount: default
            serviceAccountName: default
            terminationGracePeriodSeconds: 30
            tolerations:
            - effect: NoExecute
              key: node.kubernetes.io/not-ready
              operator: Exists
              tolerationSeconds: 300
            - effect: NoExecute
              key: node.kubernetes.io/unreachable
              operator: Exists
              tolerationSeconds: 300
            - effect: NoSchedule
              key: node.kubernetes.io/memory-pressure
              operator: Exists
            volumes:
            - name: vdbench-pod-pvc-claim
              persistentVolumeClaim:
                claimName: vdbench-pod-pvc-claim
            - name: kube-api-access-f5thb
              projected:
                defaultMode: 420
                sources:
                - serviceAccountToken:
                    expirationSeconds: 3607
                    path: token
                - configMap:
                    items:
                    - key: ca.crt
                      path: ca.crt
                    name: kube-root-ca.crt
                - downwardAPI:
                    items:
                    - fieldRef:
                        apiVersion: v1
                        fieldPath: metadata.namespace
                      path: namespace
                - configMap:
                    items:
                    - key: service-ca.crt
                      path: service-ca.crt
                    name: openshift-service-ca.crt
          status:
            conditions:
            - lastProbeTime: null
              lastTransitionTime: "2023-04-27T15:02:42Z"
              status: "True"
              type: Initialized
            - lastProbeTime: null
              lastTransitionTime: "2023-04-27T15:02:42Z"
              message: 'containers with unready status: [vdbench-pod]'
              reason: ContainersNotReady
              status: "False"
              type: Ready
            - lastProbeTime: null
              lastTransitionTime: "2023-04-27T15:02:42Z"
              message: 'containers with unready status: [vdbench-pod]'
              reason: ContainersNotReady
              status: "False"
              type: ContainersReady
            - lastProbeTime: null
              lastTransitionTime: "2023-04-27T15:02:42Z"
              status: "True"
              type: PodScheduled
            containerStatuses:
            - image: quay.io/ebattat/centos-stream8-vdbench5.04.07-pod:v1.0.13
              imageID: ""
              lastState: {}
              name: vdbench-pod
              ready: false
              restartCount: 0
              started: false
              state:
                waiting:
                  reason: ContainerCreating
            hostIP: 10.36.200.199
            phase: Pending
            qosClass: Burstable
            startTime: "2023-04-27T15:02:42Z"
      kind: List
      metadata:
        resourceVersion: ""

            Greg Kurz (rhgkurz)
            Robert Krawitz (robertkrawitz)
            Miriam Weiss
            Votes: 0
            Watchers: 18
