Uploaded image for project: 'Red Hat OpenStack Services on OpenShift'
  1. Red Hat OpenStack Services on OpenShift
  2. OSPRH-13036

nova_virtqemud container does not restart after virtqemud dies

XMLWordPrintable

    • Compute FR2 d/t + 18.0.7 BF, Compute Early F + 18.0.7
    • 2
    • Moderate

      Description of problem:
      At the customer environment, nova_virtqemud container died and did not restart. VMs were left in an unmanageable state until nova_virtqemud was manually restarted by an operator.

      Version-Release number of selected component (if applicable):
      17.1.3, RHEL9 computes only

      How reproducible:
      Partially reproducible: We don't know the reason for virtqemud dying in the first place, but we can reproduce the consequences.

      Steps to Reproduce:
      1. Start a bunch of VMs in a compute host

      2. Log into the compute host and capture initial status
      ~~~
      [root@overcloud-novacompute-2 ~]# podman exec nova_virtqemud virsh list
      Id Name State
      -----------------------------------
      12 instance-00000034 running
      13 instance-00000037 running
      14 instance-0000003a running
      15 instance-0000003d running
      16 instance-00000040 running
      17 instance-00000043 running
      [root@overcloud-novacompute-2 ~]# ps -ef | grep instance | cut -c-103
      qemu 29347 25490 3 Oct25 ? 02:11:01 /usr/libexec/qemu-kvm -name guest=instance-00000034
      qemu 29412 25490 3 Oct25 ? 02:11:03 /usr/libexec/qemu-kvm -name guest=instance-00000037
      qemu 29449 25490 3 Oct25 ? 02:09:56 /usr/libexec/qemu-kvm -name guest=instance-0000003a
      qemu 29472 25490 3 Oct25 ? 02:11:10 /usr/libexec/qemu-kvm -name guest=instance-0000003d
      qemu 29512 25490 3 Oct25 ? 02:08:02 /usr/libexec/qemu-kvm -name guest=instance-00000040
      qemu 29572 25490 3 Oct25 ? 02:10:08 /usr/libexec/qemu-kvm -name guest=instance-00000043
      root 56233 55600 0 10:23 pts/1 00:00:00 grep --color=auto instance
      [root@overcloud-novacompute-2 ~]#
      [root@overcloud-novacompute-2 ~]# ps fauxwww | cut -c -122 | grep -A18 25490
      root 25490 0.0 0.0 8304 1892 ? Ss Oct25 0:00 /usr/bin/conmon --api-version 1 -c 6e80f1b2c42f89978618
      root 25492 0.0 0.0 2500 964 ? Ss Oct25 0:00 _ dumb-init --single-child -- kolla_start
      root 25494 0.1 0.1 2135132 39512 ? Sl Oct25 4:09 | _ /usr/sbin/virtqemud --config /etc/libvirt/virtq
      qemu 29347 3.2 1.0 3461852 248784 ? Sl Oct25 131:16 _ /usr/libexec/qemu-kvm -name guest=instance-00000034
      qemu 29412 3.2 1.0 3527608 257624 ? Sl Oct25 131:18 _ /usr/libexec/qemu-kvm -name guest=instance-00000037
      qemu 29449 3.2 1.0 3492700 260056 ? Sl Oct25 130:12 _ /usr/libexec/qemu-kvm -name guest=instance-0000003a
      qemu 29472 3.2 1.0 2749156 253528 ? Sl Oct25 131:26 _ /usr/libexec/qemu-kvm -name guest=instance-0000003d
      qemu 29512 3.1 1.0 3400388 245968 ? Sl Oct25 128:18 _ /usr/libexec/qemu-kvm -name guest=instance-00000040
      qemu 29572 3.2 1.0 3470052 251540 ? Sl Oct25 130:23 _ /usr/libexec/qemu-kvm -name guest=instance-00000043

      [root@overcloud-novacompute-2 ~]# systemctl status tripleo_nova_virtqemud
      ● tripleo_nova_virtqemud.service - nova_virtqemud container
      Loaded: loaded (/etc/systemd/system/tripleo_nova_virtqemud.service; enabled; preset: disabled)
      Active: active (running) since Fri 2024-10-25 15:39:05 CEST; 2 days ago
      Process: 25477 ExecStart=/usr/libexec/tripleo-start-podman-container nova_virtqemud (code=exited, status=0/SUCCESS)
      Main PID: 25490 (conmon)
      Tasks: 1 (limit: 151827)
      Memory: 652.0K
      CPU: 239ms
      CGroup: /system.slice/tripleo_nova_virtqemud.service
      └─25490 /usr/bin/conmon --api-version 1 -c 6e80f1b2c42f89978618fea8ac7ae34fbcd3f0b112205704559b17c4aa0cc76e -u 6e80f1b2c42f89978618fea8ac7ae34fbcd3f0b112205704559b17c4aa0cc76e -r /usr/bin/crun -b /va>

      Oct 25 15:39:04 overcloud-novacompute-2 systemd[1]: Starting nova_virtqemud container...
      Oct 25 15:39:05 overcloud-novacompute-2 podman[25478]: 2024-10-25 15:39:05.123479962 +0200 CEST m=+0.167358587 container init 6e80f1b2c42f89978618fea8ac7ae34fbcd3f0b112205704559b17c4aa0cc76e (image=satellite.kell>
      Oct 25 15:39:05 overcloud-novacompute-2 podman[25478]: 2024-10-25 15:39:05.159366286 +0200 CEST m=+0.203244899 container start 6e80f1b2c42f89978618fea8ac7ae34fbcd3f0b112205704559b17c4aa0cc76e (image=satellite.kel>
      Oct 25 15:39:05 overcloud-novacompute-2 tripleo-start-podman-container[25478]: nova_virtqemud
      Oct 25 15:39:05 overcloud-novacompute-2 tripleo-start-podman-container[25477]: Creating additional drop-in dependency for "nova_virtqemud" (6e80f1b2c42f89978618fea8ac7ae34fbcd3f0b112205704559b17c4aa0cc76e)
      Oct 25 15:39:05 overcloud-novacompute-2 systemd[1]: Started nova_virtqemud container.
      [root@overcloud-novacompute-2 ~]#
      [root@overcloud-novacompute-2 ~]# podman inspect nova_virtqemud | jq .[].State
      {
      "OciVersion": "1.1.0-rc.1",
      "Status": "running",
      "Running": true,
      "Paused": false,
      "Restarting": false,
      "OOMKilled": false,
      "Dead": false,
      "Pid": 25492,
      "ConmonPid": 25490,
      "ExitCode": 0,
      "Error": "container is stopped",
      "StartedAt": "2024-10-25T15:39:05.129726511+02:00",
      "FinishedAt": "2024-10-25T15:37:40.68525401+02:00",
      "Health":

      { "Status": "", "FailingStreak": 0, "Log": null }

      ,
      "CgroupPath": "/machine.slice/libpod-6e80f1b2c42f89978618fea8ac7ae34fbcd3f0b112205704559b17c4aa0cc76e.scope",
      "CheckpointedAt": "0001-01-01T00:00:00Z",
      "RestoredAt": "0001-01-01T00:00:00Z"
      }
      [root@overcloud-novacompute-2 ~]#
      ~~~

      3. Look for the pid of virtqemud and kill it
      ~~~
      [root@overcloud-novacompute-2 ~]# kill 25494
      [root@overcloud-novacompute-2 ~]#
      ~~~

      Actual results:

      nova_virtqemud dies and does not respawn:
      ~~~
      [root@overcloud-novacompute-2 ~]# podman ps -a | grep nova_virtqemud$
      6e80f1b2c42f satellite.keller.lab:443/keller-prod-osp_17_1_cv-rhosp17_containers-nova-libvirt:17.1 kolla_start 3 days ago Exited (0) 31 minutes ago nova_virtqemud
      [root@overcloud-novacompute-2 ~]#
      [root@overcloud-novacompute-2 ~]# podman inspect nova_virtqemud | jq .[].State
      {
      "OciVersion": "1.1.0-rc.1",
      "Status": "stopped",
      "Running": false,
      "Paused": false,
      "Restarting": false,
      "OOMKilled": false,
      "Dead": false,
      "Pid": 0,
      "ExitCode": 0,
      "Error": "container is stopped",
      "StartedAt": "2024-10-25T15:39:05.129726511+02:00",
      "FinishedAt": "2024-10-28T11:40:32.528626743+01:00",
      "Health":

      { "Status": "", "FailingStreak": 0, "Log": null }

      ,
      "CheckpointedAt": "0001-01-01T00:00:00Z",
      "RestoredAt": "0001-01-01T00:00:00Z"
      }
      [root@overcloud-novacompute-2 ~]#
      ~~~

      Expected results:

      Container nova_virtqemud should respawn automatically. Note that this is exactly what happens in 17.1 multi-rhel, where nova_libvirt is running:
      ~~~
      [root@overcloud-novacompute-1 ~]# ps fauxwww | cut -c -122 | grep -A18 904527
      root 904527 0.0 0.0 143824 2332 ? Ssl Oct25 0:00 /usr/bin/conmon --api-version 1 -c 6e1b915e25e493d4d768
      root 904540 0.0 0.0 4240 872 ? Ss Oct25 0:00 _ dumb-init --single-child -- kolla_start
      root 904553 0.0 0.6 2455244 54800 ? Sl Oct25 2:24 | _ /usr/sbin/libvirtd
      qemu 186493 34.6 8.6 3917552 688220 ? Sl 10:29 0:43 _ /usr/libexec/qemu-kvm -name guest=instance-00000026
      qemu 186516 27.5 8.5 3951476 681880 ? Sl 10:29 0:34 _ /usr/libexec/qemu-kvm -name guest=instance-00000029
      qemu 186548 29.6 8.5 3917564 680060 ? Rl 10:29 0:36 _ /usr/libexec/qemu-kvm -name guest=instance-00000023
      qemu 186594 38.0 8.5 3279588 679236 ? Sl 10:29 0:46 _ /usr/libexec/qemu-kvm -name guest=instance-0000002c
      qemu 186620 28.2 8.5 3937092 677888 ? Sl 10:29 0:34 _ /usr/libexec/qemu-kvm -name guest=instance-0000002f
      tripleo+ 187267 0.0 0.1 89580 9304 ? Ss 10:30 0:00 /usr/lib/systemd/systemd --user
      tripleo+ 187269 0.0 0.1 321668 8136 ? S 10:30 0:00 _ (sd-pam)

      [root@overcloud-novacompute-1 ~]# kill 904553
      [root@overcloud-novacompute-1 ~]#
      [root@overcloud-novacompute-1 ~]# podman inspect nova_libvirt | jq .[].State
      {
      "OciVersion": "1.0.2-dev",
      "Status": "running",
      "Running": true,
      "Paused": false,
      "Restarting": false,
      "OOMKilled": false,
      "Dead": false,
      "Pid": 195901,
      "ConmonPid": 195889,
      "ExitCode": 0,
      "Error": "",
      "StartedAt": "2024-10-28T10:44:02.093184286Z",
      "FinishedAt": "2024-10-28T10:44:01.314110128Z",
      "Healthcheck": {
      "Status": "healthy",
      "FailingStreak": 0,
      "Log": [

      { "Start": "2024-10-28T10:42:08.590278718Z", "End": "2024-10-28T10:42:08.729828866Z", "ExitCode": 0, "Output": "" }

      ,

      { "Start": "2024-10-28T10:42:39.58061018Z", "End": "2024-10-28T10:42:39.670714923Z", "ExitCode": 0, "Output": "" }

      ,

      { "Start": "2024-10-28T10:43:10.587290322Z", "End": "2024-10-28T10:43:10.748464669Z", "ExitCode": 0, "Output": "" }

      ,

      { "Start": "2024-10-28T10:43:41.588152594Z", "End": "2024-10-28T10:43:41.750076795Z", "ExitCode": 0, "Output": "" }

      ,

      { "Start": "2024-10-28T10:44:02.182402344Z", "End": "2024-10-28T10:44:02.41700434Z", "ExitCode": 0, "Output": "" }

      ]
      }
      }
      [root@overcloud-novacompute-1 ~]#
      [root@overcloud-novacompute-1 ~]# ps -ef |grep libvirtd
      root 195914 195901 0 10:44 ? 00:00:00 /usr/sbin/libvirtd
      root 196229 187339 0 10:44 pts/5 00:00:00 grep --color=auto libvirtd
      [root@overcloud-novacompute-1 ~]#
      [root@overcloud-novacompute-1 ~]#
      ~~~

      Additional info:

      Details from customer environment will follow in private comments.

              bdobreli@redhat.com Bohdan Dobrelia
              jira-bugzilla-migration RH Bugzilla Integration
              RH Bugzilla Integration RH Bugzilla Integration
              rhos-workloads-compute
              Votes:
              0 Vote for this issue
              Watchers:
              9 Start watching this issue

                Created:
                Updated:
                Resolved: