-
Bug
-
Resolution: Done-Errata
-
Major
-
rhos-17.1.4
-
1
-
False
-
-
False
-
openstack-tripleo-heat-templates-14.3.1-17.1.20250312101345.e7c7ce3.el9ost
-
None
-
-
-
Compute FR2 d/t + 18.0.7 BF, Compute Early F + 18.0.7
-
2
-
Moderate
Description of problem:
In the customer environment, the nova_virtqemud container died and was not restarted automatically. VMs were left in an unmanageable state until an operator manually restarted nova_virtqemud.
Version-Release number of selected component (if applicable):
17.1.3, RHEL9 computes only
How reproducible:
Partially reproducible: we do not know why virtqemud died in the first place, but we can reproduce the consequences by killing the virtqemud process manually.
Steps to Reproduce:
1. Start several VMs on a compute host
2. Log into the compute host and capture initial status
~~~
[root@overcloud-novacompute-2 ~]# podman exec nova_virtqemud virsh list
Id Name State
-----------------------------------
12 instance-00000034 running
13 instance-00000037 running
14 instance-0000003a running
15 instance-0000003d running
16 instance-00000040 running
17 instance-00000043 running
[root@overcloud-novacompute-2 ~]# ps -ef | grep instance | cut -c-103
qemu 29347 25490 3 Oct25 ? 02:11:01 /usr/libexec/qemu-kvm -name guest=instance-00000034
qemu 29412 25490 3 Oct25 ? 02:11:03 /usr/libexec/qemu-kvm -name guest=instance-00000037
qemu 29449 25490 3 Oct25 ? 02:09:56 /usr/libexec/qemu-kvm -name guest=instance-0000003a
qemu 29472 25490 3 Oct25 ? 02:11:10 /usr/libexec/qemu-kvm -name guest=instance-0000003d
qemu 29512 25490 3 Oct25 ? 02:08:02 /usr/libexec/qemu-kvm -name guest=instance-00000040
qemu 29572 25490 3 Oct25 ? 02:10:08 /usr/libexec/qemu-kvm -name guest=instance-00000043
root 56233 55600 0 10:23 pts/1 00:00:00 grep --color=auto instance
[root@overcloud-novacompute-2 ~]#
[root@overcloud-novacompute-2 ~]# ps fauxwww | cut -c -122 | grep -A18 25490
root 25490 0.0 0.0 8304 1892 ? Ss Oct25 0:00 /usr/bin/conmon --api-version 1 -c 6e80f1b2c42f89978618
root 25492 0.0 0.0 2500 964 ? Ss Oct25 0:00 _ dumb-init --single-child -- kolla_start
root 25494 0.1 0.1 2135132 39512 ? Sl Oct25 4:09 | _ /usr/sbin/virtqemud --config /etc/libvirt/virtq
qemu 29347 3.2 1.0 3461852 248784 ? Sl Oct25 131:16 _ /usr/libexec/qemu-kvm -name guest=instance-00000034
qemu 29412 3.2 1.0 3527608 257624 ? Sl Oct25 131:18 _ /usr/libexec/qemu-kvm -name guest=instance-00000037
qemu 29449 3.2 1.0 3492700 260056 ? Sl Oct25 130:12 _ /usr/libexec/qemu-kvm -name guest=instance-0000003a
qemu 29472 3.2 1.0 2749156 253528 ? Sl Oct25 131:26 _ /usr/libexec/qemu-kvm -name guest=instance-0000003d
qemu 29512 3.1 1.0 3400388 245968 ? Sl Oct25 128:18 _ /usr/libexec/qemu-kvm -name guest=instance-00000040
qemu 29572 3.2 1.0 3470052 251540 ? Sl Oct25 130:23 _ /usr/libexec/qemu-kvm -name guest=instance-00000043
[root@overcloud-novacompute-2 ~]# systemctl status tripleo_nova_virtqemud
● tripleo_nova_virtqemud.service - nova_virtqemud container
Loaded: loaded (/etc/systemd/system/tripleo_nova_virtqemud.service; enabled; preset: disabled)
Active: active (running) since Fri 2024-10-25 15:39:05 CEST; 2 days ago
Process: 25477 ExecStart=/usr/libexec/tripleo-start-podman-container nova_virtqemud (code=exited, status=0/SUCCESS)
Main PID: 25490 (conmon)
Tasks: 1 (limit: 151827)
Memory: 652.0K
CPU: 239ms
CGroup: /system.slice/tripleo_nova_virtqemud.service
└─25490 /usr/bin/conmon --api-version 1 -c 6e80f1b2c42f89978618fea8ac7ae34fbcd3f0b112205704559b17c4aa0cc76e -u 6e80f1b2c42f89978618fea8ac7ae34fbcd3f0b112205704559b17c4aa0cc76e -r /usr/bin/crun -b /va>
Oct 25 15:39:04 overcloud-novacompute-2 systemd[1]: Starting nova_virtqemud container...
Oct 25 15:39:05 overcloud-novacompute-2 podman[25478]: 2024-10-25 15:39:05.123479962 +0200 CEST m=+0.167358587 container init 6e80f1b2c42f89978618fea8ac7ae34fbcd3f0b112205704559b17c4aa0cc76e (image=satellite.kell>
Oct 25 15:39:05 overcloud-novacompute-2 podman[25478]: 2024-10-25 15:39:05.159366286 +0200 CEST m=+0.203244899 container start 6e80f1b2c42f89978618fea8ac7ae34fbcd3f0b112205704559b17c4aa0cc76e (image=satellite.kel>
Oct 25 15:39:05 overcloud-novacompute-2 tripleo-start-podman-container[25478]: nova_virtqemud
Oct 25 15:39:05 overcloud-novacompute-2 tripleo-start-podman-container[25477]: Creating additional drop-in dependency for "nova_virtqemud" (6e80f1b2c42f89978618fea8ac7ae34fbcd3f0b112205704559b17c4aa0cc76e)
Oct 25 15:39:05 overcloud-novacompute-2 systemd[1]: Started nova_virtqemud container.
[root@overcloud-novacompute-2 ~]#
[root@overcloud-novacompute-2 ~]# podman inspect nova_virtqemud | jq .[].State
{
"OciVersion": "1.1.0-rc.1",
"Status": "running",
"Running": true,
"Paused": false,
"Restarting": false,
"OOMKilled": false,
"Dead": false,
"Pid": 25492,
"ConmonPid": 25490,
"ExitCode": 0,
"Error": "container is stopped",
"StartedAt": "2024-10-25T15:39:05.129726511+02:00",
"FinishedAt": "2024-10-25T15:37:40.68525401+02:00",
"Health":
,
"CgroupPath": "/machine.slice/libpod-6e80f1b2c42f89978618fea8ac7ae34fbcd3f0b112205704559b17c4aa0cc76e.scope",
"CheckpointedAt": "0001-01-01T00:00:00Z",
"RestoredAt": "0001-01-01T00:00:00Z"
}
[root@overcloud-novacompute-2 ~]#
~~~
3. Find the PID of the virtqemud process (25494 in the output above) and kill it
~~~
[root@overcloud-novacompute-2 ~]# kill 25494
[root@overcloud-novacompute-2 ~]#
~~~
Actual results:
nova_virtqemud dies and does not respawn:
~~~
[root@overcloud-novacompute-2 ~]# podman ps -a | grep nova_virtqemud$
6e80f1b2c42f satellite.keller.lab:443/keller-prod-osp_17_1_cv-rhosp17_containers-nova-libvirt:17.1 kolla_start 3 days ago Exited (0) 31 minutes ago nova_virtqemud
[root@overcloud-novacompute-2 ~]#
[root@overcloud-novacompute-2 ~]# podman inspect nova_virtqemud | jq .[].State
{
"OciVersion": "1.1.0-rc.1",
"Status": "stopped",
"Running": false,
"Paused": false,
"Restarting": false,
"OOMKilled": false,
"Dead": false,
"Pid": 0,
"ExitCode": 0,
"Error": "container is stopped",
"StartedAt": "2024-10-25T15:39:05.129726511+02:00",
"FinishedAt": "2024-10-28T11:40:32.528626743+01:00",
"Health":
,
"CheckpointedAt": "0001-01-01T00:00:00Z",
"RestoredAt": "0001-01-01T00:00:00Z"
}
[root@overcloud-novacompute-2 ~]#
~~~
Expected results:
The nova_virtqemud container should respawn automatically. Note that this is exactly what happens in 17.1 multi-rhel, where nova_libvirt is running:
~~~
[root@overcloud-novacompute-1 ~]# ps fauxwww | cut -c -122 | grep -A18 904527
root 904527 0.0 0.0 143824 2332 ? Ssl Oct25 0:00 /usr/bin/conmon --api-version 1 -c 6e1b915e25e493d4d768
root 904540 0.0 0.0 4240 872 ? Ss Oct25 0:00 _ dumb-init --single-child -- kolla_start
root 904553 0.0 0.6 2455244 54800 ? Sl Oct25 2:24 | _ /usr/sbin/libvirtd
qemu 186493 34.6 8.6 3917552 688220 ? Sl 10:29 0:43 _ /usr/libexec/qemu-kvm -name guest=instance-00000026
qemu 186516 27.5 8.5 3951476 681880 ? Sl 10:29 0:34 _ /usr/libexec/qemu-kvm -name guest=instance-00000029
qemu 186548 29.6 8.5 3917564 680060 ? Rl 10:29 0:36 _ /usr/libexec/qemu-kvm -name guest=instance-00000023
qemu 186594 38.0 8.5 3279588 679236 ? Sl 10:29 0:46 _ /usr/libexec/qemu-kvm -name guest=instance-0000002c
qemu 186620 28.2 8.5 3937092 677888 ? Sl 10:29 0:34 _ /usr/libexec/qemu-kvm -name guest=instance-0000002f
tripleo+ 187267 0.0 0.1 89580 9304 ? Ss 10:30 0:00 /usr/lib/systemd/systemd --user
tripleo+ 187269 0.0 0.1 321668 8136 ? S 10:30 0:00 _ (sd-pam)
[root@overcloud-novacompute-1 ~]# kill 904553
[root@overcloud-novacompute-1 ~]#
[root@overcloud-novacompute-1 ~]# podman inspect nova_libvirt | jq .[].State
{
"OciVersion": "1.0.2-dev",
"Status": "running",
"Running": true,
"Paused": false,
"Restarting": false,
"OOMKilled": false,
"Dead": false,
"Pid": 195901,
"ConmonPid": 195889,
"ExitCode": 0,
"Error": "",
"StartedAt": "2024-10-28T10:44:02.093184286Z",
"FinishedAt": "2024-10-28T10:44:01.314110128Z",
"Healthcheck": {
"Status": "healthy",
"FailingStreak": 0,
"Log": [
,
,
,
,
{ "Start": "2024-10-28T10:44:02.182402344Z", "End": "2024-10-28T10:44:02.41700434Z", "ExitCode": 0, "Output": "" } ]
}
}
[root@overcloud-novacompute-1 ~]#
[root@overcloud-novacompute-1 ~]# ps -ef |grep libvirtd
root 195914 195901 0 10:44 ? 00:00:00 /usr/sbin/libvirtd
root 196229 187339 0 10:44 pts/5 00:00:00 grep --color=auto libvirtd
[root@overcloud-novacompute-1 ~]#
[root@overcloud-novacompute-1 ~]#
~~~
Additional info:
Details from customer environment will follow in private comments.
- external trackers
- links to
-
RHBA-2025:146011 Red Hat OpenStack Platform 17.1 bug fix and enhancement advisory
- mentioned on