=================================================================================================================================== [kni@cert-rhosp-02 ~]$ oc get nodes/worker-0-2 -o json | jq .spec.taints null =================================================================================================================================== [kni@cert-rhosp-02 ~]$ oc get nodes -l 'node-role.kubernetes.io/worker' NAME STATUS ROLES AGE VERSION worker-0-0 Ready worker 10h v1.33.3 worker-0-1 Ready worker 10h v1.33.3 worker-0-2 Ready worker 10h v1.33.3 =================================================================================================================================== [kni@cert-rhosp-02 ~]$ oc debug node/worker-0-2 -- chroot /host bash -c "uptime -s" Temporary namespace openshift-debug-zlbhb is created for debugging node... Starting pod/worker-0-2-debug-5k526 ... To use host binaries, run `chroot /host` 2025-09-05 19:17:59 Removing debug pod ... Temporary namespace openshift-debug-zlbhb was removed. =================================================================================================================================== [kni@cert-rhosp-02 ~]$ oc get clusterversion NAME VERSION AVAILABLE PROGRESSING SINCE STATUS version 4.20.0-0.nightly-2025-09-01-101753 True False 10h Cluster version is 4.20.0-0.nightly-2025-09-01-101753 =================================================================================================================================== [kni@cert-rhosp-02 ~]$ oc get csv NAME DISPLAY VERSION REPLACES PHASE fence-agents-remediation.v0.6.0 Fence Agents Remediation Operator 0.6.0 fence-agents-remediation.v0.5.1 Succeeded node-healthcheck-operator.v0.10.0 Node Health Check Operator 0.10.0 node-healthcheck-operator.v0.9.1 Succeeded =================================================================================================================================== [kni@cert-rhosp-02 ~]$ oc get fartemplate -o yaml apiVersion: v1 items: - apiVersion: fence-agents-remediation.medik8s.io/v1alpha1 kind: FenceAgentsRemediationTemplate metadata: annotations: kubectl.kubernetes.io/last-applied-configuration: | {"apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","kind":"FenceAgentsRemediationTemplate","metadata":{"annotations":{},"name":"fenceagentsremediationtemplate-test","namespace":"openshift-workload-availability"},"spec":{"template":{"spec":{"agent":"fence_ipmilan","nodeSecretNames":{"worker-0-0":"worker-0-cred","worker-0-1":"worker-1-cred","worker-0-2":"worker-2-cred"},"nodeparameters":{"--ipport":{"master-0-0":"6230","master-0-1":"6231","master-0-2":"6232","worker-0-0":"6233","worker-0-1":"6234","worker-0-2":"6235"}},"retrycount":5,"retryinterval":"10s","sharedparameters":{"--action":"reboot","--ip":"192.168.123.1","--lanplus":"","--username":"admin"},"timeout":"300s"}}}} remediation.medik8s.io/multiple-templates-support: "true" creationTimestamp: "2025-09-05T19:13:33Z" generation: 2 name: fenceagentsremediationtemplate-test namespace: openshift-workload-availability resourceVersion: "242220" uid: 1d8acdbc-79f1-41ee-a2c8-585514e69199 spec: template: spec: agent: fence_ipmilan nodeSecretNames: worker-0-0: worker-0-cred worker-0-1: worker-1-cred worker-0-2: worker-2-cred nodeparameters: --ipport: master-0-0: "6230" master-0-1: "6231" master-0-2: "6232" worker-0-0: "6233" worker-0-1: "6234" worker-0-2: "6235" remediationStrategy: ResourceDeletion retrycount: 5 retryinterval: 10s sharedSecretName: fence-agents-credentials-shared sharedparameters: --action: reboot --ip: 192.168.123.1 --lanplus: "" --username: admin timeout: 5m0s kind: List metadata: resourceVersion: "" =================================================================================================================================== [kni@cert-rhosp-02 ~]$ oc get nhc -o yaml apiVersion: v1 items: - apiVersion: remediation.medik8s.io/v1alpha1 kind: NodeHealthCheck metadata: annotations: kubectl.kubernetes.io/last-applied-configuration: | {"apiVersion":"remediation.medik8s.io/v1alpha1","kind":"NodeHealthCheck","metadata":{"annotations":{},"name":"nhc-far-worker"},"spec":{"minHealthy":"30%","remediationTemplate":{"apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","kind":"FenceAgentsRemediationTemplate","name":"fenceagentsremediationtemplate-test","namespace":"openshift-workload-availability"},"selector":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"DoesNotExist","values":[]},{"key":"node-role.kubernetes.io/master","operator":"DoesNotExist","values":[]}]},"unhealthyConditions":[{"duration":"30s","status":"False","type":"Ready"},{"duration":"30s","status":"Unknown","type":"Ready"}]}} creationTimestamp: "2025-09-05T19:13:33Z" generation: 1 name: nhc-far-worker resourceVersion: "241041" uid: b6917445-7d43-4bb7-899c-68fc638c3866 spec: minHealthy: 30% remediationTemplate: apiVersion: fence-agents-remediation.medik8s.io/v1alpha1 kind: FenceAgentsRemediationTemplate name: fenceagentsremediationtemplate-test namespace: openshift-workload-availability selector: matchExpressions: - key: node-role.kubernetes.io/control-plane operator: DoesNotExist values: [] - key: node-role.kubernetes.io/master operator: DoesNotExist values: [] unhealthyConditions: - duration: 30s status: "False" type: Ready - duration: 30s status: Unknown type: Ready status: conditions: - lastTransitionTime: "2025-09-05T19:13:33Z" message: No issues found, NodeHealthCheck is enabled. reason: NodeHealthCheckEnabled status: "False" type: Disabled healthyNodes: 3 lastUpdateTime: "2025-09-05T19:19:12Z" observedNodes: 3 phase: Enabled reason: NHC is enabled, no ongoing remediation kind: List metadata: resourceVersion: "" =================================================================================================================================== [kni@cert-rhosp-02 ~]$ oc debug node/worker-0-2 -- chroot /host bash -c "systemctl stop kubelet" Temporary namespace openshift-debug-ndbrg is created for debugging node... Starting pod/worker-0-2-debug-b4jr6 ... To use host binaries, run `chroot /host` =================================================================================================================================== [kni@cert-rhosp-02 ~]$ oc get nodes -l 'node-role.kubernetes.io/worker' NAME STATUS ROLES AGE VERSION worker-0-0 Ready worker 10h v1.33.3 worker-0-1 Ready worker 10h v1.33.3 worker-0-2 NotReady worker 10h v1.33.3 =================================================================================================================================== [kni@cert-rhosp-02 ~]$ oc get far -o yaml apiVersion: v1 items: - apiVersion: fence-agents-remediation.medik8s.io/v1alpha1 kind: FenceAgentsRemediation metadata: annotations: remediation.medik8s.io/node-name: worker-0-2 remediation.medik8s.io/template-name: fenceagentsremediationtemplate-test creationTimestamp: "2025-09-05T19:27:51Z" finalizers: - fence-agents-remediation.medik8s.io/far-finalizer generateName: worker-0-2- generation: 1 labels: app.kubernetes.io/part-of: node-healthcheck-controller name: worker-0-2-4v9st namespace: openshift-workload-availability ownerReferences: - apiVersion: remediation.medik8s.io/v1alpha1 controller: false kind: NodeHealthCheck name: nhc-far-worker uid: b6917445-7d43-4bb7-899c-68fc638c3866 resourceVersion: "244073" uid: 4029d989-ff40-44f3-806f-41d3976dfb5b spec: agent: fence_ipmilan nodeSecretNames: worker-0-0: worker-0-cred worker-0-1: worker-1-cred worker-0-2: worker-2-cred nodeparameters: --ipport: master-0-0: "6230" master-0-1: "6231" master-0-2: "6232" worker-0-0: "6233" worker-0-1: "6234" worker-0-2: "6235" remediationStrategy: ResourceDeletion retrycount: 5 retryinterval: 10s sharedSecretName: fence-agents-credentials-shared sharedparameters: --action: reboot --ip: 192.168.123.1 --lanplus: "" --username: admin timeout: 5m0s status: conditions: - lastTransitionTime: "2025-09-05T19:27:51Z" message: FAR CR was found, its name matches one of the cluster nodes, and a finalizer was set to the CR reason: RemediationStarted status: "True" type: Processing - lastTransitionTime: "2025-09-05T19:27:51Z" message: FAR CR was found, its name matches one of the cluster nodes, and a finalizer was set to the CR reason: RemediationStarted status: Unknown type: FenceAgentActionSucceeded - lastTransitionTime: "2025-09-05T19:27:51Z" message: FAR CR was found, its name matches one of the cluster nodes, and a finalizer was set to the CR reason: RemediationStarted status: Unknown type: Succeeded lastUpdateTime: "2025-09-05T19:27:51Z" kind: List metadata: resourceVersion: "" =================================================================================================================================== [kni@cert-rhosp-02 ~]$ oc get nodes -l 'node-role.kubernetes.io/worker' NAME STATUS ROLES AGE VERSION worker-0-0 Ready worker 10h v1.33.3 worker-0-1 Ready worker 10h v1.33.3 worker-0-2 Ready worker 10h v1.33.3 =================================================================================================================================== [kni@cert-rhosp-02 ~]$ oc get nodes/worker-0-2 -o json | jq .spec.taints null [kni@cert-rhosp-02 ~]$ oc debug node/worker-0-2 -- chroot /host bash -c "uptime -s" Temporary namespace openshift-debug-qjbsm is created for debugging node... Starting pod/worker-0-2-debug-f5qzh ... To use host binaries, run `chroot /host` 2025-09-05 19:28:00 Removing debug pod ... Temporary namespace openshift-debug-qjbsm was removed. FAR Logs: 2025-09-05T19:10:51.22260076Z DEBUG events fence-agents-remediation-controller-manager-748dc69b5b-sxh2m_91b3bd89-c7fb-4e5c-8049-2987806f7f35 became leader {"type": "Normal", "object": {"kind":"Lease","namespace":"openshift-workload-availability","name":"cb305759.medik8s.io","uid":"7d23e64b-9fc9-420b-8a6f-c8ccecdcc418","apiVersion":"coordination.k8s.io/v1","resourceVersion":"237256"}, "reason": "LeaderElection"} 2025-09-05T19:10:51.324730786Z INFO Starting workers {"controller": "fenceagentsremediation", "controllerGroup": "fence-agents-remediation.medik8s.io", "controllerKind": "FenceAgentsRemediation", "worker count": 1} 2025-09-05T19:17:50.451527254Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-05T19:17:50.451562409Z INFO controllers.FenceAgentsRemediation Check FAR CR's name 2025-09-05T19:17:50.561621159Z INFO controllers.FenceAgentsRemediation Finalizer was added {"CR Name": "worker-0-2-ncvhm"} 2025-09-05T19:17:50.561651158Z INFO controllers.FenceAgentsRemediation Updating Status Condition {"processingConditionStatus": "True", "fenceAgentActionSucceededConditionStatus": "Unknown", "succeededConditionStatus": "Unknown", "reason": "RemediationStarted", "LastUpdateTime": "2025-09-05T19:17:50.561649723Z"} 2025-09-05T19:17:50.561875071Z DEBUG events [remediation] Remediation started {"type": "Normal", "object": {"kind":"FenceAgentsRemediation","namespace":"openshift-workload-availability","name":"worker-0-2-ncvhm","uid":"78587e5a-99fa-436d-b5de-604a7e08364e","apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","resourceVersion":"239709"}, "reason": "RemediationStarted"} 2025-09-05T19:17:50.56191298Z DEBUG events [remediation] Finalizer was added {"type": "Normal", "object": {"kind":"FenceAgentsRemediation","namespace":"openshift-workload-availability","name":"worker-0-2-ncvhm","uid":"78587e5a-99fa-436d-b5de-604a7e08364e","apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","resourceVersion":"239709"}, "reason": "AddFinalizer"} 2025-09-05T19:17:50.767810712Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile 2025-09-05T19:17:50.7678974Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-05T19:17:50.767913147Z INFO controllers.FenceAgentsRemediation Check FAR CR's name 2025-09-05T19:17:50.775418281Z INFO taints Taint was added {"taint effect": "NoExecute", "taint list": [{"key":"node.kubernetes.io/unreachable","effect":"NoSchedule","timeAdded":"2025-09-05T19:17:19Z"},{"key":"node.kubernetes.io/unreachable","effect":"NoExecute","timeAdded":"2025-09-05T19:17:25Z"},{"key":"medik8s.io/fence-agents-remediation","effect":"NoExecute","timeAdded":"2025-09-05T19:17:50Z"}]} 2025-09-05T19:17:50.775506961Z INFO controllers.FenceAgentsRemediation FAR remediation taint was added {"Node Name": "worker-0-2"} 2025-09-05T19:17:50.775562536Z INFO controllers.FenceAgentsRemediation Build fence agent command line {"Fence Agent": "fence_ipmilan", "Node Name": "worker-0-2"} 2025-09-05T19:17:50.775687012Z DEBUG events [remediation] Remediation taint was added {"type": "Normal", "object": {"kind":"Node","name":"worker-0-2","uid":"681d8f9a-ac99-4792-bf19-b7824aee5f6c","apiVersion":"v1","resourceVersion":"239512"}, "reason": "AddRemediationTaint"} 2025-09-05T19:17:50.975970602Z INFO controllers.FenceAgentsRemediation Execute the fence agent {"Fence Agent": "fence_ipmilan", "Node Name": "worker-0-2", "FAR uid": "78587e5a-99fa-436d-b5de-604a7e08364e", "ParametersError": "json: unsupported type: iter.Seq[github.com/medik8s/fence-agents-remediation/api/v1alpha1.ParameterName]"} 2025-09-05T19:17:50.976152561Z INFO executer fence agent start {"uid": "78587e5a-99fa-436d-b5de-604a7e08364e", "fence_agent": "fence_ipmilan", "retryCount": 5, "retryInterval": "10s", "timeout": "5m0s"} 2025-09-05T19:17:50.976258062Z DEBUG events [remediation] Fence agent was executed {"type": "Normal", "object": {"kind":"FenceAgentsRemediation","namespace":"openshift-workload-availability","name":"worker-0-2-ncvhm","uid":"78587e5a-99fa-436d-b5de-604a7e08364e","apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","resourceVersion":"239711"}, "reason": "FenceAgentExecuted"} 2025-09-05T19:17:50.99405439Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile 2025-09-05T19:17:50.99413384Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-05T19:17:50.994149331Z INFO controllers.FenceAgentsRemediation Check FAR CR's name 2025-09-05T19:17:50.994217178Z INFO controllers.FenceAgentsRemediation A Fence Agent is already running {"Fence Agent": "fence_ipmilan", "Node Name": "worker-0-2", "FAR uid": "78587e5a-99fa-436d-b5de-604a7e08364e"} 2025-09-05T19:17:51.005002542Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile 2025-09-05T19:17:56.048722429Z INFO executer command completed {"uid": "78587e5a-99fa-436d-b5de-604a7e08364e", "response": "Success: Rebooted\n", "errMessage": "", "err": null} 2025-09-05T19:17:56.048818554Z INFO executer fence agent done {"uid": "78587e5a-99fa-436d-b5de-604a7e08364e", "fence_agent": "fence_ipmilan", "stdout": "Success: Rebooted\n", "stderr": "", "err": null} 2025-09-05T19:17:56.048835674Z INFO executer updating status {"FAR uid": "78587e5a-99fa-436d-b5de-604a7e08364e"} 2025-09-05T19:17:56.048897738Z INFO executer Updating Status Condition {"processingConditionStatus": "", "fenceAgentActionSucceededConditionStatus": "True", "succeededConditionStatus": "", "reason": "FenceAgentSucceeded", "LastUpdateTime": "2025-09-05T19:17:56.048896992Z"} 2025-09-05T19:17:56.049194729Z DEBUG events [remediation] Fence agent was succeeded {"type": "Normal", "object": {"kind":"FenceAgentsRemediation","namespace":"openshift-workload-availability","name":"worker-0-2-ncvhm","uid":"78587e5a-99fa-436d-b5de-604a7e08364e","apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","resourceVersion":"239711"}, "reason": "FenceAgentSucceeded"} 2025-09-05T19:17:56.056815649Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-05T19:17:56.056889094Z INFO controllers.FenceAgentsRemediation Check FAR CR's name 2025-09-05T19:17:56.056946382Z INFO controllers.FenceAgentsRemediation Remediation strategy is ResourceDeletion which explicitly deletes resources - manually deleting workload {"Node Name": "worker-0-2-ncvhm"} 2025-09-05T19:17:56.057035494Z DEBUG events [remediation] Manually delete pods from the unhealthy node {"type": "Normal", "object": {"kind":"Node","name":"worker-0-2","uid":"681d8f9a-ac99-4792-bf19-b7824aee5f6c","apiVersion":"v1","resourceVersion":"239713"}, "reason": "DeleteResources"} 2025-09-05T19:17:56.057090026Z INFO executer status updated {"FAR uid": "78587e5a-99fa-436d-b5de-604a7e08364e"} 2025-09-05T19:17:56.158469519Z INFO commons-resource starting to delete pods {"node name": "worker-0-2"} 2025-09-05T19:17:58.511081169Z INFO commons-resource done deleting pods {"node name": "worker-0-2"} 2025-09-05T19:17:58.511110369Z INFO controllers.FenceAgentsRemediation Updating Status Condition {"processingConditionStatus": "False", "fenceAgentActionSucceededConditionStatus": "", "succeededConditionStatus": "True", "reason": "RemediationFinishedSuccessfully", "LastUpdateTime": "2025-09-05T19:17:58.511108983Z"} 2025-09-05T19:17:58.511127841Z INFO executer cancelling fence agent routine {"uid": "78587e5a-99fa-436d-b5de-604a7e08364e"} 2025-09-05T19:17:58.511134034Z INFO controllers.FenceAgentsRemediation FenceAgentsRemediation CR has completed to remediate the node {"Node Name": "worker-0-2"} 2025-09-05T19:17:58.511250689Z DEBUG events [remediation] Unhealthy node remediation was completed {"type": "Normal", "object": {"kind":"Node","name":"worker-0-2","uid":"681d8f9a-ac99-4792-bf19-b7824aee5f6c","apiVersion":"v1","resourceVersion":"239713"}, "reason": "NodeRemediationCompleted"} 2025-09-05T19:17:58.511320877Z DEBUG events [remediation] Remediation finished {"type": "Normal", "object": {"kind":"FenceAgentsRemediation","namespace":"openshift-workload-availability","name":"worker-0-2-ncvhm","uid":"78587e5a-99fa-436d-b5de-604a7e08364e","apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","resourceVersion":"240195"}, "reason": "RemediationFinished"} 2025-09-05T19:17:58.716779404Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile 2025-09-05T19:17:58.7168535Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-05T19:17:58.716863082Z INFO controllers.FenceAgentsRemediation Check FAR CR's name 2025-09-05T19:17:58.722539763Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile 2025-09-05T19:19:12.139034431Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-05T19:19:12.139153641Z INFO controllers.FenceAgentsRemediation Check FAR CR's name 2025-09-05T19:19:12.139207498Z INFO controllers.FenceAgentsRemediation CR's deletion timestamp is not zero, and FAR finalizer exists {"CR Name": "worker-0-2-ncvhm"} 2025-09-05T19:19:12.145030014Z INFO taints Taint was removed {"taint effect": "NoExecute", "taint list": [{"key":"node.kubernetes.io/not-ready","effect":"NoExecute","timeAdded":"2025-09-05T19:19:10Z"}]} 2025-09-05T19:19:12.145070177Z INFO controllers.FenceAgentsRemediation FAR remediation taint was removed {"Node Name": "worker-0-2"} 2025-09-05T19:19:12.145192739Z DEBUG events [remediation] Remediation taint was removed {"type": "Normal", "object": {"kind":"Node","name":"worker-0-2","uid":"681d8f9a-ac99-4792-bf19-b7824aee5f6c","apiVersion":"v1","resourceVersion":"240984"}, "reason": "RemoveRemediationTaint"} 2025-09-05T19:19:12.157876781Z INFO controllers.FenceAgentsRemediation Finalizer was removed {"CR Name": "worker-0-2-ncvhm"} 2025-09-05T19:19:12.157922037Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile 2025-09-05T19:19:12.157940929Z DEBUG events [remediation] Finalizer was removed {"type": "Normal", "object": {"kind":"FenceAgentsRemediation","namespace":"openshift-workload-availability","name":"worker-0-2-ncvhm","uid":"78587e5a-99fa-436d-b5de-604a7e08364e","apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","resourceVersion":"241008"}, "reason": "RemoveFinalizer"} 2025-09-05T19:19:12.158178295Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-05T19:19:12.158224227Z INFO controllers.FenceAgentsRemediation FenceAgentsRemediation CR was not found {"CR Name": "worker-0-2-ncvhm", "CR Namespace": "openshift-workload-availability"} 2025-09-05T19:19:12.158243849Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile