[kni@cert-rhosp-02 ~]$ vi test.yaml [kni@cert-rhosp-02 ~]$ cat test.yaml apiVersion: fence-agents-remediation.medik8s.io/v1alpha1 kind: FenceAgentsRemediationTemplate metadata: name: fenceagentsremediationtemplate-test namespace: openshift-workload-availability spec: template: spec: agent: fence_ipmilan retrycount: 5 retryinterval: 10s timeout: 300s nodeparameters: '--ipport': master-0-0: '6230' master-0-1: '6231' master-0-2: '6232' worker-0-0: '6233' worker-0-1: '6234' worker-0-2: '6235' sharedparameters: '--action': reboot '--lanplus': '' '--ip': 192.168.123.1 '--password': password '--username': admin --- apiVersion: remediation.medik8s.io/v1alpha1 kind: NodeHealthCheck metadata: name: nhc-far-worker spec: maxUnhealthy: 30% remediationTemplate: apiVersion: fence-agents-remediation.medik8s.io/v1alpha1 kind: FenceAgentsRemediationTemplate name: fenceagentsremediationtemplate-test namespace: openshift-workload-availability selector: matchExpressions: - key: node-role.kubernetes.io/control-plane operator: DoesNotExist values: [] - key: node-role.kubernetes.io/master operator: DoesNotExist values: [] unhealthyConditions: - duration: 30s status: 'False' type: Ready - duration: 30s status: Unknown type: Ready [kni@cert-rhosp-02 ~]$ oc apply -f test.yaml [kni@cert-rhosp-02 ~]$ oc get clusterversion NAME VERSION AVAILABLE PROGRESSING SINCE STATUS version 4.16.0-0.nightly-2025-09-06-014223 True False 2d Cluster version is 4.16.0-0.nightly-2025-09-06-014223 [kni@cert-rhosp-02 ~]$ oc get csv NAME DISPLAY VERSION REPLACES PHASE fence-agents-remediation.v0.6.0 Fence Agents Remediation Operator 0.6.0 fence-agents-remediation.v0.5.1 Succeeded node-healthcheck-operator.v0.10.0 Node Health Check Operator 0.10.0 node-healthcheck-operator.v0.9.1 Succeeded [kni@cert-rhosp-02 ~]$ PODS=$(oc get pods -o name | grep fence-agents-remediation-controller-manager) [kni@cert-rhosp-02 ~]$ # PODS=$(oc get pods -o name | grep self-node) [kni@cert-rhosp-02 ~]$ for p in $PODS; do > echo "== $p" > oc get "$p" -o json | jq .spec.nodeName > done == pod/fence-agents-remediation-controller-manager-5f76bb6467-ms9xp "worker-0-2" == pod/fence-agents-remediation-controller-manager-5f76bb6467-rs9ns "worker-0-1" [kni@cert-rhosp-02 ~]$ oc debug node/worker-0-0 -- chroot /host bash -c "uptime -s" Temporary namespace openshift-debug-rwbc2 is created for debugging node... Starting pod/worker-0-0-debug-fpktv ... To use host binaries, run `chroot /host` 2025-09-09 19:35:02 Removing debug pod ... Temporary namespace openshift-debug-rwbc2 was removed. [kni@cert-rhosp-02 ~]$ oc debug node/worker-0-0 -- chroot /host bash -c "systemctl stop kubelet" Temporary namespace openshift-debug-6wjwd is created for debugging node... Starting pod/worker-0-0-debug-z5xjd ... [kni@cert-rhosp-02 ~]$ oc get nodes -l 'node-role.kubernetes.io/worker' NAME STATUS ROLES AGE VERSION worker-0-0 Ready worker 2d1h v1.29.14+c68a663 worker-0-1 Ready worker 2d1h v1.29.14+c68a663 worker-0-2 Ready worker 2d1h v1.29.14+c68a663 [kni@cert-rhosp-02 ~]$ oc get nodes -l 'node-role.kubernetes.io/worker' NAME STATUS ROLES AGE VERSION worker-0-0 NotReady worker 2d1h v1.29.14+c68a663 worker-0-1 Ready worker 2d1h v1.29.14+c68a663 worker-0-2 Ready worker 2d1h v1.29.14+c68a663 [kni@cert-rhosp-02 ~]$ oc get far -o yaml apiVersion: v1 items: - apiVersion: fence-agents-remediation.medik8s.io/v1alpha1 kind: FenceAgentsRemediation metadata: annotations: remediation.medik8s.io/node-name: worker-0-0 remediation.medik8s.io/template-name: fenceagentsremediationtemplate-test creationTimestamp: "2025-09-09T21:46:36Z" finalizers: - fence-agents-remediation.medik8s.io/far-finalizer generateName: worker-0-0- generation: 1 labels: app.kubernetes.io/part-of: node-healthcheck-controller name: worker-0-0-q89z8 namespace: openshift-workload-availability ownerReferences: - apiVersion: remediation.medik8s.io/v1alpha1 controller: false kind: NodeHealthCheck name: nhc-far-worker uid: a8b4ecb5-488a-4f31-8f1b-0b786364e2a0 resourceVersion: "1202950" uid: 3644295c-03ae-4fbf-ba1a-753f8d176057 spec: agent: fence_ipmilan nodeparameters: --ipport: master-0-0: "6230" master-0-1: "6231" master-0-2: "6232" worker-0-0: "6233" worker-0-1: "6234" worker-0-2: "6235" remediationStrategy: ResourceDeletion retrycount: 5 retryinterval: 10s sharedSecretName: fence-agents-credentials-shared sharedparameters: --action: reboot --ip: 192.168.123.1 --lanplus: "" --password: password --username: admin timeout: 5m0s status: conditions: - lastTransitionTime: "2025-09-09T21:46:44Z" message: The unhealthy node was fully remediated (it was tainted, fenced using the fence agent and all the node resources have been deleted) reason: RemediationFinishedSuccessfully status: "False" type: Processing - lastTransitionTime: "2025-09-09T21:46:41Z" message: FAR taint was added and the fence agent command has been created and executed successfully reason: FenceAgentSucceeded status: "True" type: FenceAgentActionSucceeded - lastTransitionTime: "2025-09-09T21:46:44Z" message: The unhealthy node was fully remediated (it was tainted, fenced using the fence agent and all the node resources have been deleted) reason: RemediationFinishedSuccessfully status: "True" type: Succeeded lastUpdateTime: "2025-09-09T21:46:44Z" kind: List metadata: resourceVersion: "" [kni@cert-rhosp-02 ~]$ oc debug node/worker-0-0 -- chroot /host bash -c "uptime -s" Temporary namespace openshift-debug-hzcs5 is created for debugging node... Starting pod/worker-0-0-debug-llw7x ... To use host binaries, run `chroot /host` 2025-09-09 21:46:45 Removing debug pod ... Temporary namespace openshift-debug-hzcs5 was removed. [kni@cert-rhosp-02 ~]$ oc logs pod/fence-agents-remediation-controller-manager-5f76bb6467-ms9xp 2025-09-09T21:23:28.478270079Z INFO setup Go Version: go1.24.4 (Red Hat 1.24.4-2.el9) X:strictfipsruntime 2025-09-09T21:23:28.478381216Z INFO setup Go OS/Arch: linux/amd64 2025-09-09T21:23:28.4783843Z INFO setup Operator Version: bd73055e 2025-09-09T21:23:28.478386419Z INFO setup Git Commit: bd73055ef2c68bfdc865d2c54179f4448bd454da 2025-09-09T21:23:28.478388419Z INFO setup Build Date: 2025-09-08T09:09:10+00:00 2025-09-09T21:23:28.478404767Z INFO setup HTTP/2 for webhooks disabled 2025-09-09T21:23:28.492110464Z INFO validation out of service taint strategy {"isSupported": true, "k8sMajorVersion": 1, "k8sMinorVersion": 29} 2025-09-09T21:23:28.492149684Z INFO setup out-of-service taint is supported on this cluster 2025-09-09T21:23:28.492193382Z INFO controller-runtime.builder skip registering a mutating webhook, object does not implement admission.Defaulter or WithDefaulter wasn't called {"GVK": "fence-agents-remediation.medik8s.io/v1alpha1, Kind=FenceAgentsRemediation"} 2025-09-09T21:23:28.492232952Z INFO controller-runtime.builder Registering a validating webhook {"GVK": "fence-agents-remediation.medik8s.io/v1alpha1, Kind=FenceAgentsRemediation", "path": "/validate-fence-agents-remediation-medik8s-io-v1alpha1-fenceagentsremediation"} 2025-09-09T21:23:28.492331877Z INFO controller-runtime.webhook Registering webhook {"path": "/validate-fence-agents-remediation-medik8s-io-v1alpha1-fenceagentsremediation"} 2025-09-09T21:23:28.49239411Z INFO controller-runtime.builder Registering a mutating webhook {"GVK": "fence-agents-remediation.medik8s.io/v1alpha1, Kind=FenceAgentsRemediationTemplate", "path": "/mutate-fence-agents-remediation-medik8s-io-v1alpha1-fenceagentsremediationtemplate"} 2025-09-09T21:23:28.492442465Z INFO controller-runtime.webhook Registering webhook {"path": "/mutate-fence-agents-remediation-medik8s-io-v1alpha1-fenceagentsremediationtemplate"} 2025-09-09T21:23:28.492468272Z INFO controller-runtime.builder Registering a validating webhook {"GVK": "fence-agents-remediation.medik8s.io/v1alpha1, Kind=FenceAgentsRemediationTemplate", "path": "/validate-fence-agents-remediation-medik8s-io-v1alpha1-fenceagentsremediationtemplate"} 2025-09-09T21:23:28.492517409Z INFO controller-runtime.webhook Registering webhook {"path": "/validate-fence-agents-remediation-medik8s-io-v1alpha1-fenceagentsremediationtemplate"} 2025-09-09T21:23:28.492541307Z INFO setup starting manager 2025-09-09T21:23:28.492661616Z INFO controller-runtime.metrics Starting metrics server 2025-09-09T21:23:28.492773285Z INFO controller-runtime.metrics Serving metrics server {"bindAddress": ":8080", "secure": false} 2025-09-09T21:23:28.492773404Z INFO starting server {"name": "health probe", "addr": "[::]:8081"} 2025-09-09T21:23:28.492874544Z INFO controller-runtime.webhook Starting webhook server I0909 21:23:28.492991 1 leaderelection.go:257] attempting to acquire leader lease openshift-workload-availability/cb305759.medik8s.io... 2025-09-09T21:23:28.493196258Z INFO controller-runtime.certwatcher Updated current TLS certificate 2025-09-09T21:23:28.493264929Z INFO controller-runtime.webhook Serving webhook server {"host": "", "port": 9443} 2025-09-09T21:23:28.493308923Z INFO controller-runtime.certwatcher Starting certificate poll+watcher {"interval": "10s"} I0909 21:23:44.995636 1 leaderelection.go:271] successfully acquired lease openshift-workload-availability/cb305759.medik8s.io 2025-09-09T21:23:44.995692864Z DEBUG events fence-agents-remediation-controller-manager-5f76bb6467-ms9xp_8b344978-0def-4ce2-ac9f-7a77344f3a72 became leader {"type": "Normal", "object": {"kind":"Lease","namespace":"openshift-workload-availability","name":"cb305759.medik8s.io","uid":"9bcd0029-2dec-4ea9-a827-779095b67b9d","apiVersion":"coordination.k8s.io/v1","resourceVersion":"1193797"}, "reason": "LeaderElection"} 2025-09-09T21:23:45.0071678Z INFO Starting EventSource {"controller": "fenceagentsremediation", "controllerGroup": "fence-agents-remediation.medik8s.io", "controllerKind": "FenceAgentsRemediation", "source": "kind source: *v1alpha1.FenceAgentsRemediation"} 2025-09-09T21:23:45.007202196Z INFO Starting Controller {"controller": "fenceagentsremediation", "controllerGroup": "fence-agents-remediation.medik8s.io", "controllerKind": "FenceAgentsRemediation"} 2025-09-09T21:23:45.108910182Z INFO Starting workers {"controller": "fenceagentsremediation", "controllerGroup": "fence-agents-remediation.medik8s.io", "controllerKind": "FenceAgentsRemediation", "worker count": 1} 2025-09-09T21:33:58.835586941Z INFO fenceagentsremediationtemplate-resource default {"name": "fenceagentsremediationtemplate-test"} 2025-09-09T21:33:58.839439112Z INFO fenceagentsremediationtemplate-resource validate create {"name": "fenceagentsremediationtemplate-test"} 2025-09-09T21:46:36.246130484Z INFO fenceagentsremediation-resource validate create {"name": "worker-0-0-q89z8"} 2025-09-09T21:46:36.251923121Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-09T21:46:36.252035717Z INFO controllers.FenceAgentsRemediation Check FAR CR's name 2025-09-09T21:46:36.363628236Z INFO controllers.FenceAgentsRemediation Finalizer was added {"CR Name": "worker-0-0-q89z8"} 2025-09-09T21:46:36.363667327Z INFO controllers.FenceAgentsRemediation Updating Status Condition {"processingConditionStatus": "True", "fenceAgentActionSucceededConditionStatus": "Unknown", "succeededConditionStatus": "Unknown", "reason": "RemediationStarted", "LastUpdateTime": "2025-09-09T21:46:36.363665905Z"} 2025-09-09T21:46:36.363746243Z DEBUG events [remediation] Remediation started {"type": "Normal", "object": {"kind":"FenceAgentsRemediation","namespace":"openshift-workload-availability","name":"worker-0-0-q89z8","uid":"3644295c-03ae-4fbf-ba1a-753f8d176057","apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","resourceVersion":"1202358"}, "reason": "RemediationStarted"} 2025-09-09T21:46:36.36379341Z DEBUG events [remediation] Finalizer was added {"type": "Normal", "object": {"kind":"FenceAgentsRemediation","namespace":"openshift-workload-availability","name":"worker-0-0-q89z8","uid":"3644295c-03ae-4fbf-ba1a-753f8d176057","apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","resourceVersion":"1202358"}, "reason": "AddFinalizer"} 2025-09-09T21:46:36.56898018Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile 2025-09-09T21:46:36.569084296Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-09T21:46:36.569097184Z INFO controllers.FenceAgentsRemediation Check FAR CR's name 2025-09-09T21:46:36.576345895Z INFO taints Taint was added {"taint effect": "NoExecute", "taint list": [{"key":"node.kubernetes.io/unreachable","effect":"NoSchedule","timeAdded":"2025-09-09T21:46:05Z"},{"key":"node.kubernetes.io/unreachable","effect":"NoExecute","timeAdded":"2025-09-09T21:46:11Z"},{"key":"medik8s.io/fence-agents-remediation","effect":"NoExecute","timeAdded":"2025-09-09T21:46:36Z"}]} 2025-09-09T21:46:36.576448575Z INFO controllers.FenceAgentsRemediation FAR remediation taint was added {"Node Name": "worker-0-0"} 2025-09-09T21:46:36.576482285Z INFO controllers.FenceAgentsRemediation Build fence agent command line {"Fence Agent": "fence_ipmilan", "Node Name": "worker-0-0"} 2025-09-09T21:46:36.57657468Z DEBUG events [remediation] Remediation taint was added {"type": "Normal", "object": {"kind":"Node","name":"worker-0-0","uid":"6cad2ea6-5cf6-43bf-ac77-02602be4e92b","apiVersion":"v1","resourceVersion":"1202202"}, "reason": "AddRemediationTaint"} 2025-09-09T21:46:36.777640923Z INFO controllers.FenceAgentsRemediation Execute the fence agent {"Fence Agent": "fence_ipmilan", "Node Name": "worker-0-0", "FAR uid": "3644295c-03ae-4fbf-ba1a-753f8d176057", "Parameters": ["--action","--ipport","--ip","--lanplus","--password","--username"]} 2025-09-09T21:46:36.777846014Z INFO executer fence agent start {"uid": "3644295c-03ae-4fbf-ba1a-753f8d176057", "fence_agent": "fence_ipmilan", "retryCount": 5, "retryInterval": "10s", "timeout": "5m0s"} 2025-09-09T21:46:36.777919949Z DEBUG events [remediation] Fence agent was executed {"type": "Normal", "object": {"kind":"FenceAgentsRemediation","namespace":"openshift-workload-availability","name":"worker-0-0-q89z8","uid":"3644295c-03ae-4fbf-ba1a-753f8d176057","apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","resourceVersion":"1202359"}, "reason": "FenceAgentExecuted"} 2025-09-09T21:46:36.785792356Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile 2025-09-09T21:46:36.785878191Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-09T21:46:36.785937208Z INFO controllers.FenceAgentsRemediation Check FAR CR's name 2025-09-09T21:46:36.785988096Z INFO controllers.FenceAgentsRemediation A Fence Agent is already running {"Fence Agent": "fence_ipmilan", "Node Name": "worker-0-0", "FAR uid": "3644295c-03ae-4fbf-ba1a-753f8d176057"} 2025-09-09T21:46:36.792589259Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile 2025-09-09T21:46:41.780755093Z INFO executer command completed {"uid": "3644295c-03ae-4fbf-ba1a-753f8d176057", "response": "Success: Rebooted\n", "errMessage": "", "err": null} 2025-09-09T21:46:41.780904243Z INFO executer fence agent done {"uid": "3644295c-03ae-4fbf-ba1a-753f8d176057", "fence_agent": "fence_ipmilan", "stdout": "Success: Rebooted\n", "stderr": "", "err": null} 2025-09-09T21:46:41.780926287Z INFO executer updating status {"FAR uid": "3644295c-03ae-4fbf-ba1a-753f8d176057"} 2025-09-09T21:46:41.781011027Z INFO executer Updating Status Condition {"processingConditionStatus": "", "fenceAgentActionSucceededConditionStatus": "True", "succeededConditionStatus": "", "reason": "FenceAgentSucceeded", "LastUpdateTime": "2025-09-09T21:46:41.78101006Z"} 2025-09-09T21:46:41.781055449Z DEBUG events [remediation] Fence agent was succeeded {"type": "Normal", "object": {"kind":"FenceAgentsRemediation","namespace":"openshift-workload-availability","name":"worker-0-0-q89z8","uid":"3644295c-03ae-4fbf-ba1a-753f8d176057","apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","resourceVersion":"1202359"}, "reason": "FenceAgentSucceeded"} 2025-09-09T21:46:41.786589852Z INFO executer status updated {"FAR uid": "3644295c-03ae-4fbf-ba1a-753f8d176057"} 2025-09-09T21:46:41.786678353Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-09T21:46:41.786722355Z INFO controllers.FenceAgentsRemediation Check FAR CR's name 2025-09-09T21:46:41.786790184Z INFO controllers.FenceAgentsRemediation Remediation strategy is ResourceDeletion which explicitly deletes resources - manually deleting workload {"Node Name": "worker-0-0-q89z8"} 2025-09-09T21:46:41.78687054Z DEBUG events [remediation] Manually delete pods from the unhealthy node {"type": "Normal", "object": {"kind":"Node","name":"worker-0-0","uid":"6cad2ea6-5cf6-43bf-ac77-02602be4e92b","apiVersion":"v1","resourceVersion":"1202363"}, "reason": "DeleteResources"} 2025-09-09T21:46:41.88721194Z INFO commons-resource starting to delete pods {"node name": "worker-0-0"} 2025-09-09T21:46:44.153405635Z INFO commons-resource done deleting pods {"node name": "worker-0-0"} 2025-09-09T21:46:44.153520509Z INFO controllers.FenceAgentsRemediation Updating Status Condition {"processingConditionStatus": "False", "fenceAgentActionSucceededConditionStatus": "", "succeededConditionStatus": "True", "reason": "RemediationFinishedSuccessfully", "LastUpdateTime": "2025-09-09T21:46:44.153519043Z"} 2025-09-09T21:46:44.153554942Z INFO executer cancelling fence agent routine {"uid": "3644295c-03ae-4fbf-ba1a-753f8d176057"} 2025-09-09T21:46:44.153577137Z INFO controllers.FenceAgentsRemediation FenceAgentsRemediation CR has completed to remediate the node {"Node Name": "worker-0-0"} 2025-09-09T21:46:44.153914557Z DEBUG events [remediation] Unhealthy node remediation was completed {"type": "Normal", "object": {"kind":"Node","name":"worker-0-0","uid":"6cad2ea6-5cf6-43bf-ac77-02602be4e92b","apiVersion":"v1","resourceVersion":"1202363"}, "reason": "NodeRemediationCompleted"} 2025-09-09T21:46:44.153960072Z DEBUG events [remediation] Remediation finished {"type": "Normal", "object": {"kind":"FenceAgentsRemediation","namespace":"openshift-workload-availability","name":"worker-0-0-q89z8","uid":"3644295c-03ae-4fbf-ba1a-753f8d176057","apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","resourceVersion":"1202718"}, "reason": "RemediationFinished"} 2025-09-09T21:46:44.363825755Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile 2025-09-09T21:46:44.363972387Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-09T21:46:44.363996065Z INFO controllers.FenceAgentsRemediation Check FAR CR's name 2025-09-09T21:46:44.368926301Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile 2025-09-09T21:47:59.33281243Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-09T21:47:59.332944239Z INFO controllers.FenceAgentsRemediation Check FAR CR's name 2025-09-09T21:47:59.332981276Z INFO controllers.FenceAgentsRemediation CR's deletion timestamp is not zero, and FAR finalizer exists {"CR Name": "worker-0-0-q89z8"} 2025-09-09T21:47:59.339900081Z INFO taints Taint was removed {"taint effect": "NoExecute", "taint list": [{"key":"node.kubernetes.io/unreachable","effect":"NoExecute","timeAdded":"2025-09-09T21:46:11Z"}]} 2025-09-09T21:47:59.339968192Z INFO controllers.FenceAgentsRemediation FAR remediation taint was removed {"Node Name": "worker-0-0"} 2025-09-09T21:47:59.340369913Z DEBUG events [remediation] Remediation taint was removed {"type": "Normal", "object": {"kind":"Node","name":"worker-0-0","uid":"6cad2ea6-5cf6-43bf-ac77-02602be4e92b","apiVersion":"v1","resourceVersion":"1203517"}, "reason": "RemoveRemediationTaint"} 2025-09-09T21:47:59.351693112Z INFO controllers.FenceAgentsRemediation Finalizer was removed {"CR Name": "worker-0-0-q89z8"} 2025-09-09T21:47:59.351867402Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile 2025-09-09T21:47:59.351913414Z DEBUG events [remediation] Finalizer was removed {"type": "Normal", "object": {"kind":"FenceAgentsRemediation","namespace":"openshift-workload-availability","name":"worker-0-0-q89z8","uid":"3644295c-03ae-4fbf-ba1a-753f8d176057","apiVersion":"fence-agents-remediation.medik8s.io/v1alpha1","resourceVersion":"1203519"}, "reason": "RemoveFinalizer"} 2025-09-09T21:47:59.352435357Z INFO controllers.FenceAgentsRemediation Begin FenceAgentsRemediation Reconcile 2025-09-09T21:47:59.352461904Z INFO controllers.FenceAgentsRemediation FenceAgentsRemediation CR was not found{"CR Name": "worker-0-0-q89z8", "CR Namespace": "openshift-workload-availability"} 2025-09-09T21:47:59.35246534Z INFO controllers.FenceAgentsRemediation Finish FenceAgentsRemediation Reconcile [kni@cert-rhosp-02 ~]$