[kni@cert-rhosp-02 ~]$ oc get clusterversion
NAME      VERSION                               AVAILABLE   PROGRESSING   SINCE   STATUS
version   4.18.0-0.nightly-2025-09-03-101304    True        False         23h     Cluster version is 4.18.0-0.nightly-2025-09-03-101304
[kni@cert-rhosp-02 ~]$ oc get csv
NAME                                DISPLAY                               VERSION   REPLACES                           PHASE
fence-agents-remediation.v0.6.0     Fence Agents Remediation Operator     0.6.0     fence-agents-remediation.v0.5.1    Succeeded
node-healthcheck-operator.v0.10.0   Node Health Check Operator            0.10.0    node-healthcheck-operator.v0.9.1   Succeeded
self-node-remediation.v0.10.0       Self Node Remediation Operator        0.10.0    self-node-remediation.v0.9.0       Succeeded
[kni@cert-rhosp-02 ~]$ PODS=$(oc get pods -o name -n openshift-workload-availability | grep node-healthcheck-controller-manager)
[kni@cert-rhosp-02 ~]$ echo $PODS
pod/node-healthcheck-controller-manager-56687f5d99-978vw pod/node-healthcheck-controller-manager-56687f5d99-pj2st
[kni@cert-rhosp-02 ~]$ for p in $PODS; do
> echo "== $p"
> oc get "$p" -n openshift-workload-availability -o json | jq .spec.nodeName
> done
== pod/node-healthcheck-controller-manager-56687f5d99-978vw
"master-0-2"
== pod/node-healthcheck-controller-manager-56687f5d99-pj2st
"master-0-1"
[kni@cert-rhosp-02 ~]$ oc delete pod node-healthcheck-controller-manager-56687f5d99-qgg94; oc delete pod node-healthcheck-controller-manager-56687f5d99-bxhbb
pod "node-healthcheck-controller-manager-56687f5d99-qgg94" deleted
pod "node-healthcheck-controller-manager-56687f5d99-bxhbb" deleted
[kni@cert-rhosp-02 ~]$ PODS=$(oc get pods -o name -n openshift-workload-availability | grep node-healthcheck-controller-manager)
[kni@cert-rhosp-02 ~]$ for p in $PODS; do echo "== $p"; oc get "$p" -n openshift-workload-availability -o json | jq .spec.nodeName; done
== pod/node-healthcheck-controller-manager-56687f5d99-45pp8
"master-0-1"
== pod/node-healthcheck-controller-manager-56687f5d99-nb4hh
"master-0-2"
[kni@cert-rhosp-02 ~]$ oc get nodes -l 'node-role.kubernetes.io/master'
NAME         STATUS   ROLES                  AGE   VERSION
master-0-0   Ready    control-plane,master   24h   v1.31.11
master-0-1   Ready    control-plane,master   24h   v1.31.11
master-0-2   Ready    control-plane,master   24h   v1.31.11
[kni@cert-rhosp-02 ~]$ for p in $PODS; do echo "== $p"; oc get "$p" -n openshift-workload-availability -o json | jq .spec.nodeName; done
== pod/node-healthcheck-controller-manager-56687f5d99-hdv4b
"master-0-1"
== pod/node-healthcheck-controller-manager-56687f5d99-jgwkr
"master-0-0"
== pod/node-healthcheck-controller-manager-56687f5d99-nb4hh
"master-0-2"
[kni@cert-rhosp-02 ~]$ for p in $PODS; do echo "== $p"; oc get "$p" -n openshift-workload-availability -o json | jq .spec.nodeName; done
== pod/node-healthcheck-controller-manager-56687f5d99-jgwkr
"master-0-0"
== pod/node-healthcheck-controller-manager-56687f5d99-nb4hh
"master-0-2"
[kni@cert-rhosp-02 ~]$ oc get csv node-healthcheck-operator.v0.10.0 -o yaml
apiVersion: operators.coreos.com/v1alpha1 kind: ClusterServiceVersion metadata: annotations: alm-examples: |- [ { "apiVersion": "remediation.medik8s.io/v1alpha1", "kind": "NodeHealthCheck", "metadata": { "name": "nodehealthcheck-sample" }, "spec": { "minHealthy": "51%", "remediationTemplate": { "apiVersion": "self-node-remediation.medik8s.io/v1alpha1", "kind": "SelfNodeRemediationTemplate", "name": "self-node-remediation-automatic-strategy-template", "namespace": "openshift-operators" }, "selector": { "matchExpressions": [ { "key": "node-role.kubernetes.io/worker", "operator": "Exists" } ] }, "unhealthyConditions": [ { "duration": "300s", "status": "False", "type": "Ready" }, { "duration": "300s",
"status": "Unknown", "type": "Ready" } ] } } ] capabilities: Basic Install categories: OpenShift Optional console.openshift.io/plugins: '["node-remediation-console-plugin"]' containerImage: registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d createdAt: "2025-09-01 18:07:56" description: Detect failed Nodes and trigger remediation with a remediation operator. features.operators.openshift.io/cnf: "false" features.operators.openshift.io/cni: "false" features.operators.openshift.io/csi: "false" features.operators.openshift.io/disconnected: "true" features.operators.openshift.io/fips-compliant: "true" features.operators.openshift.io/proxy-aware: "false" features.operators.openshift.io/tls-profiles: "false" features.operators.openshift.io/token-auth-aws: "false" features.operators.openshift.io/token-auth-azure: "false" features.operators.openshift.io/token-auth-gcp: "false" olm.operatorGroup: openshift-workload-availability-hjmlc olm.operatorNamespace: openshift-workload-availability olm.skipRange: '>=0.9.0 <0.10.0' operatorframework.io/properties: '{"properties":[{"type":"olm.gvk","value":{"group":"remediation.medik8s.io","kind":"NodeHealthCheck","version":"v1alpha1"}},{"type":"olm.package","value":{"packageName":"node-healthcheck-operator","version":"0.10.0"}}]}' operatorframework.io/suggested-namespace: openshift-workload-availability operatorframework.io/suggested-namespace-template: '{"kind":"Namespace","apiVersion":"v1","metadata":{"name":"openshift-workload-availability","annotations":{"openshift.io/node-selector":""}}}' operators.openshift.io/valid-subscription: '["OpenShift Kubernetes Engine", "OpenShift Container Platform", "OpenShift Platform Plus"]' operators.operatorframework.io/builder: operator-sdk-v1.33.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 repository: https://github.com/medik8s/node-healthcheck-operator support: Red Hat creationTimestamp: "2025-09-11T19:56:09Z" generation: 1 labels: olm.copiedFrom: openshift-workload-availability olm.managed: "true" name: node-healthcheck-operator.v0.10.0 namespace: openshift-machine-api resourceVersion: "428905" uid: 84e911b3-84b6-450f-8087-8bfae1058be2 spec: apiservicedefinitions: {} cleanup: enabled: false customresourcedefinitions: owned: - description: NodeHealthCheck is the Schema for the nodehealthchecks API displayName: Node Health Check kind: NodeHealthCheck name: nodehealthchecks.remediation.medik8s.io resources: - kind: NodeHealthCheck name: nodehealthchecks version: v1alpha1 specDescriptors: - description: "EscalatingRemediations contain a list of ordered remediation templates with a timeout. The remediation templates will be used one after another, until the unhealthy node gets healthy within the timeout of the currently processed remediation. The order of remediation is defined by the \"order\" field of each \"escalatingRemediation\". \n Mutually exclusive with RemediationTemplate" displayName: Escalating Remediations path: escalatingRemediations - description: Order defines the order for this remediation. Remediations with lower order will be used before remediations with higher order. Remediations must not have the same order. displayName: Order path: escalatingRemediations[0].order - description: "RemediationTemplate is a reference to a remediation template provided by a remediation provider. 
\n If a node needs remediation the controller will create an object from this template and then it should be picked up by a remediation provider." displayName: Remediation Template path: escalatingRemediations[0].remediationTemplate - description: "Timeout defines how long NHC will wait for the node getting healthy before the next remediation (if any) will be used. When the last remediation times out, the overall remediation is considered as failed. As a safeguard for preventing parallel remediations, a minimum of 60s is enforced. \n Expects a string of decimal numbers each with optional fraction and a unit suffix, eg \"300ms\", \"1.5h\" or \"2h45m\". Valid time units are \"ns\", \"us\" (or \"µs\"), \"ms\", \"s\", \"m\", \"h\"." displayName: Timeout path: escalatingRemediations[0].timeout - description: HealthyDelay is the time before NHC would allow a node to be healthy again. A negative value means that NHC will never consider the node healthy and a manual intervention is expected displayName: Healthy Delay path: healthyDelay - description: Remediation is allowed if no more than "MaxUnhealthy" nodes selected by "selector" are not healthy. Expects either a non-negative integer value or a percentage value. Percentage values must be positive whole numbers and are capped at 100%. 0% is valid and will block all remediation. MaxUnhealthy should not be used with remediators that delete nodes (e.g. MachineDeletionRemediation), as this breaks the logic for counting healthy and unhealthy nodes. MinHealthy and MaxUnhealthy are configuring the same aspect, and they cannot be used at the same time. displayName: Max Unhealthy path: maxUnhealthy - description: Remediation is allowed if at least "MinHealthy" nodes selected by "selector" are healthy. Expects either a non-negative integer value or a percentage value. Percentage values must be positive whole numbers and are capped at 100%. 100% is valid and will block all remediation. MinHealthy and MaxUnhealthy are configuring the same aspect, and they cannot be used at the same time. displayName: Min Healthy path: minHealthy - description: 'PauseRequests will prevent any new remediation to start, while in-flight remediations keep running. Each entry is free form, and ideally represents the requested party reason for this pausing - i.e: "imaginary-cluster-upgrade-manager-operator"' displayName: Pause Requests path: pauseRequests - description: "RemediationTemplate is a reference to a remediation template provided by an infrastructure provider. \n If a node needs remediation the controller will create an object from this template and then it should be picked up by a remediation provider. \n Mutually exclusive with EscalatingRemediations" displayName: Remediation Template path: remediationTemplate - description: "Label selector to match nodes whose health will be exercised. \n Selecting both control-plane and worker nodes in one NHC CR is highly discouraged and can result in undesired behaviour. \n Note: mandatory now for above reason, but for backwards compatibility existing CRs will continue to work with an empty selector, which matches all nodes." displayName: Selector path: selector - description: UnhealthyConditions contains a list of the conditions that determine whether a node is considered unhealthy. The conditions are combined in a logical OR, i.e. if any of the conditions is met, the node is unhealthy. 
displayName: Unhealthy Conditions path: unhealthyConditions - description: "Duration of the condition specified when a node is considered unhealthy. \n Expects a string of decimal numbers each with optional fraction and a unit suffix, eg \"300ms\", \"1.5h\" or \"2h45m\". Valid time units are \"ns\", \"us\" (or \"µs\"), \"ms\", \"s\", \"m\", \"h\"." displayName: Duration path: unhealthyConditions[0].duration - description: The condition status in the node's status to watch for. Typically False, True or Unknown. displayName: Status path: unhealthyConditions[0].status - description: The condition type in the node's status to watch for. displayName: Type path: unhealthyConditions[0].type statusDescriptors: - description: 'Represents the observations of a NodeHealthCheck''s current state. Known .status.conditions.type are: "Disabled"' displayName: Conditions path: conditions x-descriptors: - urn:alm:descriptor:io.kubernetes.conditions - description: HealthyNodes specified the number of healthy nodes observed displayName: Healthy Nodes path: healthyNodes - description: InFlightRemediations records the timestamp when remediation triggered per node. Deprecated in favour of UnhealthyNodes. displayName: In Flight Remediations path: inFlightRemediations - description: LastUpdateTime is the last time the status was updated. displayName: Last Update Time path: lastUpdateTime - description: ObservedNodes specified the number of nodes observed by using the NHC spec.selector displayName: Observed Nodes path: observedNodes - description: Phase represents the current phase of this Config. Known phases are Disabled, Paused, Remediating and Enabled, based on:\n - the status of the Disabled condition\n - the value of PauseRequests\n - the value of InFlightRemediations displayName: Phase path: phase x-descriptors: - urn:alm:descriptor:io.kubernetes.phase - description: Reason explains the current phase in more detail. displayName: Reason path: reason x-descriptors: - urn:alm:descriptor:io.kubernetes.phase:reason - description: UnhealthyNodes tracks currently unhealthy nodes and their remediations. displayName: Unhealthy Nodes path: unhealthyNodes - description: ConditionsHealthyTimestamp is RFC 3339 date and time at which the unhealthy conditions didn't match anymore. The remediation CR will be deleted at that time, but the node will still be tracked as unhealthy until all remediation CRs are actually deleted, when remediators finished cleanup and removed their finalizers. displayName: Conditions Healthy Timestamp path: unhealthyNodes[0].conditionsHealthyTimestamp - description: HealthyDelayed notes whether a node should be considered healthy, but isn't due to NodeHealthCheckSpec.HealthyDelay configuration. 
displayName: Healthy Delayed path: unhealthyNodes[0].healthyDelayed - description: Name is the name of the unhealthy node displayName: Name path: unhealthyNodes[0].name - description: Remediations tracks the remediations created for this node displayName: Remediations path: unhealthyNodes[0].remediations - description: Resource is the reference to the remediation CR which was created displayName: Resource path: unhealthyNodes[0].remediations[0].resource - description: Started is the creation time of the remediation CR displayName: Started path: unhealthyNodes[0].remediations[0].started - description: TemplateName is required when using several templates of the same kind displayName: Template Name path: unhealthyNodes[0].remediations[0].templateName - description: TimedOut is the time when the remediation timed out. Applicable for escalating remediations only. displayName: Timed Out path: unhealthyNodes[0].remediations[0].timedOut version: v1alpha1 description: | ### Introduction Hardware is imperfect, and software contains bugs. When node level failures such as kernel hangs or dead NICs occur, the work required from the cluster does not decrease - workloads from affected nodes need to be restarted somewhere. However some workloads, such as RWO volumes and StatefulSets, may require at-most-one semantics. Failures affecting these kind of workloads risk data loss and/or corruption if nodes (and the workloads running on them) are assumed to be dead whenever we stop hearing from them. For this reason it is important to know that the node has reached a safe state before initiating recovery of the workload. Unfortunately it is not always practical to require admin intervention in order to confirm the node’s true status. In order to automate the recovery of exclusive workloads, we provide operators for failure detection and remediation. ### Failure detection: Node Health Check operator The “Node Health Check” (NHC) operator checks each Node’s set of NodeConditions (eg. NotReady) against the criteria and thresholds defined in NodeHealthCheck configuration. If the Node is deemed to be in a failed state, NHC will initiate recovery by using the SIG Cluster API's “External Remediation” API to instantiate the configured remediation template which specifies the mechanism/controller to be used. ### Failure handling: External remediators There are multiple remediators for handling node failure that we recommend: - Self Node Remediation (SNR) - Fence Agents Remediation (FAR) - Machine Deletion Remediation (MDR) #### Self Node Remediation (SNR) SNR uses watchdog timers and heuristics to ensure nodes enter a safe state (no longer hosting workloads) within a known and finite period of time, before signaling to the system that all Pods on the failed Node are no longer active and can be relocated elsewhere. In the case of transient errors, the watchdog’s actions will also result in the node rebooting and rejoining the cluster - restoring capacity. #### Fence Agents Remediation (FAR) FAR uses well-known agents to fence unhealthy nodes, and eventually FAR remediates the nodes. The remediation includes rebooting the unhealthy node using a fence agent, and then evicting workloads from the unhealthy node. #### Machine Deletion Remediation (MDR) MDR is limited to OpenShift, and it uses Machine API for reprovisioning unhealthy nodes by deleting their machines. displayName: Node Health Check Operator icon: - base64data: iVBORw0KGgoAAAANSUhEUgA ... ... 
...CYII= mediatype: image/png install: spec: clusterPermissions: - rules: - apiGroups: - "" resources: - namespaces verbs: - create - get - apiGroups: - "" resources: - nodes - pods verbs: - get - list - watch - apiGroups: - apps resources: - deployments verbs: - get - list - watch - apiGroups: - config.openshift.io resources: - clusterversions - featuregates - infrastructures verbs: - get - list - watch - apiGroups: - console.openshift.io resources: - consoleplugins verbs: - create - delete - get - list - patch - update - watch - apiGroups: - coordination.k8s.io resources: - leases verbs: - create - delete - get - list - patch - update - watch - apiGroups: - machine.openshift.io resources: - machinehealthchecks verbs: - get - list - patch - update - watch - apiGroups: - machine.openshift.io resources: - machinehealthchecks/status verbs: - get - patch - update - apiGroups: - machine.openshift.io resources: - machines verbs: - get - list - watch - apiGroups: - policy resources: - poddisruptionbudgets verbs: - get - list - watch - apiGroups: - rbac.authorization.k8s.io resources: - clusterrolebindings - clusterroles verbs: - '*' - apiGroups: - remediation.medik8s.io resources: - nodehealthchecks verbs: - create - delete - get - list - patch - update - watch - apiGroups: - remediation.medik8s.io resources: - nodehealthchecks/finalizers verbs: - update - apiGroups: - remediation.medik8s.io resources: - nodehealthchecks/status verbs: - get - patch - update - apiGroups: - authentication.k8s.io resources: - tokenreviews verbs: - create - apiGroups: - authorization.k8s.io resources: - subjectaccessreviews verbs: - create serviceAccountName: node-healthcheck-controller-manager deployments: - label: app.kubernetes.io/component: controller-manager app.kubernetes.io/name: node-healthcheck-operator name: node-healthcheck-controller-manager spec: replicas: 2 selector: matchLabels: app.kubernetes.io/component: controller-manager app.kubernetes.io/name: node-healthcheck-operator strategy: {} template: metadata: annotations: kubectl.kubernetes.io/default-container: manager creationTimestamp: null labels: app.kubernetes.io/component: controller-manager app.kubernetes.io/name: node-healthcheck-operator spec: affinity: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - preference: matchExpressions: - key: node-role.kubernetes.io/infra operator: Exists weight: 3 - preference: matchExpressions: - key: node-role.kubernetes.io/master operator: Exists weight: 1 - preference: matchExpressions: - key: node-role.kubernetes.io/control-plane operator: Exists weight: 1 containers: - args: - --secure-listen-address=0.0.0.0:8443 - --http2-disable - --upstream=http://127.0.0.1:8080/ - --logtostderr=true - --v=0 - --tls-cert-file=/etc/tls/private/tls.crt - --tls-private-key-file=/etc/tls/private/tls.key image: registry.redhat.io/openshift4/ose-kube-rbac-proxy-rhel9@sha256:3d3333285fd6736d11ea830fb7fe1f2b8d3e304d682a876458e18eb1173f271d name: kube-rbac-proxy ports: - containerPort: 8443 name: https protocol: TCP resources: limits: cpu: 500m memory: 128Mi requests: cpu: 5m memory: 64Mi securityContext: allowPrivilegeEscalation: false capabilities: drop: - ALL volumeMounts: - mountPath: /etc/tls/private name: tls-config readOnly: true - args: - --health-probe-bind-address=:8081 - --metrics-bind-address=127.0.0.1:8080 - --leader-elect command: - /manager env: - name: DEPLOYMENT_NAMESPACE valueFrom: fieldRef: fieldPath: metadata.namespace image: 
registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d livenessProbe: httpGet: path: /healthz port: 8081 initialDelaySeconds: 15 periodSeconds: 20 name: manager readinessProbe: httpGet: path: /readyz port: 8081 initialDelaySeconds: 5 periodSeconds: 10 resources: requests: cpu: 100m memory: 20Mi securityContext: allowPrivilegeEscalation: false capabilities: drop: - ALL priorityClassName: system-cluster-critical securityContext: runAsNonRoot: true seccompProfile: type: RuntimeDefault serviceAccountName: node-healthcheck-controller-manager terminationGracePeriodSeconds: 10 tolerations: - effect: NoSchedule key: node-role.kubernetes.io/master operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/control-plane operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/infra operator: Exists - effect: NoExecute key: node-role.kubernetes.io/infra operator: Exists volumes: - name: tls-config secret: secretName: node-healthcheck-tls - label: app.kubernetes.io/component: node-remediation-console-plugin app.kubernetes.io/name: node-healthcheck-operator name: node-healthcheck-node-remediation-console-plugin spec: replicas: 1 selector: matchLabels: app.kubernetes.io/component: node-remediation-console-plugin app.kubernetes.io/name: node-healthcheck-operator strategy: {} template: metadata: creationTimestamp: null labels: app.kubernetes.io/component: node-remediation-console-plugin app.kubernetes.io/name: node-healthcheck-operator spec: affinity: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - preference: matchExpressions: - key: node-role.kubernetes.io/infra operator: Exists weight: 3 - preference: matchExpressions: - key: node-role.kubernetes.io/master operator: Exists weight: 1 - preference: matchExpressions: - key: node-role.kubernetes.io/control-plane operator: Exists weight: 1 containers: - image: registry.redhat.io/workload-availability/node-remediation-console-rhel9@sha256:cc0f671b126cde10476f8cc2061c6903932e7bf5e3dbf4e8bdda9add70d5deea name: node-remediation-console-plugin ports: - containerPort: 9443 name: nrc-server protocol: TCP resources: requests: cpu: 10m memory: 50Mi securityContext: allowPrivilegeEscalation: false capabilities: drop: - ALL volumeMounts: - mountPath: /var/serving-cert name: nrc-plugin-cert readOnly: true securityContext: runAsNonRoot: true seccompProfile: type: RuntimeDefault tolerations: - effect: NoSchedule key: node-role.kubernetes.io/master operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/control-plane operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/infra operator: Exists - effect: NoExecute key: node-role.kubernetes.io/infra operator: Exists volumes: - name: nrc-plugin-cert secret: defaultMode: 420 secretName: nrc-plugin-cert permissions: - rules: - apiGroups: - "" resources: - configmaps verbs: - get - list - watch - create - update - patch - delete - apiGroups: - coordination.k8s.io resources: - leases verbs: - get - list - watch - create - update - patch - delete - apiGroups: - "" resources: - events verbs: - create - patch serviceAccountName: node-healthcheck-controller-manager strategy: deployment installModes: - supported: false type: OwnNamespace - supported: false type: SingleNamespace - supported: false type: MultiNamespace - supported: true type: AllNamespaces keywords: - NHC - Self Node Remediation - SNR - Remediation - Fencing - medik8s - k8s links: - name: Node Healthcheck Operator url: 
https://access.redhat.com/documentation/en-us/workload_availability_for_red_hat_openshift/25.8/html/remediation_fencing_and_maintenance - name: Source Code url: https://github.com/medik8s/node-healthcheck-operator maintainers: - email: team-dragonfly@redhat.com name: Dragonfly Team maturity: alpha minKubeVersion: 1.20.0 provider: name: Red Hat url: https://www.redhat.com relatedImages: - image: registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d name: manager - image: registry.redhat.io/openshift4/ose-kube-rbac-proxy-rhel9@sha256:3d3333285fd6736d11ea830fb7fe1f2b8d3e304d682a876458e18eb1173f271d name: kube-rbac-proxy replaces: node-healthcheck-operator.v0.9.1 version: 0.10.0 webhookdefinitions: - admissionReviewVersions: - v1 containerPort: 443 deploymentName: node-healthcheck-controller-manager failurePolicy: Fail generateName: vnodehealthcheck.kb.io rules: - apiGroups: - remediation.medik8s.io apiVersions: - v1alpha1 operations: - CREATE - UPDATE - DELETE resources: - nodehealthchecks sideEffects: None targetPort: 9443 type: ValidatingAdmissionWebhook webhookPath: /validate-remediation-medik8s-io-v1alpha1-nodehealthcheck status: certsLastUpdated: "2025-09-11T19:56:11Z" certsRotateAt: "2027-09-10T19:56:10Z" cleanup: {} conditions: - lastTransitionTime: "2025-09-11T19:56:07Z" lastUpdateTime: "2025-09-11T19:56:07Z" message: requirements not yet checked phase: Pending reason: RequirementsUnknown - lastTransitionTime: "2025-09-11T19:56:07Z" lastUpdateTime: "2025-09-11T19:56:07Z" message: one or more requirements couldn't be found phase: Pending reason: RequirementsNotMet - lastTransitionTime: "2025-09-11T19:56:10Z" lastUpdateTime: "2025-09-11T19:56:10Z" message: all requirements found, attempting install phase: InstallReady reason: AllRequirementsMet - lastTransitionTime: "2025-09-11T19:56:10Z" lastUpdateTime: "2025-09-11T19:56:10Z" message: waiting for install components to report healthy phase: Installing reason: InstallSucceeded - lastTransitionTime: "2025-09-11T19:56:10Z" lastUpdateTime: "2025-09-11T19:56:11Z" message: 'installing: waiting for deployment node-healthcheck-controller-manager to become ready: deployment "node-healthcheck-controller-manager" not available: Deployment does not have minimum availability.' 
phase: Installing reason: InstallWaiting - lastTransitionTime: "2025-09-11T19:56:21Z" lastUpdateTime: "2025-09-11T19:56:21Z" message: install strategy completed with no errors phase: Succeeded reason: InstallSucceeded lastTransitionTime: "2025-09-11T19:56:21Z" lastUpdateTime: "2025-09-11T19:56:21Z" message: The operator is running in openshift-workload-availability but is managing this namespace phase: Succeeded reason: Copied requirementStatus: - group: operators.coreos.com kind: ClusterServiceVersion message: CSV minKubeVersion (1.20.0) less than server version (v1.31.11) name: node-healthcheck-operator.v0.10.0 status: Present version: v1alpha1 - group: apiextensions.k8s.io kind: CustomResourceDefinition message: CRD is present and Established condition is true name: nodehealthchecks.remediation.medik8s.io status: Present uuid: 3d57988f-a83a-409c-b968-25d18e0c8b1f version: v1 - dependents: - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["create","get"],"apiGroups":[""],"resources":["namespaces"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","list","watch"],"apiGroups":[""],"resources":["nodes","pods"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","list","watch"],"apiGroups":["apps"],"resources":["deployments"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","list","watch"],"apiGroups":["config.openshift.io"],"resources":["clusterversions","featuregates","infrastructures"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["create","delete","get","list","patch","update","watch"],"apiGroups":["console.openshift.io"],"resources":["consoleplugins"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["create","delete","get","list","patch","update","watch"],"apiGroups":["coordination.k8s.io"],"resources":["leases"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","list","patch","update","watch"],"apiGroups":["machine.openshift.io"],"resources":["machinehealthchecks"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","patch","update"],"apiGroups":["machine.openshift.io"],"resources":["machinehealthchecks/status"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","list","watch"],"apiGroups":["machine.openshift.io"],"resources":["machines"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","list","watch"],"apiGroups":["policy"],"resources":["poddisruptionbudgets"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["*"],"apiGroups":["rbac.authorization.k8s.io"],"resources":["clusterrolebindings","clusterroles"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["create","delete","get","list","patch","update","watch"],"apiGroups":["remediation.medik8s.io"],"resources":["nodehealthchecks"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster 
rule:{"verbs":["update"],"apiGroups":["remediation.medik8s.io"],"resources":["nodehealthchecks/finalizers"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","patch","update"],"apiGroups":["remediation.medik8s.io"],"resources":["nodehealthchecks/status"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["create"],"apiGroups":["authentication.k8s.io"],"resources":["tokenreviews"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["create"],"apiGroups":["authorization.k8s.io"],"resources":["subjectaccessreviews"]} status: Satisfied version: v1 group: "" kind: ServiceAccount message: "" name: node-healthcheck-controller-manager status: Present version: v1 [kni@cert-rhosp-02 ~]$ oc get pods node-healthcheck-controller-manager-56687f5d99-978vw -o yaml apiVersion: v1 kind: Pod metadata: annotations: alm-examples: |- [ { "apiVersion": "remediation.medik8s.io/v1alpha1", "kind": "NodeHealthCheck", "metadata": { "name": "nodehealthcheck-sample" }, "spec": { "minHealthy": "51%", "remediationTemplate": { "apiVersion": "self-node-remediation.medik8s.io/v1alpha1", "kind": "SelfNodeRemediationTemplate", "name": "self-node-remediation-automatic-strategy-template", "namespace": "openshift-operators" }, "selector": { "matchExpressions": [ { "key": "node-role.kubernetes.io/worker", "operator": "Exists" } ] }, "unhealthyConditions": [ { "duration": "300s", "status": "False", "type": "Ready" }, { "duration": "300s", "status": "Unknown", "type": "Ready" } ] } } ] capabilities: Basic Install categories: OpenShift Optional console.openshift.io/plugins: '["node-remediation-console-plugin"]' containerImage: registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d createdAt: "2025-09-01 18:07:56" description: Detect failed Nodes and trigger remediation with a remediation operator. 
features.operators.openshift.io/cnf: "false" features.operators.openshift.io/cni: "false" features.operators.openshift.io/csi: "false" features.operators.openshift.io/disconnected: "true" features.operators.openshift.io/fips-compliant: "true" features.operators.openshift.io/proxy-aware: "false" features.operators.openshift.io/tls-profiles: "false" features.operators.openshift.io/token-auth-aws: "false" features.operators.openshift.io/token-auth-azure: "false" features.operators.openshift.io/token-auth-gcp: "false" k8s.ovn.org/pod-networks: '{"default":{"ip_addresses":["10.129.0.91/23"],"mac_address":"0a:58:0a:81:00:5b","gateway_ips":["10.129.0.1"],"routes":[{"dest":"10.128.0.0/14","nextHop":"10.129.0.1"},{"dest":"172.30.0.0/16","nextHop":"10.129.0.1"},{"dest":"169.254.0.5/32","nextHop":"10.129.0.1"},{"dest":"100.64.0.0/16","nextHop":"10.129.0.1"}],"ip_address":"10.129.0.91/23","gateway_ip":"10.129.0.1","role":"primary"}}' k8s.v1.cni.cncf.io/network-status: |- [{ "name": "ovn-kubernetes", "interface": "eth0", "ips": [ "10.129.0.91" ], "mac": "0a:58:0a:81:00:5b", "default": true, "dns": {} }] kubectl.kubernetes.io/default-container: manager olm.operatorGroup: openshift-workload-availability-hjmlc olm.operatorNamespace: openshift-workload-availability olm.skipRange: '>=0.9.0 <0.10.0' olm.targetNamespaces: "" olmcahash: 873236d06f3853f2453b6e868f8a3cdfaa6495b9b3ea303a917a306c0ff415a9 openshift.io/scc: restricted-v2 operatorframework.io/properties: '{"properties":[{"type":"olm.gvk","value":{"group":"remediation.medik8s.io","kind":"NodeHealthCheck","version":"v1alpha1"}},{"type":"olm.package","value":{"packageName":"node-healthcheck-operator","version":"0.10.0"}}]}' operatorframework.io/suggested-namespace: openshift-workload-availability operatorframework.io/suggested-namespace-template: '{"kind":"Namespace","apiVersion":"v1","metadata":{"name":"openshift-workload-availability","annotations":{"openshift.io/node-selector":""}}}' operators.openshift.io/valid-subscription: '["OpenShift Kubernetes Engine", "OpenShift Container Platform", "OpenShift Platform Plus"]' operators.operatorframework.io/builder: operator-sdk-v1.33.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 repository: https://github.com/medik8s/node-healthcheck-operator seccomp.security.alpha.kubernetes.io/pod: runtime/default support: Red Hat creationTimestamp: "2025-09-11T19:56:11Z" generateName: node-healthcheck-controller-manager-56687f5d99- labels: app.kubernetes.io/component: controller-manager app.kubernetes.io/name: node-healthcheck-operator pod-template-hash: 56687f5d99 name: node-healthcheck-controller-manager-56687f5d99-978vw namespace: openshift-workload-availability ownerReferences: - apiVersion: apps/v1 blockOwnerDeletion: true controller: true kind: ReplicaSet name: node-healthcheck-controller-manager-56687f5d99 uid: 5cb729b2-4c5a-476b-b3aa-522a494f6003 resourceVersion: "428818" uid: 977b842c-142d-4e12-8200-3d38f2d0403d spec: affinity: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - preference: matchExpressions: - key: node-role.kubernetes.io/infra operator: Exists weight: 3 - preference: matchExpressions: - key: node-role.kubernetes.io/master operator: Exists weight: 1 - preference: matchExpressions: - key: node-role.kubernetes.io/control-plane operator: Exists weight: 1 containers: - args: - --secure-listen-address=0.0.0.0:8443 - --http2-disable - --upstream=http://127.0.0.1:8080/ - --logtostderr=true - --v=0 - --tls-cert-file=/etc/tls/private/tls.crt - 
--tls-private-key-file=/etc/tls/private/tls.key env: - name: OPERATOR_CONDITION_NAME value: node-healthcheck-operator.v0.10.0 image: registry.redhat.io/openshift4/ose-kube-rbac-proxy-rhel9@sha256:3d3333285fd6736d11ea830fb7fe1f2b8d3e304d682a876458e18eb1173f271d imagePullPolicy: IfNotPresent name: kube-rbac-proxy ports: - containerPort: 8443 name: https protocol: TCP resources: limits: cpu: 500m memory: 128Mi requests: cpu: 5m memory: 64Mi securityContext: allowPrivilegeEscalation: false capabilities: drop: - ALL runAsUser: 1000740000 terminationMessagePath: /dev/termination-log terminationMessagePolicy: File volumeMounts: - mountPath: /etc/tls/private name: tls-config readOnly: true - mountPath: /apiserver.local.config/certificates name: apiservice-cert - mountPath: /tmp/k8s-webhook-server/serving-certs name: webhook-cert - mountPath: /var/run/secrets/kubernetes.io/serviceaccount name: kube-api-access-965wf readOnly: true - args: - --health-probe-bind-address=:8081 - --metrics-bind-address=127.0.0.1:8080 - --leader-elect command: - /manager env: - name: DEPLOYMENT_NAMESPACE valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - name: OPERATOR_CONDITION_NAME value: node-healthcheck-operator.v0.10.0 image: registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d imagePullPolicy: IfNotPresent livenessProbe: failureThreshold: 3 httpGet: path: /healthz port: 8081 scheme: HTTP initialDelaySeconds: 15 periodSeconds: 20 successThreshold: 1 timeoutSeconds: 1 name: manager readinessProbe: failureThreshold: 3 httpGet: path: /readyz port: 8081 scheme: HTTP initialDelaySeconds: 5 periodSeconds: 10 successThreshold: 1 timeoutSeconds: 1 resources: requests: cpu: 100m memory: 20Mi securityContext: allowPrivilegeEscalation: false capabilities: drop: - ALL runAsUser: 1000740000 terminationMessagePath: /dev/termination-log terminationMessagePolicy: File volumeMounts: - mountPath: /apiserver.local.config/certificates name: apiservice-cert - mountPath: /tmp/k8s-webhook-server/serving-certs name: webhook-cert - mountPath: /var/run/secrets/kubernetes.io/serviceaccount name: kube-api-access-965wf readOnly: true dnsPolicy: ClusterFirst enableServiceLinks: true imagePullSecrets: - name: node-healthcheck-controller-manager-dockercfg-rkknp nodeName: master-0-2 preemptionPolicy: PreemptLowerPriority priority: 2000000000 priorityClassName: system-cluster-critical restartPolicy: Always schedulerName: default-scheduler securityContext: fsGroup: 1000740000 runAsNonRoot: true seLinuxOptions: level: s0:c27,c19 seccompProfile: type: RuntimeDefault serviceAccount: node-healthcheck-controller-manager serviceAccountName: node-healthcheck-controller-manager terminationGracePeriodSeconds: 10 tolerations: - effect: NoSchedule key: node-role.kubernetes.io/master operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/control-plane operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/infra operator: Exists - effect: NoExecute key: node-role.kubernetes.io/infra operator: Exists - effect: NoExecute key: node.kubernetes.io/not-ready operator: Exists tolerationSeconds: 300 - effect: NoExecute key: node.kubernetes.io/unreachable operator: Exists tolerationSeconds: 300 - effect: NoSchedule key: node.kubernetes.io/memory-pressure operator: Exists volumes: - name: tls-config secret: defaultMode: 420 secretName: node-healthcheck-tls - name: apiservice-cert secret: defaultMode: 420 items: - key: tls.crt path: 
apiserver.crt - key: tls.key path: apiserver.key secretName: node-healthcheck-controller-manager-service-cert - name: webhook-cert secret: defaultMode: 420 items: - key: tls.crt path: tls.crt - key: tls.key path: tls.key secretName: node-healthcheck-controller-manager-service-cert - name: kube-api-access-965wf projected: defaultMode: 420 sources: - serviceAccountToken: expirationSeconds: 3607 path: token - configMap: items: - key: ca.crt path: ca.crt name: kube-root-ca.crt - downwardAPI: items: - fieldRef: apiVersion: v1 fieldPath: metadata.namespace path: namespace - configMap: items: - key: service-ca.crt path: service-ca.crt name: openshift-service-ca.crt status: conditions: - lastProbeTime: null lastTransitionTime: "2025-09-11T19:56:12Z" status: "True" type: PodReadyToStartContainers - lastProbeTime: null lastTransitionTime: "2025-09-11T19:56:11Z" status: "True" type: Initialized - lastProbeTime: null lastTransitionTime: "2025-09-11T19:56:21Z" status: "True" type: Ready - lastProbeTime: null lastTransitionTime: "2025-09-11T19:56:21Z" status: "True" type: ContainersReady - lastProbeTime: null lastTransitionTime: "2025-09-11T19:56:11Z" status: "True" type: PodScheduled containerStatuses: - containerID: cri-o://55cbcc09d13718123e0e5ff9c0f4bfd9c06638163ca7317f1a858b3aabc08c86 image: registry.redhat.io/openshift4/ose-kube-rbac-proxy-rhel9@sha256:3d3333285fd6736d11ea830fb7fe1f2b8d3e304d682a876458e18eb1173f271d imageID: registry.redhat.io/openshift4/ose-kube-rbac-proxy-rhel9@sha256:3d3333285fd6736d11ea830fb7fe1f2b8d3e304d682a876458e18eb1173f271d lastState: {} name: kube-rbac-proxy ready: true restartCount: 0 started: true state: running: startedAt: "2025-09-11T19:56:11Z" volumeMounts: - mountPath: /etc/tls/private name: tls-config readOnly: true recursiveReadOnly: Disabled - mountPath: /apiserver.local.config/certificates name: apiservice-cert - mountPath: /tmp/k8s-webhook-server/serving-certs name: webhook-cert - mountPath: /var/run/secrets/kubernetes.io/serviceaccount name: kube-api-access-965wf readOnly: true recursiveReadOnly: Disabled - containerID: cri-o://7feef310c450465bce29810bf463b21f1dbb7a54efeac5d91bc64d7622367644 image: registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d imageID: registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d lastState: {} name: manager ready: true restartCount: 0 started: true state: running: startedAt: "2025-09-11T19:56:11Z" volumeMounts: - mountPath: /apiserver.local.config/certificates name: apiservice-cert - mountPath: /tmp/k8s-webhook-server/serving-certs name: webhook-cert - mountPath: /var/run/secrets/kubernetes.io/serviceaccount name: kube-api-access-965wf readOnly: true recursiveReadOnly: Disabled hostIP: 192.168.123.126 hostIPs: - ip: 192.168.123.126 phase: Running podIP: 10.129.0.91 podIPs: - ip: 10.129.0.91 qosClass: Burstable startTime: "2025-09-11T19:56:11Z"
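Note: the per-pod jq loop used at the top of this session can be collapsed into a single query. A minimal sketch, assuming the controller-manager pods keep the labels shown in the pod metadata above (app.kubernetes.io/component=controller-manager, app.kubernetes.io/name=node-healthcheck-operator):

# Show each NHC controller-manager pod together with the node it is scheduled on
oc get pods -n openshift-workload-availability \
  -l app.kubernetes.io/component=controller-manager,app.kubernetes.io/name=node-healthcheck-operator \
  -o custom-columns=POD:.metadata.name,NODE:.spec.nodeName

Using -o wide instead of custom-columns prints the NODE column alongside the usual status fields, and it sidesteps reusing a $PODS value that was captured before the pods were recreated.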
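Note: the CSV above defines two deployments (node-healthcheck-controller-manager with 2 replicas and node-healthcheck-node-remediation-console-plugin with 1) plus a validating webhook for NodeHealthCheck objects. A quick verification sketch; the grep pattern for the webhook object is an assumption, since OLM derives the actual configuration name from the generateName vnodehealthcheck.kb.io:

# Both deployments should report the replica counts declared in the CSV (2/2 and 1/1)
oc get deployments -n openshift-workload-availability \
  node-healthcheck-controller-manager node-healthcheck-node-remediation-console-plugin

# CSV phase should be Succeeded, matching the oc get csv output at the top of the session
oc get csv node-healthcheck-operator.v0.10.0 -n openshift-workload-availability \
  -o jsonpath='{.status.phase}{"\n"}'

# The ValidatingAdmissionWebhook from webhookdefinitions should be registered cluster-wide
oc get validatingwebhookconfigurations | grep nodehealthcheck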
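Note: the alm-examples annotation in the CSV is a ready-made starting point for a NodeHealthCheck CR. A sketch of that sample as YAML; the sample points the remediation template at the openshift-operators namespace, which would likely need to be changed to the install namespace used here (openshift-workload-availability), and the template name should be checked first, e.g. with oc get selfnoderemediationtemplate -A (resource name assumed from the kind):

apiVersion: remediation.medik8s.io/v1alpha1
kind: NodeHealthCheck
metadata:
  name: nodehealthcheck-sample
spec:
  minHealthy: "51%"                     # remediation is allowed only while at least 51% of the selected nodes are healthy
  selector:                             # worker nodes only; the CSV description discourages mixing control plane and workers in one CR
    matchExpressions:
    - key: node-role.kubernetes.io/worker
      operator: Exists
  remediationTemplate:
    apiVersion: self-node-remediation.medik8s.io/v1alpha1
    kind: SelfNodeRemediationTemplate
    name: self-node-remediation-automatic-strategy-template
    namespace: openshift-workload-availability   # alm-examples uses openshift-operators; adjust to the actual install namespace
  unhealthyConditions:                  # logical OR: either condition held for 300s marks the node unhealthy
  - type: Ready
    status: "False"
    duration: "300s"
  - type: Ready
    status: "Unknown"
    duration: "300s"

Applied with oc apply -f, the CR's status (phase, observedNodes, healthyNodes) can then be read back with oc get nodehealthcheck nodehealthcheck-sample -o yaml.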
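Note: the escalatingRemediations descriptors in the CSV describe an alternative to remediationTemplate: an ordered list of templates, tried lowest order first, each with a timeout of at least 60s; the two fields are mutually exclusive. A sketch under the assumption that FAR (installed per the oc get csv output) exposes a FenceAgentsRemediationTemplate; the FAR API group, kind, and template name are not shown in this output, so they are placeholders to verify before use:

apiVersion: remediation.medik8s.io/v1alpha1
kind: NodeHealthCheck
metadata:
  name: nodehealthcheck-escalating
spec:
  selector:
    matchExpressions:
    - key: node-role.kubernetes.io/worker
      operator: Exists
  escalatingRemediations:               # mutually exclusive with spec.remediationTemplate
  - order: 1                            # lower order runs first
    timeout: "300s"                     # how long NHC waits for the node to become healthy before escalating (minimum 60s)
    remediationTemplate:
      apiVersion: self-node-remediation.medik8s.io/v1alpha1
      kind: SelfNodeRemediationTemplate
      name: self-node-remediation-automatic-strategy-template
      namespace: openshift-workload-availability
  - order: 2
    timeout: "10m"                      # if the last tier also times out, the overall remediation is considered failed
    remediationTemplate:
      apiVersion: fence-agents-remediation.medik8s.io/v1alpha1   # assumed FAR group/version; confirm with oc get crd | grep fence
      kind: FenceAgentsRemediationTemplate
      name: fence-agents-remediation-sample-template             # hypothetical name; list the actual templates before applying
      namespace: openshift-workload-availability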