[kni@cert-rhosp-02 ~]$ oc get clusterversion
NAME      VERSION                               AVAILABLE   PROGRESSING   SINCE   STATUS
version   4.18.0-0.nightly-2025-09-03-101304    True        False         23h     Cluster version is 4.18.0-0.nightly-2025-09-03-101304
[kni@cert-rhosp-02 ~]$ oc get csv
NAME                                DISPLAY                               VERSION   REPLACES                           PHASE
fence-agents-remediation.v0.6.0     Fence Agents Remediation Operator     0.6.0     fence-agents-remediation.v0.5.1    Succeeded
node-healthcheck-operator.v0.10.0   Node Health Check Operator            0.10.0    node-healthcheck-operator.v0.9.1   Succeeded
self-node-remediation.v0.10.0       Self Node Remediation Operator        0.10.0    self-node-remediation.v0.9.0       Succeeded
[kni@cert-rhosp-02 ~]$ PODS=$(oc get pods -o name -n openshift-workload-availability | grep node-healthcheck-controller-manager)
[kni@cert-rhosp-02 ~]$ echo $PODS
pod/node-healthcheck-controller-manager-56687f5d99-978vw pod/node-healthcheck-controller-manager-56687f5d99-pj2st
[kni@cert-rhosp-02 ~]$ for p in $PODS; do
> echo "== $p"
> oc get "$p" -n openshift-workload-availability -o json | jq .spec.nodeName
> done
== pod/node-healthcheck-controller-manager-56687f5d99-978vw
"master-0-2"
== pod/node-healthcheck-controller-manager-56687f5d99-pj2st
"master-0-1"
[kni@cert-rhosp-02 ~]$ oc delete pod node-healthcheck-controller-manager-56687f5d99-qgg94; oc delete pod node-healthcheck-controller-manager-56687f5d99-bxhbb
pod "node-healthcheck-controller-manager-56687f5d99-qgg94" deleted
pod "node-healthcheck-controller-manager-56687f5d99-bxhbb" deleted
[kni@cert-rhosp-02 ~]$ PODS=$(oc get pods -o name -n openshift-workload-availability | grep node-healthcheck-controller-manager)
[kni@cert-rhosp-02 ~]$ for p in $PODS; do echo "== $p"; oc get "$p" -n openshift-workload-availability -o json | jq .spec.nodeName; done
== pod/node-healthcheck-controller-manager-56687f5d99-45pp8
"master-0-1"
== pod/node-healthcheck-controller-manager-56687f5d99-nb4hh
"master-0-2"
[kni@cert-rhosp-02 ~]$ oc get nodes -l 'node-role.kubernetes.io/master'
NAME         STATUS   ROLES                  AGE   VERSION
master-0-0   Ready    control-plane,master   24h   v1.31.11
master-0-1   Ready    control-plane,master   24h   v1.31.11
master-0-2   Ready    control-plane,master   24h   v1.31.11
[kni@cert-rhosp-02 ~]$ for p in $PODS; do echo "== $p"; oc get "$p" -n openshift-workload-availability -o json | jq .spec.nodeName; done
== pod/node-healthcheck-controller-manager-56687f5d99-hdv4b
"master-0-1"
== pod/node-healthcheck-controller-manager-56687f5d99-jgwkr
"master-0-0"
== pod/node-healthcheck-controller-manager-56687f5d99-nb4hh
"master-0-2"
[kni@cert-rhosp-02 ~]$ for p in $PODS; do echo "== $p"; oc get "$p" -n openshift-workload-availability -o json | jq .spec.nodeName; done
== pod/node-healthcheck-controller-manager-56687f5d99-jgwkr
"master-0-0"
== pod/node-healthcheck-controller-manager-56687f5d99-nb4hh
"master-0-2"
[kni@cert-rhosp-02 ~]$ oc get csv node-healthcheck-operator.v0.10.0 -o yaml
apiVersion: operators.coreos.com/v1alpha1 kind: ClusterServiceVersion metadata: annotations: alm-examples: |- [ { "apiVersion": "remediation.medik8s.io/v1alpha1", "kind": "NodeHealthCheck", "metadata": { "name": "nodehealthcheck-sample" }, "spec": { "minHealthy": "51%", "remediationTemplate": { "apiVersion": "self-node-remediation.medik8s.io/v1alpha1", "kind": "SelfNodeRemediationTemplate", "name": "self-node-remediation-automatic-strategy-template", "namespace": "openshift-operators" }, "selector": { "matchExpressions": [ { "key": "node-role.kubernetes.io/worker", "operator": "Exists" } ] }, "unhealthyConditions": [ { "duration": "300s", "status": "False", "type": "Ready" }, { "duration": "300s",
"status": "Unknown", "type": "Ready" } ] } } ] capabilities: Basic Install categories: OpenShift Optional console.openshift.io/plugins: '["node-remediation-console-plugin"]' containerImage: registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d createdAt: "2025-09-01 18:07:56" description: Detect failed Nodes and trigger remediation with a remediation operator. features.operators.openshift.io/cnf: "false" features.operators.openshift.io/cni: "false" features.operators.openshift.io/csi: "false" features.operators.openshift.io/disconnected: "true" features.operators.openshift.io/fips-compliant: "true" features.operators.openshift.io/proxy-aware: "false" features.operators.openshift.io/tls-profiles: "false" features.operators.openshift.io/token-auth-aws: "false" features.operators.openshift.io/token-auth-azure: "false" features.operators.openshift.io/token-auth-gcp: "false" olm.operatorGroup: openshift-workload-availability-hjmlc olm.operatorNamespace: openshift-workload-availability olm.skipRange: '>=0.9.0 <0.10.0' operatorframework.io/properties: '{"properties":[{"type":"olm.gvk","value":{"group":"remediation.medik8s.io","kind":"NodeHealthCheck","version":"v1alpha1"}},{"type":"olm.package","value":{"packageName":"node-healthcheck-operator","version":"0.10.0"}}]}' operatorframework.io/suggested-namespace: openshift-workload-availability operatorframework.io/suggested-namespace-template: '{"kind":"Namespace","apiVersion":"v1","metadata":{"name":"openshift-workload-availability","annotations":{"openshift.io/node-selector":""}}}' operators.openshift.io/valid-subscription: '["OpenShift Kubernetes Engine", "OpenShift Container Platform", "OpenShift Platform Plus"]' operators.operatorframework.io/builder: operator-sdk-v1.33.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 repository: https://github.com/medik8s/node-healthcheck-operator support: Red Hat creationTimestamp: "2025-09-11T19:56:09Z" generation: 1 labels: olm.copiedFrom: openshift-workload-availability olm.managed: "true" name: node-healthcheck-operator.v0.10.0 namespace: openshift-machine-api resourceVersion: "428905" uid: 84e911b3-84b6-450f-8087-8bfae1058be2 spec: apiservicedefinitions: {} cleanup: enabled: false customresourcedefinitions: owned: - description: NodeHealthCheck is the Schema for the nodehealthchecks API displayName: Node Health Check kind: NodeHealthCheck name: nodehealthchecks.remediation.medik8s.io resources: - kind: NodeHealthCheck name: nodehealthchecks version: v1alpha1 specDescriptors: - description: "EscalatingRemediations contain a list of ordered remediation templates with a timeout. The remediation templates will be used one after another, until the unhealthy node gets healthy within the timeout of the currently processed remediation. The order of remediation is defined by the \"order\" field of each \"escalatingRemediation\". \n Mutually exclusive with RemediationTemplate" displayName: Escalating Remediations path: escalatingRemediations - description: Order defines the order for this remediation. Remediations with lower order will be used before remediations with higher order. Remediations must not have the same order. displayName: Order path: escalatingRemediations[0].order - description: "RemediationTemplate is a reference to a remediation template provided by a remediation provider. 
\n If a node needs remediation the controller will create an object from this template and then it should be picked up by a remediation provider." displayName: Remediation Template path: escalatingRemediations[0].remediationTemplate - description: "Timeout defines how long NHC will wait for the node getting healthy before the next remediation (if any) will be used. When the last remediation times out, the overall remediation is considered as failed. As a safeguard for preventing parallel remediations, a minimum of 60s is enforced. \n Expects a string of decimal numbers each with optional fraction and a unit suffix, eg \"300ms\", \"1.5h\" or \"2h45m\". Valid time units are \"ns\", \"us\" (or \"µs\"), \"ms\", \"s\", \"m\", \"h\"." displayName: Timeout path: escalatingRemediations[0].timeout - description: HealthyDelay is the time before NHC would allow a node to be healthy again. A negative value means that NHC will never consider the node healthy and a manual intervention is expected displayName: Healthy Delay path: healthyDelay - description: Remediation is allowed if no more than "MaxUnhealthy" nodes selected by "selector" are not healthy. Expects either a non-negative integer value or a percentage value. Percentage values must be positive whole numbers and are capped at 100%. 0% is valid and will block all remediation. MaxUnhealthy should not be used with remediators that delete nodes (e.g. MachineDeletionRemediation), as this breaks the logic for counting healthy and unhealthy nodes. MinHealthy and MaxUnhealthy are configuring the same aspect, and they cannot be used at the same time. displayName: Max Unhealthy path: maxUnhealthy - description: Remediation is allowed if at least "MinHealthy" nodes selected by "selector" are healthy. Expects either a non-negative integer value or a percentage value. Percentage values must be positive whole numbers and are capped at 100%. 100% is valid and will block all remediation. MinHealthy and MaxUnhealthy are configuring the same aspect, and they cannot be used at the same time. displayName: Min Healthy path: minHealthy - description: 'PauseRequests will prevent any new remediation to start, while in-flight remediations keep running. Each entry is free form, and ideally represents the requested party reason for this pausing - i.e: "imaginary-cluster-upgrade-manager-operator"' displayName: Pause Requests path: pauseRequests - description: "RemediationTemplate is a reference to a remediation template provided by an infrastructure provider. \n If a node needs remediation the controller will create an object from this template and then it should be picked up by a remediation provider. \n Mutually exclusive with EscalatingRemediations" displayName: Remediation Template path: remediationTemplate - description: "Label selector to match nodes whose health will be exercised. \n Selecting both control-plane and worker nodes in one NHC CR is highly discouraged and can result in undesired behaviour. \n Note: mandatory now for above reason, but for backwards compatibility existing CRs will continue to work with an empty selector, which matches all nodes." displayName: Selector path: selector - description: UnhealthyConditions contains a list of the conditions that determine whether a node is considered unhealthy. The conditions are combined in a logical OR, i.e. if any of the conditions is met, the node is unhealthy. 
displayName: Unhealthy Conditions path: unhealthyConditions - description: "Duration of the condition specified when a node is considered unhealthy. \n Expects a string of decimal numbers each with optional fraction and a unit suffix, eg \"300ms\", \"1.5h\" or \"2h45m\". Valid time units are \"ns\", \"us\" (or \"µs\"), \"ms\", \"s\", \"m\", \"h\"." displayName: Duration path: unhealthyConditions[0].duration - description: The condition status in the node's status to watch for. Typically False, True or Unknown. displayName: Status path: unhealthyConditions[0].status - description: The condition type in the node's status to watch for. displayName: Type path: unhealthyConditions[0].type statusDescriptors: - description: 'Represents the observations of a NodeHealthCheck''s current state. Known .status.conditions.type are: "Disabled"' displayName: Conditions path: conditions x-descriptors: - urn:alm:descriptor:io.kubernetes.conditions - description: HealthyNodes specified the number of healthy nodes observed displayName: Healthy Nodes path: healthyNodes - description: InFlightRemediations records the timestamp when remediation triggered per node. Deprecated in favour of UnhealthyNodes. displayName: In Flight Remediations path: inFlightRemediations - description: LastUpdateTime is the last time the status was updated. displayName: Last Update Time path: lastUpdateTime - description: ObservedNodes specified the number of nodes observed by using the NHC spec.selector displayName: Observed Nodes path: observedNodes - description: Phase represents the current phase of this Config. Known phases are Disabled, Paused, Remediating and Enabled, based on:\n - the status of the Disabled condition\n - the value of PauseRequests\n - the value of InFlightRemediations displayName: Phase path: phase x-descriptors: - urn:alm:descriptor:io.kubernetes.phase - description: Reason explains the current phase in more detail. displayName: Reason path: reason x-descriptors: - urn:alm:descriptor:io.kubernetes.phase:reason - description: UnhealthyNodes tracks currently unhealthy nodes and their remediations. displayName: Unhealthy Nodes path: unhealthyNodes - description: ConditionsHealthyTimestamp is RFC 3339 date and time at which the unhealthy conditions didn't match anymore. The remediation CR will be deleted at that time, but the node will still be tracked as unhealthy until all remediation CRs are actually deleted, when remediators finished cleanup and removed their finalizers. displayName: Conditions Healthy Timestamp path: unhealthyNodes[0].conditionsHealthyTimestamp - description: HealthyDelayed notes whether a node should be considered healthy, but isn't due to NodeHealthCheckSpec.HealthyDelay configuration. 
displayName: Healthy Delayed path: unhealthyNodes[0].healthyDelayed - description: Name is the name of the unhealthy node displayName: Name path: unhealthyNodes[0].name - description: Remediations tracks the remediations created for this node displayName: Remediations path: unhealthyNodes[0].remediations - description: Resource is the reference to the remediation CR which was created displayName: Resource path: unhealthyNodes[0].remediations[0].resource - description: Started is the creation time of the remediation CR displayName: Started path: unhealthyNodes[0].remediations[0].started - description: TemplateName is required when using several templates of the same kind displayName: Template Name path: unhealthyNodes[0].remediations[0].templateName - description: TimedOut is the time when the remediation timed out. Applicable for escalating remediations only. displayName: Timed Out path: unhealthyNodes[0].remediations[0].timedOut version: v1alpha1 description: | ### Introduction Hardware is imperfect, and software contains bugs. When node level failures such as kernel hangs or dead NICs occur, the work required from the cluster does not decrease - workloads from affected nodes need to be restarted somewhere. However some workloads, such as RWO volumes and StatefulSets, may require at-most-one semantics. Failures affecting these kind of workloads risk data loss and/or corruption if nodes (and the workloads running on them) are assumed to be dead whenever we stop hearing from them. For this reason it is important to know that the node has reached a safe state before initiating recovery of the workload. Unfortunately it is not always practical to require admin intervention in order to confirm the node’s true status. In order to automate the recovery of exclusive workloads, we provide operators for failure detection and remediation. ### Failure detection: Node Health Check operator The “Node Health Check” (NHC) operator checks each Node’s set of NodeConditions (eg. NotReady) against the criteria and thresholds defined in NodeHealthCheck configuration. If the Node is deemed to be in a failed state, NHC will initiate recovery by using the SIG Cluster API's “External Remediation” API to instantiate the configured remediation template which specifies the mechanism/controller to be used. ### Failure handling: External remediators There are multiple remediators for handling node failure that we recommend: - Self Node Remediation (SNR) - Fence Agents Remediation (FAR) - Machine Deletion Remediation (MDR) #### Self Node Remediation (SNR) SNR uses watchdog timers and heuristics to ensure nodes enter a safe state (no longer hosting workloads) within a known and finite period of time, before signaling to the system that all Pods on the failed Node are no longer active and can be relocated elsewhere. In the case of transient errors, the watchdog’s actions will also result in the node rebooting and rejoining the cluster - restoring capacity. #### Fence Agents Remediation (FAR) FAR uses well-known agents to fence unhealthy nodes, and eventually FAR remediates the nodes. The remediation includes rebooting the unhealthy node using a fence agent, and then evicting workloads from the unhealthy node. #### Machine Deletion Remediation (MDR) MDR is limited to OpenShift, and it uses Machine API for reprovisioning unhealthy nodes by deleting their machines. displayName: Node Health Check Operator icon: - base64data: iVBORw0KGgoAAAANSUhEUgA ... ... 
...CYII= mediatype: image/png install: spec: clusterPermissions: - rules: - apiGroups: - "" resources: - namespaces verbs: - create - get - apiGroups: - "" resources: - nodes - pods verbs: - get - list - watch - apiGroups: - apps resources: - deployments verbs: - get - list - watch - apiGroups: - config.openshift.io resources: - clusterversions - featuregates - infrastructures verbs: - get - list - watch - apiGroups: - console.openshift.io resources: - consoleplugins verbs: - create - delete - get - list - patch - update - watch - apiGroups: - coordination.k8s.io resources: - leases verbs: - create - delete - get - list - patch - update - watch - apiGroups: - machine.openshift.io resources: - machinehealthchecks verbs: - get - list - patch - update - watch - apiGroups: - machine.openshift.io resources: - machinehealthchecks/status verbs: - get - patch - update - apiGroups: - machine.openshift.io resources: - machines verbs: - get - list - watch - apiGroups: - policy resources: - poddisruptionbudgets verbs: - get - list - watch - apiGroups: - rbac.authorization.k8s.io resources: - clusterrolebindings - clusterroles verbs: - '*' - apiGroups: - remediation.medik8s.io resources: - nodehealthchecks verbs: - create - delete - get - list - patch - update - watch - apiGroups: - remediation.medik8s.io resources: - nodehealthchecks/finalizers verbs: - update - apiGroups: - remediation.medik8s.io resources: - nodehealthchecks/status verbs: - get - patch - update - apiGroups: - authentication.k8s.io resources: - tokenreviews verbs: - create - apiGroups: - authorization.k8s.io resources: - subjectaccessreviews verbs: - create serviceAccountName: node-healthcheck-controller-manager deployments: - label: app.kubernetes.io/component: controller-manager app.kubernetes.io/name: node-healthcheck-operator name: node-healthcheck-controller-manager spec: replicas: 2 selector: matchLabels: app.kubernetes.io/component: controller-manager app.kubernetes.io/name: node-healthcheck-operator strategy: {} template: metadata: annotations: kubectl.kubernetes.io/default-container: manager creationTimestamp: null labels: app.kubernetes.io/component: controller-manager app.kubernetes.io/name: node-healthcheck-operator spec: affinity: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - preference: matchExpressions: - key: node-role.kubernetes.io/infra operator: Exists weight: 3 - preference: matchExpressions: - key: node-role.kubernetes.io/master operator: Exists weight: 1 - preference: matchExpressions: - key: node-role.kubernetes.io/control-plane operator: Exists weight: 1 containers: - args: - --secure-listen-address=0.0.0.0:8443 - --http2-disable - --upstream=http://127.0.0.1:8080/ - --logtostderr=true - --v=0 - --tls-cert-file=/etc/tls/private/tls.crt - --tls-private-key-file=/etc/tls/private/tls.key image: registry.redhat.io/openshift4/ose-kube-rbac-proxy-rhel9@sha256:3d3333285fd6736d11ea830fb7fe1f2b8d3e304d682a876458e18eb1173f271d name: kube-rbac-proxy ports: - containerPort: 8443 name: https protocol: TCP resources: limits: cpu: 500m memory: 128Mi requests: cpu: 5m memory: 64Mi securityContext: allowPrivilegeEscalation: false capabilities: drop: - ALL volumeMounts: - mountPath: /etc/tls/private name: tls-config readOnly: true - args: - --health-probe-bind-address=:8081 - --metrics-bind-address=127.0.0.1:8080 - --leader-elect command: - /manager env: - name: DEPLOYMENT_NAMESPACE valueFrom: fieldRef: fieldPath: metadata.namespace image: 
registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d livenessProbe: httpGet: path: /healthz port: 8081 initialDelaySeconds: 15 periodSeconds: 20 name: manager readinessProbe: httpGet: path: /readyz port: 8081 initialDelaySeconds: 5 periodSeconds: 10 resources: requests: cpu: 100m memory: 20Mi securityContext: allowPrivilegeEscalation: false capabilities: drop: - ALL priorityClassName: system-cluster-critical securityContext: runAsNonRoot: true seccompProfile: type: RuntimeDefault serviceAccountName: node-healthcheck-controller-manager terminationGracePeriodSeconds: 10 tolerations: - effect: NoSchedule key: node-role.kubernetes.io/master operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/control-plane operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/infra operator: Exists - effect: NoExecute key: node-role.kubernetes.io/infra operator: Exists volumes: - name: tls-config secret: secretName: node-healthcheck-tls - label: app.kubernetes.io/component: node-remediation-console-plugin app.kubernetes.io/name: node-healthcheck-operator name: node-healthcheck-node-remediation-console-plugin spec: replicas: 1 selector: matchLabels: app.kubernetes.io/component: node-remediation-console-plugin app.kubernetes.io/name: node-healthcheck-operator strategy: {} template: metadata: creationTimestamp: null labels: app.kubernetes.io/component: node-remediation-console-plugin app.kubernetes.io/name: node-healthcheck-operator spec: affinity: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - preference: matchExpressions: - key: node-role.kubernetes.io/infra operator: Exists weight: 3 - preference: matchExpressions: - key: node-role.kubernetes.io/master operator: Exists weight: 1 - preference: matchExpressions: - key: node-role.kubernetes.io/control-plane operator: Exists weight: 1 containers: - image: registry.redhat.io/workload-availability/node-remediation-console-rhel9@sha256:cc0f671b126cde10476f8cc2061c6903932e7bf5e3dbf4e8bdda9add70d5deea name: node-remediation-console-plugin ports: - containerPort: 9443 name: nrc-server protocol: TCP resources: requests: cpu: 10m memory: 50Mi securityContext: allowPrivilegeEscalation: false capabilities: drop: - ALL volumeMounts: - mountPath: /var/serving-cert name: nrc-plugin-cert readOnly: true securityContext: runAsNonRoot: true seccompProfile: type: RuntimeDefault tolerations: - effect: NoSchedule key: node-role.kubernetes.io/master operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/control-plane operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/infra operator: Exists - effect: NoExecute key: node-role.kubernetes.io/infra operator: Exists volumes: - name: nrc-plugin-cert secret: defaultMode: 420 secretName: nrc-plugin-cert permissions: - rules: - apiGroups: - "" resources: - configmaps verbs: - get - list - watch - create - update - patch - delete - apiGroups: - coordination.k8s.io resources: - leases verbs: - get - list - watch - create - update - patch - delete - apiGroups: - "" resources: - events verbs: - create - patch serviceAccountName: node-healthcheck-controller-manager strategy: deployment installModes: - supported: false type: OwnNamespace - supported: false type: SingleNamespace - supported: false type: MultiNamespace - supported: true type: AllNamespaces keywords: - NHC - Self Node Remediation - SNR - Remediation - Fencing - medik8s - k8s links: - name: Node Healthcheck Operator url: 
https://access.redhat.com/documentation/en-us/workload_availability_for_red_hat_openshift/25.8/html/remediation_fencing_and_maintenance - name: Source Code url: https://github.com/medik8s/node-healthcheck-operator maintainers: - email: team-dragonfly@redhat.com name: Dragonfly Team maturity: alpha minKubeVersion: 1.20.0 provider: name: Red Hat url: https://www.redhat.com relatedImages: - image: registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d name: manager - image: registry.redhat.io/openshift4/ose-kube-rbac-proxy-rhel9@sha256:3d3333285fd6736d11ea830fb7fe1f2b8d3e304d682a876458e18eb1173f271d name: kube-rbac-proxy replaces: node-healthcheck-operator.v0.9.1 version: 0.10.0 webhookdefinitions: - admissionReviewVersions: - v1 containerPort: 443 deploymentName: node-healthcheck-controller-manager failurePolicy: Fail generateName: vnodehealthcheck.kb.io rules: - apiGroups: - remediation.medik8s.io apiVersions: - v1alpha1 operations: - CREATE - UPDATE - DELETE resources: - nodehealthchecks sideEffects: None targetPort: 9443 type: ValidatingAdmissionWebhook webhookPath: /validate-remediation-medik8s-io-v1alpha1-nodehealthcheck status: certsLastUpdated: "2025-09-11T19:56:11Z" certsRotateAt: "2027-09-10T19:56:10Z" cleanup: {} conditions: - lastTransitionTime: "2025-09-11T19:56:07Z" lastUpdateTime: "2025-09-11T19:56:07Z" message: requirements not yet checked phase: Pending reason: RequirementsUnknown - lastTransitionTime: "2025-09-11T19:56:07Z" lastUpdateTime: "2025-09-11T19:56:07Z" message: one or more requirements couldn't be found phase: Pending reason: RequirementsNotMet - lastTransitionTime: "2025-09-11T19:56:10Z" lastUpdateTime: "2025-09-11T19:56:10Z" message: all requirements found, attempting install phase: InstallReady reason: AllRequirementsMet - lastTransitionTime: "2025-09-11T19:56:10Z" lastUpdateTime: "2025-09-11T19:56:10Z" message: waiting for install components to report healthy phase: Installing reason: InstallSucceeded - lastTransitionTime: "2025-09-11T19:56:10Z" lastUpdateTime: "2025-09-11T19:56:11Z" message: 'installing: waiting for deployment node-healthcheck-controller-manager to become ready: deployment "node-healthcheck-controller-manager" not available: Deployment does not have minimum availability.' 
phase: Installing reason: InstallWaiting - lastTransitionTime: "2025-09-11T19:56:21Z" lastUpdateTime: "2025-09-11T19:56:21Z" message: install strategy completed with no errors phase: Succeeded reason: InstallSucceeded lastTransitionTime: "2025-09-11T19:56:21Z" lastUpdateTime: "2025-09-11T19:56:21Z" message: The operator is running in openshift-workload-availability but is managing this namespace phase: Succeeded reason: Copied requirementStatus: - group: operators.coreos.com kind: ClusterServiceVersion message: CSV minKubeVersion (1.20.0) less than server version (v1.31.11) name: node-healthcheck-operator.v0.10.0 status: Present version: v1alpha1 - group: apiextensions.k8s.io kind: CustomResourceDefinition message: CRD is present and Established condition is true name: nodehealthchecks.remediation.medik8s.io status: Present uuid: 3d57988f-a83a-409c-b968-25d18e0c8b1f version: v1 - dependents: - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["create","get"],"apiGroups":[""],"resources":["namespaces"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","list","watch"],"apiGroups":[""],"resources":["nodes","pods"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","list","watch"],"apiGroups":["apps"],"resources":["deployments"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","list","watch"],"apiGroups":["config.openshift.io"],"resources":["clusterversions","featuregates","infrastructures"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["create","delete","get","list","patch","update","watch"],"apiGroups":["console.openshift.io"],"resources":["consoleplugins"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["create","delete","get","list","patch","update","watch"],"apiGroups":["coordination.k8s.io"],"resources":["leases"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","list","patch","update","watch"],"apiGroups":["machine.openshift.io"],"resources":["machinehealthchecks"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","patch","update"],"apiGroups":["machine.openshift.io"],"resources":["machinehealthchecks/status"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","list","watch"],"apiGroups":["machine.openshift.io"],"resources":["machines"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","list","watch"],"apiGroups":["policy"],"resources":["poddisruptionbudgets"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["*"],"apiGroups":["rbac.authorization.k8s.io"],"resources":["clusterrolebindings","clusterroles"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["create","delete","get","list","patch","update","watch"],"apiGroups":["remediation.medik8s.io"],"resources":["nodehealthchecks"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster 
rule:{"verbs":["update"],"apiGroups":["remediation.medik8s.io"],"resources":["nodehealthchecks/finalizers"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["get","patch","update"],"apiGroups":["remediation.medik8s.io"],"resources":["nodehealthchecks/status"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["create"],"apiGroups":["authentication.k8s.io"],"resources":["tokenreviews"]} status: Satisfied version: v1 - group: rbac.authorization.k8s.io kind: PolicyRule message: cluster rule:{"verbs":["create"],"apiGroups":["authorization.k8s.io"],"resources":["subjectaccessreviews"]} status: Satisfied version: v1 group: "" kind: ServiceAccount message: "" name: node-healthcheck-controller-manager status: Present version: v1 [kni@cert-rhosp-02 ~]$ oc get pods node-healthcheck-controller-manager-56687f5d99-978vw -o yaml apiVersion: v1 kind: Pod metadata: annotations: alm-examples: |- [ { "apiVersion": "remediation.medik8s.io/v1alpha1", "kind": "NodeHealthCheck", "metadata": { "name": "nodehealthcheck-sample" }, "spec": { "minHealthy": "51%", "remediationTemplate": { "apiVersion": "self-node-remediation.medik8s.io/v1alpha1", "kind": "SelfNodeRemediationTemplate", "name": "self-node-remediation-automatic-strategy-template", "namespace": "openshift-operators" }, "selector": { "matchExpressions": [ { "key": "node-role.kubernetes.io/worker", "operator": "Exists" } ] }, "unhealthyConditions": [ { "duration": "300s", "status": "False", "type": "Ready" }, { "duration": "300s", "status": "Unknown", "type": "Ready" } ] } } ] capabilities: Basic Install categories: OpenShift Optional console.openshift.io/plugins: '["node-remediation-console-plugin"]' containerImage: registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d createdAt: "2025-09-01 18:07:56" description: Detect failed Nodes and trigger remediation with a remediation operator. 
features.operators.openshift.io/cnf: "false" features.operators.openshift.io/cni: "false" features.operators.openshift.io/csi: "false" features.operators.openshift.io/disconnected: "true" features.operators.openshift.io/fips-compliant: "true" features.operators.openshift.io/proxy-aware: "false" features.operators.openshift.io/tls-profiles: "false" features.operators.openshift.io/token-auth-aws: "false" features.operators.openshift.io/token-auth-azure: "false" features.operators.openshift.io/token-auth-gcp: "false" k8s.ovn.org/pod-networks: '{"default":{"ip_addresses":["10.129.0.91/23"],"mac_address":"0a:58:0a:81:00:5b","gateway_ips":["10.129.0.1"],"routes":[{"dest":"10.128.0.0/14","nextHop":"10.129.0.1"},{"dest":"172.30.0.0/16","nextHop":"10.129.0.1"},{"dest":"169.254.0.5/32","nextHop":"10.129.0.1"},{"dest":"100.64.0.0/16","nextHop":"10.129.0.1"}],"ip_address":"10.129.0.91/23","gateway_ip":"10.129.0.1","role":"primary"}}' k8s.v1.cni.cncf.io/network-status: |- [{ "name": "ovn-kubernetes", "interface": "eth0", "ips": [ "10.129.0.91" ], "mac": "0a:58:0a:81:00:5b", "default": true, "dns": {} }] kubectl.kubernetes.io/default-container: manager olm.operatorGroup: openshift-workload-availability-hjmlc olm.operatorNamespace: openshift-workload-availability olm.skipRange: '>=0.9.0 <0.10.0' olm.targetNamespaces: "" olmcahash: 873236d06f3853f2453b6e868f8a3cdfaa6495b9b3ea303a917a306c0ff415a9 openshift.io/scc: restricted-v2 operatorframework.io/properties: '{"properties":[{"type":"olm.gvk","value":{"group":"remediation.medik8s.io","kind":"NodeHealthCheck","version":"v1alpha1"}},{"type":"olm.package","value":{"packageName":"node-healthcheck-operator","version":"0.10.0"}}]}' operatorframework.io/suggested-namespace: openshift-workload-availability operatorframework.io/suggested-namespace-template: '{"kind":"Namespace","apiVersion":"v1","metadata":{"name":"openshift-workload-availability","annotations":{"openshift.io/node-selector":""}}}' operators.openshift.io/valid-subscription: '["OpenShift Kubernetes Engine", "OpenShift Container Platform", "OpenShift Platform Plus"]' operators.operatorframework.io/builder: operator-sdk-v1.33.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 repository: https://github.com/medik8s/node-healthcheck-operator seccomp.security.alpha.kubernetes.io/pod: runtime/default support: Red Hat creationTimestamp: "2025-09-11T19:56:11Z" generateName: node-healthcheck-controller-manager-56687f5d99- labels: app.kubernetes.io/component: controller-manager app.kubernetes.io/name: node-healthcheck-operator pod-template-hash: 56687f5d99 name: node-healthcheck-controller-manager-56687f5d99-978vw namespace: openshift-workload-availability ownerReferences: - apiVersion: apps/v1 blockOwnerDeletion: true controller: true kind: ReplicaSet name: node-healthcheck-controller-manager-56687f5d99 uid: 5cb729b2-4c5a-476b-b3aa-522a494f6003 resourceVersion: "428818" uid: 977b842c-142d-4e12-8200-3d38f2d0403d spec: affinity: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - preference: matchExpressions: - key: node-role.kubernetes.io/infra operator: Exists weight: 3 - preference: matchExpressions: - key: node-role.kubernetes.io/master operator: Exists weight: 1 - preference: matchExpressions: - key: node-role.kubernetes.io/control-plane operator: Exists weight: 1 containers: - args: - --secure-listen-address=0.0.0.0:8443 - --http2-disable - --upstream=http://127.0.0.1:8080/ - --logtostderr=true - --v=0 - --tls-cert-file=/etc/tls/private/tls.crt - 
--tls-private-key-file=/etc/tls/private/tls.key env: - name: OPERATOR_CONDITION_NAME value: node-healthcheck-operator.v0.10.0 image: registry.redhat.io/openshift4/ose-kube-rbac-proxy-rhel9@sha256:3d3333285fd6736d11ea830fb7fe1f2b8d3e304d682a876458e18eb1173f271d imagePullPolicy: IfNotPresent name: kube-rbac-proxy ports: - containerPort: 8443 name: https protocol: TCP resources: limits: cpu: 500m memory: 128Mi requests: cpu: 5m memory: 64Mi securityContext: allowPrivilegeEscalation: false capabilities: drop: - ALL runAsUser: 1000740000 terminationMessagePath: /dev/termination-log terminationMessagePolicy: File volumeMounts: - mountPath: /etc/tls/private name: tls-config readOnly: true - mountPath: /apiserver.local.config/certificates name: apiservice-cert - mountPath: /tmp/k8s-webhook-server/serving-certs name: webhook-cert - mountPath: /var/run/secrets/kubernetes.io/serviceaccount name: kube-api-access-965wf readOnly: true - args: - --health-probe-bind-address=:8081 - --metrics-bind-address=127.0.0.1:8080 - --leader-elect command: - /manager env: - name: DEPLOYMENT_NAMESPACE valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - name: OPERATOR_CONDITION_NAME value: node-healthcheck-operator.v0.10.0 image: registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d imagePullPolicy: IfNotPresent livenessProbe: failureThreshold: 3 httpGet: path: /healthz port: 8081 scheme: HTTP initialDelaySeconds: 15 periodSeconds: 20 successThreshold: 1 timeoutSeconds: 1 name: manager readinessProbe: failureThreshold: 3 httpGet: path: /readyz port: 8081 scheme: HTTP initialDelaySeconds: 5 periodSeconds: 10 successThreshold: 1 timeoutSeconds: 1 resources: requests: cpu: 100m memory: 20Mi securityContext: allowPrivilegeEscalation: false capabilities: drop: - ALL runAsUser: 1000740000 terminationMessagePath: /dev/termination-log terminationMessagePolicy: File volumeMounts: - mountPath: /apiserver.local.config/certificates name: apiservice-cert - mountPath: /tmp/k8s-webhook-server/serving-certs name: webhook-cert - mountPath: /var/run/secrets/kubernetes.io/serviceaccount name: kube-api-access-965wf readOnly: true dnsPolicy: ClusterFirst enableServiceLinks: true imagePullSecrets: - name: node-healthcheck-controller-manager-dockercfg-rkknp nodeName: master-0-2 preemptionPolicy: PreemptLowerPriority priority: 2000000000 priorityClassName: system-cluster-critical restartPolicy: Always schedulerName: default-scheduler securityContext: fsGroup: 1000740000 runAsNonRoot: true seLinuxOptions: level: s0:c27,c19 seccompProfile: type: RuntimeDefault serviceAccount: node-healthcheck-controller-manager serviceAccountName: node-healthcheck-controller-manager terminationGracePeriodSeconds: 10 tolerations: - effect: NoSchedule key: node-role.kubernetes.io/master operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/control-plane operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/infra operator: Exists - effect: NoExecute key: node-role.kubernetes.io/infra operator: Exists - effect: NoExecute key: node.kubernetes.io/not-ready operator: Exists tolerationSeconds: 300 - effect: NoExecute key: node.kubernetes.io/unreachable operator: Exists tolerationSeconds: 300 - effect: NoSchedule key: node.kubernetes.io/memory-pressure operator: Exists volumes: - name: tls-config secret: defaultMode: 420 secretName: node-healthcheck-tls - name: apiservice-cert secret: defaultMode: 420 items: - key: tls.crt path: 
apiserver.crt - key: tls.key path: apiserver.key secretName: node-healthcheck-controller-manager-service-cert - name: webhook-cert secret: defaultMode: 420 items: - key: tls.crt path: tls.crt - key: tls.key path: tls.key secretName: node-healthcheck-controller-manager-service-cert - name: kube-api-access-965wf projected: defaultMode: 420 sources: - serviceAccountToken: expirationSeconds: 3607 path: token - configMap: items: - key: ca.crt path: ca.crt name: kube-root-ca.crt - downwardAPI: items: - fieldRef: apiVersion: v1 fieldPath: metadata.namespace path: namespace - configMap: items: - key: service-ca.crt path: service-ca.crt name: openshift-service-ca.crt status: conditions: - lastProbeTime: null lastTransitionTime: "2025-09-11T19:56:12Z" status: "True" type: PodReadyToStartContainers - lastProbeTime: null lastTransitionTime: "2025-09-11T19:56:11Z" status: "True" type: Initialized - lastProbeTime: null lastTransitionTime: "2025-09-11T19:56:21Z" status: "True" type: Ready - lastProbeTime: null lastTransitionTime: "2025-09-11T19:56:21Z" status: "True" type: ContainersReady - lastProbeTime: null lastTransitionTime: "2025-09-11T19:56:11Z" status: "True" type: PodScheduled containerStatuses: - containerID: cri-o://55cbcc09d13718123e0e5ff9c0f4bfd9c06638163ca7317f1a858b3aabc08c86 image: registry.redhat.io/openshift4/ose-kube-rbac-proxy-rhel9@sha256:3d3333285fd6736d11ea830fb7fe1f2b8d3e304d682a876458e18eb1173f271d imageID: registry.redhat.io/openshift4/ose-kube-rbac-proxy-rhel9@sha256:3d3333285fd6736d11ea830fb7fe1f2b8d3e304d682a876458e18eb1173f271d lastState: {} name: kube-rbac-proxy ready: true restartCount: 0 started: true state: running: startedAt: "2025-09-11T19:56:11Z" volumeMounts: - mountPath: /etc/tls/private name: tls-config readOnly: true recursiveReadOnly: Disabled - mountPath: /apiserver.local.config/certificates name: apiservice-cert - mountPath: /tmp/k8s-webhook-server/serving-certs name: webhook-cert - mountPath: /var/run/secrets/kubernetes.io/serviceaccount name: kube-api-access-965wf readOnly: true recursiveReadOnly: Disabled - containerID: cri-o://7feef310c450465bce29810bf463b21f1dbb7a54efeac5d91bc64d7622367644 image: registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d imageID: registry.redhat.io/workload-availability/node-healthcheck-rhel9-operator@sha256:2160f2688e5b098c0bae7597cebcd380019b82ae02f4506d07c2ed85f0b3664d lastState: {} name: manager ready: true restartCount: 0 started: true state: running: startedAt: "2025-09-11T19:56:11Z" volumeMounts: - mountPath: /apiserver.local.config/certificates name: apiservice-cert - mountPath: /tmp/k8s-webhook-server/serving-certs name: webhook-cert - mountPath: /var/run/secrets/kubernetes.io/serviceaccount name: kube-api-access-965wf readOnly: true recursiveReadOnly: Disabled hostIP: 192.168.123.126 hostIPs: - ip: 192.168.123.126 phase: Running podIP: 10.129.0.91 podIPs: - ip: 10.129.0.91 qosClass: Burstable startTime: "2025-09-11T19:56:11Z"
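Note: the per-pod jq loop used at the top of this session can be collapsed into a single query. A minimal sketch, assuming the controller-manager pods keep the labels shown in the pod metadata above (app.kubernetes.io/component=controller-manager, app.kubernetes.io/name=node-healthcheck-operator):

# Show each NHC controller-manager pod together with the node it is scheduled on
oc get pods -n openshift-workload-availability \
  -l app.kubernetes.io/component=controller-manager,app.kubernetes.io/name=node-healthcheck-operator \
  -o custom-columns=POD:.metadata.name,NODE:.spec.nodeName

Using -o wide instead of custom-columns prints the NODE column alongside the usual status fields, and it sidesteps reusing a $PODS value that was captured before the pods were recreated.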
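Note: the CSV above defines two deployments (node-healthcheck-controller-manager with 2 replicas and node-healthcheck-node-remediation-console-plugin with 1) plus a validating webhook for NodeHealthCheck objects. A quick verification sketch; the grep pattern for the webhook object is an assumption, since OLM derives the actual configuration name from the generateName vnodehealthcheck.kb.io:

# Both deployments should report the replica counts declared in the CSV (2/2 and 1/1)
oc get deployments -n openshift-workload-availability \
  node-healthcheck-controller-manager node-healthcheck-node-remediation-console-plugin

# CSV phase should be Succeeded, matching the oc get csv output at the top of the session
oc get csv node-healthcheck-operator.v0.10.0 -n openshift-workload-availability \
  -o jsonpath='{.status.phase}{"\n"}'

# The ValidatingAdmissionWebhook from webhookdefinitions should be registered cluster-wide
oc get validatingwebhookconfigurations | grep nodehealthcheck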
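Note: the alm-examples annotation in the CSV is a ready-made starting point for a NodeHealthCheck CR. A sketch of that sample as YAML; the sample points the remediation template at the openshift-operators namespace, which would likely need to be changed to the install namespace used here (openshift-workload-availability), and the template name should be checked first, e.g. with oc get selfnoderemediationtemplate -A (resource name assumed from the kind):

apiVersion: remediation.medik8s.io/v1alpha1
kind: NodeHealthCheck
metadata:
  name: nodehealthcheck-sample
spec:
  minHealthy: "51%"                     # remediation is allowed only while at least 51% of the selected nodes are healthy
  selector:                             # worker nodes only; the CSV description discourages mixing control plane and workers in one CR
    matchExpressions:
    - key: node-role.kubernetes.io/worker
      operator: Exists
  remediationTemplate:
    apiVersion: self-node-remediation.medik8s.io/v1alpha1
    kind: SelfNodeRemediationTemplate
    name: self-node-remediation-automatic-strategy-template
    namespace: openshift-workload-availability   # alm-examples uses openshift-operators; adjust to the actual install namespace
  unhealthyConditions:                  # logical OR: either condition held for 300s marks the node unhealthy
  - type: Ready
    status: "False"
    duration: "300s"
  - type: Ready
    status: "Unknown"
    duration: "300s"

Applied with oc apply -f, the CR's status (phase, observedNodes, healthyNodes) can then be read back with oc get nodehealthcheck nodehealthcheck-sample -o yaml.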
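Note: the escalatingRemediations descriptors in the CSV describe an alternative to remediationTemplate: an ordered list of templates, tried lowest order first, each with a timeout of at least 60s; the two fields are mutually exclusive. A sketch under the assumption that FAR (installed per the oc get csv output) exposes a FenceAgentsRemediationTemplate; the FAR API group, kind, and template name are not shown in this output, so they are placeholders to verify before use:

apiVersion: remediation.medik8s.io/v1alpha1
kind: NodeHealthCheck
metadata:
  name: nodehealthcheck-escalating
spec:
  selector:
    matchExpressions:
    - key: node-role.kubernetes.io/worker
      operator: Exists
  escalatingRemediations:               # mutually exclusive with spec.remediationTemplate
  - order: 1                            # lower order runs first
    timeout: "300s"                     # how long NHC waits for the node to become healthy before escalating (minimum 60s)
    remediationTemplate:
      apiVersion: self-node-remediation.medik8s.io/v1alpha1
      kind: SelfNodeRemediationTemplate
      name: self-node-remediation-automatic-strategy-template
      namespace: openshift-workload-availability
  - order: 2
    timeout: "10m"                      # if the last tier also times out, the overall remediation is considered failed
    remediationTemplate:
      apiVersion: fence-agents-remediation.medik8s.io/v1alpha1   # assumed FAR group/version; confirm with oc get crd | grep fence
      kind: FenceAgentsRemediationTemplate
      name: fence-agents-remediation-sample-template             # hypothetical name; list the actual templates before applying
      namespace: openshift-workload-availability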