Uploaded image for project: 'OpenShift Bugs'
  1. OpenShift Bugs
  2. OCPBUGS-63192

PrometheusKubernetesListWatchFailures alert failed to trigger on Hypershift Hosted cluster of 4.20 having power worker nodes

XMLWordPrintable

    • Icon: Bug Bug
    • Resolution: Unresolved
    • Icon: Undefined Undefined
    • None
    • 4.20
    • Monitoring
    • None
    • Quality / Stability / Reliability
    • False
    • Hide

      None

      Show
      None
    • None
    • None
    • None
    • None
    • None
    • None
    • None
    • None
    • None
    • None
    • None
    • None
    • None
    • None

      Description of problem:

      The PrometheusKubernetesListWatchFailures alert fails to trigger on a 4.20 HyperShift hosted cluster with Power (ppc64le) worker nodes.

      Version-Release number of selected component (if applicable):

          

      How reproducible:

          

      Steps to Reproduce:

      1. create namespace ns1 and deploy example-app
      ******
      apiVersion: v1
      kind: Namespace
      metadata:
        name: ns1
      ******
      % oc apply -n ns1 -f -<<EOF
      apiVersion: apps/v1
      kind: Deployment
      metadata:
        labels:
          app: prometheus-example-app
        name: prometheus-example-app
      spec:
        replicas: 1
        selector:
          matchLabels:
            app: prometheus-example-app
        template:
          metadata:
            labels:
              app: prometheus-example-app
          spec:
            containers:
            - image: quay.io/openshifttest/prometheus-example-app@sha256:382dc349f82d730b834515e402b48a9c7e2965d0efbc42388bd254f424f6193e
              imagePullPolicy: IfNotPresent
              name: prometheus-example-app
      ---
      apiVersion: v1
      kind: Service
      metadata:
        labels:
          app: prometheus-example-app
        name: prometheus-example-app
      spec:
        ports:
        - port: 8080
          protocol: TCP
          targetPort: 8080
          name: web
        selector:
          app: prometheus-example-app
        type: ClusterIP
      ---
      apiVersion: monitoring.coreos.com/v1
      kind: ServiceMonitor
      metadata:
        name: prometheus-example-monitor
      spec:
        endpoints:
        - interval: 5s
          port: web
        selector:
          matchLabels:
            app: prometheus-example-app
      EOF
      
      2. Label namespace ns1 for cluster monitoring
      % oc label ns ns1 openshift.io/cluster-monitoring=true
      
      3. Wait a few minutes, then check whether the alert is firing
      % token=$(oc create token prometheus-k8s -n openshift-monitoring)
      % oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://thanos-querier.openshift-monitoring.svc:9091/api/v1/query?' --data-urlencode 'query=ALERTS{alertname="PrometheusKubernetesListWatchFailures"}' | jq

      Actual results:

      After waiting a few minutes, the alert is still not triggered:
      % oc -n ns1 get pod
      NAME                                      READY   STATUS    RESTARTS   AGE
      prometheus-example-app-7f45f56845-5v6ww   1/1     Running   0          33m
      % oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://thanos-querier.openshift-monitoring.svc:9091/api/v1/query?' --data-urlencode 'query=ALERTS{alertname="PrometheusKubernetesListWatchFailures"}' | jq
        % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                       Dload  Upload   Total   Spent    Left  Speed
      100   150  100    77  100    73    427    405 --:--:-- --:--:-- --:--:--   833
      {
        "status": "success",
        "data": {
          "resultType": "vector",
          "result": [],
          "analysis": {}
        }
      }

      Expected results:

      The alert should normally be triggered, for example:
      % oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://thanos-querier.openshift-monitoring.svc:9091/api/v1/query?' --data-urlencode 'query=ALERTS{alertname="PrometheusKubernetesListWatchFailures"}' | jq
        % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                       Dload  Upload   Total   Spent    Left  Speed
      100   927  100   854  100    73  23722   2027 --:--:-- --:--:-- --:--:-- 25750
      {
        "status": "success",
        "data": {
          "resultType": "vector",
          "result": [
            {
              "metric": {
                "__name__": "ALERTS",
                "alertname": "PrometheusKubernetesListWatchFailures",
                "alertstate": "firing",
                "container": "kube-rbac-proxy",
                "endpoint": "metrics",
                "instance": "10.128.2.177:9092",
                "job": "prometheus-k8s",
                "namespace": "openshift-monitoring",
                "pod": "prometheus-k8s-0",
                "prometheus": "openshift-monitoring/k8s",
                "service": "prometheus-k8s",
                "severity": "warning"
              },
              "value": [
                1760602812.948,
                "1"
              ]
            },

      Additional info:

      % oc get node
      NAME                                             STATUS   ROLES    AGE   VERSION
      worker-1.hypershift-420.qe-ppc64le.cis.ibm.net   Ready    worker   19d   v1.33.4
      worker-3.hypershift-420.qe-ppc64le.cis.ibm.net   Ready    worker   9d    v1.33.4    
      
      
      % oc -n openshift-monitoring get prometheusrules prometheus-k8s-prometheus-rules -ojsonpath='{.spec.groups[].rules[?(@.alert=="PrometheusKubernetesListWatchFailures")]}' |jq
      {
        "alert": "PrometheusKubernetesListWatchFailures",
        "annotations": {
          "description": "Kubernetes service discovery of Prometheus {{$labels.namespace}}/{{$labels.pod}} is experiencing {{ printf \"%.0f\" $value }} failures with LIST/WATCH requests to the Kubernetes API in the last 5 minutes.",
          "runbook_url": "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusKubernetesListWatchFailures.md",
          "summary": "Requests in Kubernetes SD are failing."
        },
        "expr": "increase(prometheus_sd_kubernetes_failures_total{job=~\"prometheus-k8s|prometheus-user-workload\"}[5m]) > 0\n",
        "for": "15m",
        "labels": {
          "severity": "warning"
        }
      }
      
      
      % oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://thanos-querier.openshift-monitoring.svc:9091/api/v1/query?' --data-urlencode 'query=prometheus_sd_kubernetes_failures_total{job=~"prometheus-k8s|prometheus-user-workload"}' | jq
        % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                       Dload  Upload   Total   Spent    Left  Speed
      100   884  100   779  100   105   4233    570 --:--:-- --:--:-- --:--:--  4804
      {
        "status": "success",
        "data": {
          "resultType": "vector",
          "result": [
            {
              "metric": {
                "__name__": "prometheus_sd_kubernetes_failures_total",
                "container": "kube-rbac-proxy",
                "endpoint": "metrics",
                "instance": "10.134.0.65:9092",
                "job": "prometheus-k8s",
                "namespace": "openshift-monitoring",
                "pod": "prometheus-k8s-0",
                "prometheus": "openshift-monitoring/k8s",
                "service": "prometheus-k8s"
              },
              "value": [
                1760603878.181,
                "0"
              ]
            },

              jfajersk@redhat.com Jan Fajerski
              tagao@redhat.com Tai Gao
              None
              None
              Junqi Zhao Junqi Zhao
              None
              Votes:
              0 Vote for this issue
              Watchers:
              2 Start watching this issue

                Created:
                Updated: