Uploaded image for project: 'OpenShift Bugs'
  1. OpenShift Bugs
  2. OCPBUGS-4521

all kubelet targets are down after a few hours

    XMLWordPrintable

Details

    • Important
    • MON Sprint 228
    • 1
    • Approved
    • False
    • Hide

      None

      Show
      None

    Description

      Description of problem:

      This bug was found while verifying OCPBUGS-2873. The cluster was upgraded from 4.12.0-0.nightly-2022-12-04-160656 to 4.13.0-0.nightly-2022-12-04-194803 (done applying 4.13.0-0.nightly-2022-12-04-194803 at "2022-12-05T03:03:56Z", UTC time), and all targets were UP after the upgrade. However, since Dec 5, 2022, 20:47 (UTC time), the TargetDown alert for kubelet has been firing; see http://pastebin.test.redhat.com/1084049. All kubelet 10250 targets are down with "server returned HTTP status 401 Unauthorized". The kubelet targets include:

      10250/metrics/cadvisor
      10250/metrics
      10250/metrics/probes

      Version-Release number of selected component (if applicable):

      upgrade from 4.12.0-0.nightly-2022-12-04-160656 to 4.13.0-0.nightly-2022-12-04-194803
      it affects only 4.13.

      How reproducible:

      Not sure whether this is a regression from https://github.com/openshift/cluster-monitoring-operator/pull/1827 or an issue related to the upgrade.

      Steps to Reproduce:

      1. upgrade from 4.12.0-0.nightly-2022-12-04-160656 to 4.13.0-0.nightly-2022-12-04-194803 and check all targets' status
      2.
      3.
      

      Actual results:

      all kubelet targets are down

      Expected results:

      The kubelet targets should not be down.

      Additional info:

      This bug affects the admin UI, since some graphs use metrics exposed by the kubelet. The kubelet ServiceMonitor is shown below:

      # oc -n openshift-monitoring get servicemonitor kubelet -oyaml
      apiVersion: monitoring.coreos.com/v1
      kind: ServiceMonitor
      metadata:
        creationTimestamp: "2022-12-06T02:37:25Z"
        generation: 1
        labels:
          app.kubernetes.io/name: kubelet
          app.kubernetes.io/part-of: openshift-monitoring
          k8s-app: kubelet
        name: kubelet
        namespace: openshift-monitoring
        resourceVersion: "18888"
        uid: 85835270-7ceb-4db9-a51b-f645db0f7329
      spec:
        endpoints:
        - bearerTokenSecret:
            key: ""
          honorLabels: true
          interval: 30s
          metricRelabelings:
          - action: drop
            regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
            sourceLabels:
            - __name__
          - action: drop
            regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
            sourceLabels:
            - __name__
          - action: drop
            regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers)
            sourceLabels:
            - __name__
          - action: drop
            regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
            sourceLabels:
            - __name__
          - action: drop
            regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
            sourceLabels:
            - __name__
          - action: drop
            regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
            sourceLabels:
            - __name__
          - action: drop
            regex: transformation_(transformation_latencies_microseconds|failures_total)
            sourceLabels:
            - __name__
          - action: drop
            regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count)
            sourceLabels:
            - __name__
          port: https-metrics
          relabelings:
          - action: replace
            sourceLabels:
            - __metrics_path__
            targetLabel: metrics_path
          scheme: https
          scrapeTimeout: 30s
          tlsConfig:
            ca: {}
            caFile: /etc/prometheus/configmaps/kubelet-serving-ca-bundle/ca-bundle.crt
            cert: {}
            certFile: /etc/prometheus/secrets/metrics-client-certs/tls.crt
            keyFile: /etc/prometheus/secrets/metrics-client-certs/tls.key
        - bearerTokenSecret:
            key: ""
          honorLabels: true
          honorTimestamps: false
          interval: 30s
          metricRelabelings:
          - action: drop
            regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
            sourceLabels:
            - __name__
          - action: drop
            regex: (container_spec_.*|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);;
            sourceLabels:
            - __name__
            - pod
            - namespace
          - action: drop
            regex: (container_blkio_device_usage_total);.+
            sourceLabels:
            - __name__
            - container
          - action: drop
            regex: container_memory_failures_total
            sourceLabels:
            - __name__
          - action: replace
            regex: container_fs_usage_bytes
            replacement: "true"
            sourceLabels:
            - __name__
            targetLabel: __tmp_keep_metric
          - action: drop
            regex: ;(container_fs_.*);.+
            sourceLabels:
            - __tmp_keep_metric
            - __name__
            - container
          - action: labeldrop
            regex: __tmp_keep_metric
          path: /metrics/cadvisor
          port: https-metrics
          relabelings:
          - action: replace
            sourceLabels:
            - __metrics_path__
            targetLabel: metrics_path
          scheme: https
          scrapeTimeout: 30s
          tlsConfig:
            ca: {}
            caFile: /etc/prometheus/configmaps/kubelet-serving-ca-bundle/ca-bundle.crt
            cert: {}
            certFile: /etc/prometheus/secrets/metrics-client-certs/tls.crt
            keyFile: /etc/prometheus/secrets/metrics-client-certs/tls.key
        - bearerTokenSecret:
            key: ""
          honorLabels: true
          interval: 30s
          path: /metrics/probes
          port: https-metrics
          relabelings:
          - action: replace
            sourceLabels:
            - __metrics_path__
            targetLabel: metrics_path
          scheme: https
          scrapeTimeout: 30s
          tlsConfig:
            ca: {}
            caFile: /etc/prometheus/configmaps/kubelet-serving-ca-bundle/ca-bundle.crt
            cert: {}
            certFile: /etc/prometheus/secrets/metrics-client-certs/tls.crt
            keyFile: /etc/prometheus/secrets/metrics-client-certs/tls.key
        - bearerTokenSecret:
            key: ""
          interval: 30s
          port: https-metrics
          relabelings:
          - action: replace
            regex: (.+)(?::\d+)
            replacement: $1:9537
            sourceLabels:
            - __address__
            targetLabel: __address__
          - action: replace
            replacement: crio
            sourceLabels:
            - endpoint
            targetLabel: endpoint
          - action: replace
            replacement: crio
            targetLabel: job
        jobLabel: k8s-app
        namespaceSelector:
          matchNames:
          - kube-system
        selector:
          matchLabels:
            k8s-app: kubelet
      

       

      Attachments

        Issue Links

          Activity

            People

              team-mco Team MCO
              juzhao@redhat.com Junqi Zhao
              Sunil Choudhary Sunil Choudhary
              Votes:
              0 Vote for this issue
              Watchers:
              11 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: