OpenShift Logging / LOG-2294

[Vector] Vector internal metrics are not exposed via HTTPS, so the OpenShift Monitoring Prometheus service cannot scrape the metrics endpoint.


    • Logging (Core) - Sprint 216, Logging (Core) - Sprint 217

      Vector TP

      OCP Version: 4.10.0-0.nightly-2022-02-26-230022

      Description of the problem:

      Vector Collector metrics exported by the prometheus_exporter sink are bound to port 24231 and are only accessible via HTTP.

      [sinks.prometheus_output]
      type = "prometheus_exporter"
      inputs = ["internal_metrics"]
      address = "0.0.0.0:24231"
      default_namespace = "collector" 

      The OpenShift Monitoring Prometheus service expects the Vector Collector metrics endpoint to listen on HTTPS on port 24231 (port: metrics).

      oc get servicemonitors.monitoring.coreos.com collector -o yaml
      apiVersion: monitoring.coreos.com/v1
      kind: ServiceMonitor
      metadata:
        creationTimestamp: "2022-03-02T11:00:17Z"
        generation: 1
        name: collector
        namespace: openshift-logging
        ownerReferences:
        - apiVersion: logging.openshift.io/v1
          controller: true
          kind: ClusterLogging
          name: instance
          uid: a000e25d-d6dd-444e-843d-f827c2bc79cf
        resourceVersion: "141175"
        uid: 995e45dc-4ac9-4d6e-b857-0fb383ea6513
      spec:
        endpoints:
        - bearerTokenSecret:
            key: ""
          path: /metrics
          port: metrics
          scheme: https
          tlsConfig:
            ca: {}
            caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
            cert: {}
            serverName: collector.openshift-logging.svc
        - bearerTokenSecret:
            key: ""
          path: /metrics
          port: logfile-metrics
          scheme: https
          tlsConfig:
            ca: {}
            caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
            cert: {}
            serverName: collector.openshift-logging.svc
        jobLabel: monitor-collector
        namespaceSelector:
          matchNames:
          - openshift-logging
        selector:
          matchLabels:
            logging-infra: support 

      The Prometheus service is not able to scrape the metrics endpoint and the Collector target is shown as down.
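
      For context, the scrape goes through the collector Service, whose metrics port gets its serving certificate from the OpenShift service-ca operator (the ServiceMonitor above validates it against service-ca.crt). A minimal sketch of the expected Service shape is below; the annotation is the standard service-ca one, while the secret name and pod selector are illustrative, not taken from the cluster:

      apiVersion: v1
      kind: Service
      metadata:
        name: collector
        namespace: openshift-logging
        labels:
          logging-infra: support
        annotations:
          # service-ca operator generates a tls.crt/tls.key pair into this secret;
          # Prometheus trusts it via the service-ca.crt bundle referenced in caFile
          service.beta.openshift.io/serving-cert-secret-name: collector-metrics   # illustrative name
      spec:
        selector:
          logging-infra: collector   # illustrative pod selector
        ports:
        - name: metrics
          port: 24231
          targetPort: 24231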

      Steps to reproduce the issue:

      1 Deploy a ClusterLogging instance with Vector as the collector.

      apiVersion: "logging.openshift.io/v1"
      kind: "ClusterLogging"
      metadata:
        name: "instance" 
        namespace: "openshift-logging"
      spec:
        managementState: "Managed"  
        logStore:
          type: "elasticsearch"  
          retentionPolicy: 
            application:
              maxAge: 7d
            infra:
              maxAge: 7d
            audit:
              maxAge: 7d
          elasticsearch:
            nodeCount: 3 
            storage: {}
            resources: 
                requests:
                  memory: "1Gi"
            proxy: 
              resources:
                limits:
                  memory: 256Mi
                requests:
                  memory: 256Mi
            redundancyPolicy: "SingleRedundancy"
        visualization:
          type: "kibana"  
          kibana:
            replicas: 1
        collection:
          logs:
            type: "vector"  
            vector: {} 

      2 Check the Prometheus Monitoring targets in the OpenShift Console under Observe - Targets; the Collector metrics target is shown as down.

      3 Access the Collector metrics endpoint via HTTPS and via HTTP. It fails via HTTPS (curl exits with code 35, an SSL/TLS connect error) but is accessible via HTTP.

      $ oc exec collector-fmvf5 -c collector -- curl -k -H "Authorization: Bearer `oc sa get-token prometheus-k8s -n openshift-monitoring`" -s -H "Content-type: application/json" https://collector.openshift-logging.svc:24231/metrics
      command terminated with exit code 35
      
      $ oc exec collector-fmvf5 -c collector -- curl -k -H "Authorization: Bearer `oc sa get-token prometheus-k8s -n openshift-monitoring`" -s -H "Content-type: application/json" http://collector.openshift-logging.svc:24231/metrics
      # HELP vector_adaptive_concurrency_averaged_rtt adaptive_concurrency_averaged_rtt
      # TYPE vector_adaptive_concurrency_averaged_rtt histogram
      vector_adaptive_concurrency_averaged_rtt_bucket{component_kind="sink",component_name="default",component_type="elasticsearch",le="-inf"} 0 1646216619065
      vector_adaptive_concurrency_averaged_rtt_bucket{component_kind="sink",component_name="default",component_type="elasticsearch",le="0.015625"} 31 1646216619065
      vector_adaptive_concurrency_averaged_rtt_bucket{component_kind="sink",component_name="default",component_type="elasticsearch",le="0.03125"} 72 1646216619065
      vector_adaptive_concurrency_averaged_rtt_bucket{component_kind="sink",component_name="default",component_type="elasticsearch",le="0.0625"} 85 1646216619065
      vector_adaptive_concurrency_averaged_rtt_bucket{component_kind="sink",component_name="default",component_type="elasticsearch",le="0.125"} 86 1646216619065
      vector_adaptive_concurrency_averaged_rtt_bucket{component_kind="sink",component_name="default",component_type="elasticsearch",le="0.25"} 87 1646216619065
      vector_adaptive_concurrency_averaged_rtt_bucket{component_kind="sink",component_name="default",component_type="elasticsearch",le="0.5"} 87 1646216619065
      vector_adaptive_concurrency_averaged_rtt_bucket{component_kind="sink",component_name="default",component_type="elasticsearch",le="0"} 87 1646216619065 

      4 Extract and check the Vector config.

      # Logs from containers (including openshift containers)
      [sources.raw_container_logs]
      type = "kubernetes_logs"
      auto_partial_merge = true
      exclude_paths_glob_patterns = ["/var/log/pods/openshift-logging_collector-*/*/*.log", "/var/log/pods/openshift-logging_elasticsearch-*/*/*.log", "/var/log/pods/openshift-logging_kibana-*/*/*.log"]

      [sources.raw_journal_logs]
      type = "journald"

      [sources.internal_metrics]
      type = "internal_metrics"

      [transforms.container_logs]
      type = "remap"
      inputs = ["raw_container_logs"]
      source = '''
        level = "unknown"
        if match(.message,r'(Warning|WARN|W[0-9]+|level=warn|Value:warn|"level":"warn")'){
          level = "warn"
        } else if match(.message, r'Info|INFO|I[0-9]+|level=info|Value:info|"level":"info"'){
          level = "info"
        } else if match(.message, r'Error|ERROR|E[0-9]+|level=error|Value:error|"level":"error"'){
          level = "error"
        } else if match(.message, r'Debug|DEBUG|D[0-9]+|level=debug|Value:debug|"level":"debug"'){
          level = "debug"
        }
        .level = level
        .pipeline_metadata.collector.name = "vector"
        .pipeline_metadata.collector.version = "0.14.1"
        ip4, err = get_env_var("NODE_IPV4")
        .pipeline_metadata.collector.ipaddr4 = ip4
        received, err = format_timestamp(now(),"%+")
        .pipeline_metadata.collector.received_at = received
        .pipeline_metadata.collector.error = err
       '''
      [transforms.journal_logs]
      type = "remap"
      inputs = ["raw_journal_logs"]
      source = '''
        level = "unknown"
        if match(.message,r'(Warning|WARN|W[0-9]+|level=warn|Value:warn|"level":"warn")'){
          level = "warn"
        } else if match(.message, r'Info|INFO|I[0-9]+|level=info|Value:info|"level":"info"'){
          level = "info"
        } else if match(.message, r'Error|ERROR|E[0-9]+|level=error|Value:error|"level":"error"'){
          level = "error"
        } else if match(.message, r'Debug|DEBUG|D[0-9]+|level=debug|Value:debug|"level":"debug"'){
          level = "debug"
        }
        .level = level
        .pipeline_metadata.collector.name = "vector"
        .pipeline_metadata.collector.version = "0.14.1"
        ip4, err = get_env_var("NODE_IPV4")
        .pipeline_metadata.collector.ipaddr4 = ip4
        received, err = format_timestamp(now(),"%+")
        .pipeline_metadata.collector.received_at = received
        .pipeline_metadata.collector.error = err
       '''
      [transforms.route_container_logs]
      type = "route"
      inputs = ["container_logs"]
      route.app = '!((starts_with!(.kubernetes.pod_namespace,"kube")) || (starts_with!(.kubernetes.pod_namespace,"openshift")) || (.kubernetes.pod_namespace == "default"))'
      route.infra = '(starts_with!(.kubernetes.pod_namespace,"kube")) || (starts_with!(.kubernetes.pod_namespace,"openshift")) || (.kubernetes.pod_namespace == "default")'
      # Rename log stream to "application"
      [transforms.application]
      type = "remap"
      inputs = ["route_container_logs.app"]
      source = """
      .log_type = "application"
      """
      # Rename log stream to "infrastructure"
      [transforms.infrastructure]
      type = "remap"
      inputs = ["route_container_logs.infra","journal_logs"]
      source = """
      .log_type = "infrastructure"
      """
      [transforms.pipeline_0_]
      type = "remap"
      inputs = ["application","infrastructure"]
      source = """
      .
      """
      # Adding _id field
      [transforms.default_add_es_id]
      type = "remap"
      inputs = ["pipeline_0_"]
      source = """
      index = "default"
      if (.log_type == "application"){
        index = "app"
      }
      if (.log_type == "infrastructure"){
        index = "infra"
      }
      if (.log_type == "audit"){
        index = "audit"
      }
      ."write-index"=index+"-write"
      ._id = encode_base64(uuid_v4())
      """[transforms.default_dedot_and_flatten]
      type = "lua"
      inputs = ["default_add_es_id"]
      version = "2"
      hooks.process = "process"
      source = """
          function process(event, emit)
              if event.log.kubernetes == nil then
                  return
              end
              dedot(event.log.kubernetes.pod_labels)
              -- create "flat_labels" key
              event.log.kubernetes.flat_labels = {}
              i = 1
              -- flatten the labels
              for k,v in pairs(event.log.kubernetes.pod_labels) do
                event.log.kubernetes.flat_labels[i] = k.."="..v
                i=i+1
              end
              -- delete the "pod_labels" key
              event.log.kubernetes["pod_labels"] = nil
              emit(event)
          end

          function dedot(map)
              if map == nil then
                  return
              end
              local new_map = {}
              local changed_keys = {}
              for k, v in pairs(map) do
                  local dedotted = string.gsub(k, "%.", "_")
                  if dedotted ~= k then
                      new_map[dedotted] = v
                      changed_keys[k] = true
                  end
              end
              for k in pairs(changed_keys) do
                  map[k] = nil
              end
              for k, v in pairs(new_map) do
                  map[k] = v
              end
          end
      """[sinks.default]
      type = "elasticsearch"
      inputs = ["default_dedot_and_flatten"]
      endpoint = "https://elasticsearch.openshift-logging.svc:9200"
      index = "{{ write-index }}"
      request.timeout_secs = 2147483648
      bulk_action = "create"
      id_key = "_id"
      # TLS Config
      [sinks.default.tls]
      key_file = "/var/run/ocp-collector/secrets/collector/tls.key"
      crt_file = "/var/run/ocp-collector/secrets/collector/tls.crt"
      ca_file = "/var/run/ocp-collector/secrets/collector/ca-bundle.crt"
      [sinks.prometheus_output]
      type = "prometheus_exporter"
      inputs = ["internal_metrics"]
      address = "0.0.0.0:24231"
      default_namespace = "collector" 

      Additional notes:

      With Fluentd we bind the metrics endpoint via HTTPS:

      # Prometheus Monitoring
      <source>
        @type prometheus
        bind "#{ENV['POD_IP']}"
        <ssl>
          enable true
          certificate_path "#{ENV['METRICS_CERT'] || '/etc/fluent/metrics/tls.crt'}"
          private_key_path "#{ENV['METRICS_KEY'] || '/etc/fluent/metrics/tls.key'}"
        </ssl>
      </source>

      <source>
        @type prometheus_monitor
        <labels>
          hostname ${hostname}
        </labels>
      </source> 
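
      A Vector-side equivalent would be to terminate TLS in the prometheus_exporter sink itself, reusing the metrics serving cert. This is a sketch only: it assumes the shipped Vector build supports the sink-level tls options, and the mount paths for the key/cert are illustrative:

      [sinks.prometheus_output]
      type = "prometheus_exporter"
      inputs = ["internal_metrics"]
      address = "0.0.0.0:24231"
      default_namespace = "collector"

      # Assumption: tls support on prometheus_exporter is available in the shipped
      # Vector build; the key/crt paths below are illustrative mount points.
      [sinks.prometheus_output.tls]
      enabled = true
      key_file = "/etc/collector/metrics/tls.key"
      crt_file = "/etc/collector/metrics/tls.crt"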

       
