-
Bug
-
Resolution: Done
-
Undefined
-
netobserv-1.5-candidate
-
None
-
Quality / Stability / Reliability
-
False
-
-
None
-
Critical
-
None
-
None
-
NetObserv - Sprint 249
-
None
-
None
-
None
Description of problem:
The Cypress test for FlowRTT feature verification is failing on ppc64le because the panels in the Network Traffic UI are empty, with no flowlogs data. To verify the issue, I manually deployed the flowcollector with FlowRTT enabled and the RTT metrics added to "spec.processor.metrics.includeList" in the flowcollector CRD, following the steps from https://polarion.engineering.redhat.com/polarion/#/project/OSE/workitem?id=OCP-68246. When I checked the status of the eBPF pods, all of them were in the "CrashLoopBackOff" state and no data was showing up on the network-traffic UI page. #Cluster details: [root@rdr-noo-415-rc5-bastion-0 ~]# oc get clusterversion NAME VERSION AVAILABLE PROGRESSING SINCE STATUS version 4.15.0-rc.5 True False 39h Cluster version is 4.15.0-rc.5 [root@rdr-noo-415-rc5-bastion-0 ~]# oc version Client Version: 4.15.0-rc.5 Kustomize Version: v5.0.4-0.20230601165947-6ce0bf390ce3 Server Version: 4.15.0-rc.5 Kubernetes Version: v1.28.6+0fb4726 [root@rdr-noo-415-rc5-bastion-0 ~]# arch ppc64le [root@rdr-noo-415-rc5-bastion-0 ~]# #Operator status: [root@rdr-noo-415-rc5-bastion-0 ~]# oc get csv NAME DISPLAY VERSION REPLACES PHASE loki-operator.v5.8.3 Loki Operator 5.8.3 loki-operator.v5.8.2 Succeeded network-observability-operator.v1.5.0 Network Observability 1.5.0 network-observability-operator.v1.4.2 Succeeded #flowlogs and loki pods [root@rdr-noo-415-rc5-bastion-0 ~]# oc get po NAME READY STATUS RESTARTS AGE flowlogs-pipeline-2g756 1/1 Running 0 54m flowlogs-pipeline-7rcv6 1/1 Running 0 54m flowlogs-pipeline-qhwsf 1/1 Running 0 54m flowlogs-pipeline-wb7ql 1/1 Running 0 54m flowlogs-pipeline-zbb4m 1/1 Running 0 54m lokistack-compactor-0 1/1 Running 0 58m lokistack-distributor-77db85d4f-4bpmp 1/1 Running 0 58m lokistack-distributor-77db85d4f-hzvvs 1/1 Running 0 58m lokistack-gateway-5db4574d87-8qbqh 2/2 Running 0 58m lokistack-gateway-5db4574d87-m2x2f 2/2 Running 0 58m lokistack-index-gateway-0 1/1 Running 0 58m lokistack-index-gateway-1 1/1 Running 0 58m lokistack-ingester-0 1/1 
Running 0 58m lokistack-ingester-1 1/1 Running 0 57m lokistack-querier-7774f6f866-bn9ld 1/1 Running 0 58m lokistack-querier-7774f6f866-jwkpz 1/1 Running 0 58m lokistack-query-frontend-5bf5fd8796-4wrd7 1/1 Running 0 58m lokistack-query-frontend-5bf5fd8796-xghpg 1/1 Running 0 58m netobserv-plugin-7c678dfc6b-jxxgk 1/1 Running 0 54m [root@rdr-noo-415-rc5-bastion-0 ~]# #eBPF pods [root@rdr-noo-415-rc5-bastion-0 ~]# oc get po -n netobserv-privileged NAME READY STATUS RESTARTS AGE netobserv-ebpf-agent-4mjf9 0/1 CrashLoopBackOff 15 (3m9s ago) 55m netobserv-ebpf-agent-frmd2 0/1 CrashLoopBackOff 15 (3m6s ago) 55m netobserv-ebpf-agent-nrjrh 0/1 CrashLoopBackOff 15 (3m9s ago) 55m netobserv-ebpf-agent-pfbhv 0/1 CrashLoopBackOff 15 (3m48s ago) 55m netobserv-ebpf-agent-rrmxq 0/1 CrashLoopBackOff 15 (2m57s ago) 55m [root@rdr-noo-415-rc5-bastion-0 ~]# # eBPF pod logs [root@rdr-noo-415-rc5-bastion-0 ~]# oc logs netobserv-ebpf-agent-4mjf9 -n netobserv-privileged time="2024-02-14T10:44:09Z" level=info msg="starting NetObserv eBPF Agent" time="2024-02-14T10:44:09Z" level=info msg="initializing Flows agent" component=agent.Flows time="2024-02-14T10:44:09Z" level=fatal msg="can't instantiate NetObserv eBPF Agent" error="failed to attach the BPF program to tcpReceiveFentry: create raw tracepoint: not supported" [root@rdr-noo-415-rc5-bastion-0 ~]# Currently, this issue reproduces only on ppc64le, on OCP versions 4.12 to 4.15.
flowcollector CRD:
[root@rdr-noo-415-rc5-bastion-0 ~]# oc get flowcollector cluster -n netobserv -o yaml
apiVersion: flows.netobserv.io/v1beta2
kind: FlowCollector
metadata:
annotations:
flows.netobserv.io/flowcollectorlegacy-namespace: netobserv
flows.netobserv.io/flpparent-namespace: netobserv
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"flows.netobserv.io/v1beta2","kind":"FlowCollector","metadata":{"annotations":{},"name":"cluster"},"spec":{"agent":{"ebpf":{"cacheActiveTimeout":"5s","cacheMaxFlows":100000,"excludeInterfaces":["lo"],"features":["FlowRTT"],"imagePullPolicy":"IfNotPresent","kafkaBatchSize":10485760,"logLevel":"info","resources":{"limits":{"memory":"800Mi"},"requests":{"cpu":"100m","memory":"50Mi"}},"sampling":1},"ipfix":{"cacheActiveTimeout":"20s","cacheMaxFlows":400,"clusterNetworkOperator":{"namespace":"openshift-network-operator"},"forceSampleAll":false,"ovnKubernetes":{"containerName":"ovnkube-node","daemonSetName":"ovnkube-node","namespace":"ovn-kubernetes"},"sampling":400},"type":"eBPF"},"consolePlugin":{"advanced":{"port":9001,"register":true},"autoscaler":{"maxReplicas":3,"metrics":[{"resource":{"name":"cpu","target":{"averageUtilization":50,"type":"Utilization"}},"type":"Resource"}],"minReplicas":1,"status":"Disabled"},"imagePullPolicy":"IfNotPresent","logLevel":"debug","portNaming":{"enable":true,"portNames":{"3100":"loki"}},"quickFilters":[{"default":true,"filter":{"dst_namespace!":"openshift-,netobserv","src_namespace!":"openshift-,netobserv"},"name":"Applications"},{"filter":{"dst_namespace":"openshift-,netobserv","src_namespace":"openshift-,netobserv"},"name":"Infrastructure"},{"default":true,"filter":{"dst_kind":"Pod","src_kind":"Pod"},"name":"Pods network"},{"filter":{"dst_kind":"Service"},"name":"Services 
network"}],"replicas":1,"resources":{"limits":{"memory":"100Mi"},"requests":{"cpu":"100m","memory":"50Mi"}}},"deploymentModel":"Direct","exporters":[],"kafka":{"address":"kafka-cluster-kafka-bootstrap.netobserv","tls":{"caCert":{"certFile":"ca.crt","name":"kafka-cluster-cluster-ca-cert","type":"secret"},"enable":false,"insecureSkipVerify":false,"userCert":{"certFile":"user.crt","certKey":"user.key","name":"flp-kafka","type":"secret"}},"topic":"network-flows"},"loki":{"advanced":{"writeMaxBackoff":"5s","writeMinBackoff":"1s"},"enable":true,"lokiStack":{"name":"lokistack"},"maxRetries":2,"mode":"LokiStack","writeBatchSize":10485760,"writeBatchWait":"1s"},"namespace":"netobserv","processor":{"advanced":{"dropUnusedFields":true,"enableKubeProbes":true,"healthPort":8080,"port":2055},"imagePullPolicy":"IfNotPresent","kafkaConsumerAutoscaler":{"maxReplicas":0,"status":"Disabled"},"kafkaConsumerBatchSize":10485760,"kafkaConsumerQueueCapacity":1000,"kafkaConsumerReplicas":3,"logLevel":"info","metrics":{"includeList":["node_ingress_bytes_total","workload_ingress_bytes_total","namespace_flows_total","node_rtt_seconds","namespace_rtt_seconds","workload_rtt_seconds"],"server":{"port":9102,"tls":{"type":"Disabled"}}},"resources":{"limits":{"memory":"800Mi"},"requests":{"cpu":"100m","memory":"100Mi"}}}}}
creationTimestamp: "2024-02-14T09:52:10Z"
finalizers:
- flows.netobserv.io/finalizer
generation: 4
name: cluster
resourceVersion: "952785"
uid: 5f6d3d13-d559-4ff2-86dd-4308adbc6e73
spec:
agent:
ebpf:
cacheActiveTimeout: 5s
cacheMaxFlows: 100000
excludeInterfaces:
- lo
features:
- FlowRTT
imagePullPolicy: IfNotPresent
kafkaBatchSize: 10485760
logLevel: info
resources:
limits:
memory: 800Mi
requests:
cpu: 100m
memory: 50Mi
sampling: 1
ipfix:
cacheActiveTimeout: 20s
cacheMaxFlows: 400
clusterNetworkOperator:
namespace: openshift-network-operator
ovnKubernetes:
containerName: ovnkube-node
daemonSetName: ovnkube-node
namespace: ovn-kubernetes
sampling: 400
type: eBPF
consolePlugin:
autoscaler:
maxReplicas: 3
metrics:
- resource:
name: cpu
target:
averageUtilization: 50
type: Utilization
type: Resource
minReplicas: 1
status: Disabled
enable: true
imagePullPolicy: IfNotPresent
logLevel: debug
portNaming:
enable: true
portNames:
"3100": loki
quickFilters:
- default: true
filter:
dst_namespace!: openshift-,netobserv
src_namespace!: openshift-,netobserv
name: Applications
- filter:
dst_namespace: openshift-,netobserv
src_namespace: openshift-,netobserv
name: Infrastructure
- default: true
filter:
dst_kind: Pod
src_kind: Pod
name: Pods network
- filter:
dst_kind: Service
name: Services network
replicas: 1
resources:
limits:
memory: 100Mi
requests:
cpu: 100m
memory: 50Mi
deploymentModel: Direct
exporters: []
kafka:
address: kafka-cluster-kafka-bootstrap.netobserv
sasl:
clientIDReference: {}
clientSecretReference: {}
type: Disabled
tls:
caCert:
certFile: ca.crt
name: kafka-cluster-cluster-ca-cert
type: secret
userCert:
certFile: user.crt
certKey: user.key
name: flp-kafka
type: secret
topic: network-flows
loki:
enable: true
lokiStack:
name: lokistack
manual:
authToken: Disabled
ingesterUrl: http://loki:3100/
querierUrl: http://loki:3100/
statusTls:
caCert: {}
userCert: {}
tenantID: netobserv
tls:
caCert: {}
userCert: {}
microservices:
ingesterUrl: http://loki-distributor:3100/
querierUrl: http://loki-query-frontend:3100/
tenantID: netobserv
tls:
caCert: {}
userCert: {}
mode: LokiStack
monolithic:
tenantID: netobserv
tls:
caCert: {}
userCert: {}
url: http://loki:3100/
readTimeout: 30s
writeBatchSize: 10485760
writeBatchWait: 1s
writeTimeout: 10s
namespace: netobserv
processor:
imagePullPolicy: IfNotPresent
kafkaConsumerAutoscaler:
maxReplicas: 0
status: Disabled
kafkaConsumerBatchSize: 10485760
kafkaConsumerQueueCapacity: 1000
kafkaConsumerReplicas: 3
logLevel: info
logTypes: Flows
metrics:
includeList:
- node_ingress_bytes_total
- workload_ingress_bytes_total
- namespace_flows_total
- node_rtt_seconds
- namespace_rtt_seconds
- workload_rtt_seconds
server:
port: 9102
tls:
type: Disabled
multiClusterDeployment: false
resources:
limits:
memory: 800Mi
requests:
cpu: 100m
memory: 100Mi
status:
conditions:
- lastTransitionTime: "2024-02-14T09:52:20Z"
message: 4 ready components, 0 with failure, 0 pending
reason: Ready
status: "True"
type: Ready
- lastTransitionTime: "2024-02-14T09:52:20Z"
message: ""
reason: Ready
status: "True"
type: FlowCollectorLegacyReady
- lastTransitionTime: "2024-02-14T09:52:11Z"
message: ""
reason: Ready
status: "True"
type: MonitoringReady
- lastTransitionTime: "2024-02-14T11:40:43Z"
message: ""
reason: Ready
status: "True"
type: FLPParentReady
- lastTransitionTime: "2024-02-14T09:52:13Z"
message: ""
reason: Ready
status: "True"
type: FLPMonolithReady
- lastTransitionTime: "2024-02-14T09:52:11Z"
message: Transformer only used with Kafka
reason: ComponentUnused
status: Unknown
type: FLPTransformOnlyReady
- lastTransitionTime: "2024-02-14T09:52:11Z"
message: Ingester only used with Kafka and without eBPF
reason: ComponentUnused
status: Unknown
type: FLPIngestOnlyReady
[root@rdr-noo-415-rc5-bastion-0 ~]#
Steps to Reproduce:
Install the netobserv operator v1.5.0 with the latest build and deploy the flowcollector with the following changes to enable the FlowRTT feature:
1. Enable FlowRTT for the eBPF agent by adding it to spec.agent.ebpf.features: - FlowRTT
2. Add the following items to the metrics list spec.processor.metrics.includeList: - node_rtt_seconds - namespace_rtt_seconds - workload_rtt_seconds
Actual results:
No data is shown in the metric panels on the network-traffic page; the eBPF agent pods are in the CrashLoopBackOff state.
Expected results:
The flowlogs data should be visible on the network-traffic page and in the FlowRTT panels.