Uploaded image for project: 'OpenShift Service Mesh'
  1. OpenShift Service Mesh
  2. OSSM-6809

2-5% of requests for bookinfo app fail with 500 Internal Server Error [SMCPv2.6]

XMLWordPrintable

    • Icon: Bug Bug
    • Resolution: Done
    • Icon: Critical Critical
    • OSSM 2.6.0
    • OSSM 2.6.0
    • Customer Impact, Envoy
    • None
    • False
    • None
    • False
    • Hide
      • Install latest 2.6.0 operator ( e.g. iib registry-proxy.engineering.redhat.com/rh-osbs/iib:763273 )
      • Create OSSM in the istio-system project with SMCP v2.6
        oc new-project istio-system
        oc apply -f - <<EOF
        apiVersion: maistra.io/v2
        kind: ServiceMeshControlPlane
        metadata:
          name: basic
        spec:
          policy:
            type: Istiod
          addons:
            grafana:
              enabled: true
            kiali:
              enabled: true
            prometheus:
              enabled: true
          telemetry:
            type: Istiod
          proxy:
            accessLogging:
              file:
                encoding: TEXT
                format: >
                  { 
                  "start_time": "%START_TIME%",
                  "method": "%REQ(:METHOD)%",
                  "path":"%REQ(X-ENVOY-ORIGINAL-PATH?:PATH)%",
                  "protocol": "%PROTOCOL%",
                  "response_code": "%RESPONSE_CODE%",
                  "response_flags": "%RESPONSE_FLAGS%",
                  "response_code_details": "%RESPONSE_CODE_DETAILS%",
                  "connection_termination_details": "%CONNECTION_TERMINATION_DETAILS%",
                  "downstream_transport_failure_reason": "%DOWNSTREAM_TRANSPORT_FAILURE_REASON%",
                  "upstream_transport_failure_reason": "%UPSTREAM_TRANSPORT_FAILURE_REASON%",
                  "duration": "%DURATION%",
                  "x_forwarded_for": "%REQ(X-FORWARDED-FOR)%", 
                  "user_agent": "%REQ(USER-AGENT)%",
                  "x_request_id": "%REQ(X-REQUEST-ID)%", 
                  "x_b3_traceid": "%REQ(X-B3-TRACEID)%", 
                  "x_b3_parentspanid": "%REQ(X-B3-PARENTSPANID)%",
                  "x_b3_spanid": "%REQ(X-B3-SPANID)%",
                  "x_b3_sampled": "%REQ(X-B3-SAMPLED)%",
                  "appd_singularityheader": "%REQ(SINGULARITYHEADER)%", 
                  "authority": "%REQ(:AUTHORITY)%",
                  "upstream_host": "%UPSTREAM_HOST%",
                  "upstream_cluster": "%UPSTREAM_CLUSTER%",
                  "upstream_local_address": "%UPSTREAM_LOCAL_ADDRESS%",
                  "downstream_local_address": "%DOWNSTREAM_LOCAL_ADDRESS%",
                  "downstream_remote_address": "%DOWNSTREAM_REMOTE_ADDRESS%",
                  "requested_server_name": "%REQUESTED_SERVER_NAME%", 
                  "route_name":"%ROUTE_NAME%",
                  "virtual_cluster_name": "%VIRTUAL_CLUSTER_NAME%",
                  "upstream_cluster": "%UPSTREAM_CLUSTER%",
                  "request_server_name": "%REQUESTED_SERVER_NAME%",
                  "content_type": "%RESP(CONTENT-TYPE)%",
                  "request_content_length": "%REQ(CONTENT-LENGTH)%", 
                  "response_content_length": "%RESP(CONTENT-LENGTH)%", 
                  "downstream_peer_uri_san": "%DOWNSTREAM_PEER_URI_SAN%" 
                  }
                name: /dev/stdout
          general:
            logging:
              componentLevels:
                all: debug
          version: v2.6
        EOF
        oc wait --for condition=Ready -n istio-system smcp/basic --timeout 300s
        
      • Install bookinfo app with traffic generator
        git clone --branch v1.73 https://github.com/kiali/kiali.git /tmp/kiali
        cd /tmp/kiali/hack/istio/
        ./download-istio.sh -iv 1.20.8
        ./install-bookinfo-demo.sh --traffic-generator
        
      • After that, check kiali traffic generator pod logs which will be showing how many requests failed with which status
        oc -n bookinfo logs -f $(oc get pod -n bookinfo -l app=kiali-traffic-generator -o custom-columns=NAME:.metadata.name --no-headers)
        
      • You can also look into the kiali that productpage app is in degradated state ( same as in the screenshot )
      Show
      Install latest 2.6.0 operator ( e.g. iib registry-proxy.engineering.redhat.com/rh-osbs/iib:763273 ) Create OSSM in the istio-system project with SMCP v2.6 oc new -project istio-system oc apply -f - <<EOF apiVersion: maistra.io/v2 kind: ServiceMeshControlPlane metadata: name: basic spec: policy: type: Istiod addons: grafana: enabled: true kiali: enabled: true prometheus: enabled: true telemetry: type: Istiod proxy: accessLogging: file: encoding: TEXT format: > { "start_time" : "%START_TIME%" , "method" : "%REQ(:METHOD)%" , "path" : "%REQ(X-ENVOY-ORIGINAL-PATH?:PATH)%" , "protocol" : "%PROTOCOL%" , "response_code" : "%RESPONSE_CODE%" , "response_flags" : "%RESPONSE_FLAGS%" , "response_code_details" : "%RESPONSE_CODE_DETAILS%" , "connection_termination_details" : "%CONNECTION_TERMINATION_DETAILS%" , "downstream_transport_failure_reason" : "%DOWNSTREAM_TRANSPORT_FAILURE_REASON%" , "upstream_transport_failure_reason" : "%UPSTREAM_TRANSPORT_FAILURE_REASON%" , "duration" : "%DURATION%" , "x_forwarded_for" : "%REQ(X-FORWARDED-FOR)%" , "user_agent" : "%REQ(USER-AGENT)%" , "x_request_id" : "%REQ(X-REQUEST-ID)%" , "x_b3_traceid" : "%REQ(X-B3-TRACEID)%" , "x_b3_parentspanid" : "%REQ(X-B3-PARENTSPANID)%" , "x_b3_spanid" : "%REQ(X-B3-SPANID)%" , "x_b3_sampled" : "%REQ(X-B3-SAMPLED)%" , "appd_singularityheader" : "%REQ(SINGULARITYHEADER)%" , "authority" : "%REQ(:AUTHORITY)%" , "upstream_host" : "%UPSTREAM_HOST%" , "upstream_cluster" : "%UPSTREAM_CLUSTER%" , "upstream_local_address" : "%UPSTREAM_LOCAL_ADDRESS%" , "downstream_local_address" : "%DOWNSTREAM_LOCAL_ADDRESS%" , "downstream_remote_address" : "%DOWNSTREAM_REMOTE_ADDRESS%" , "requested_server_name" : "%REQUESTED_SERVER_NAME%" , "route_name" : "%ROUTE_NAME%" , "virtual_cluster_name" : "%VIRTUAL_CLUSTER_NAME%" , "upstream_cluster" : "%UPSTREAM_CLUSTER%" , "request_server_name" : "%REQUESTED_SERVER_NAME%" , "content_type" : "%RESP(CONTENT-TYPE)%" , "request_content_length" : "%REQ(CONTENT-LENGTH)%" , "response_content_length" : "%RESP(CONTENT-LENGTH)%" , "downstream_peer_uri_san" : "%DOWNSTREAM_PEER_URI_SAN%" } name: /dev/stdout general: logging: componentLevels: all: debug version: v2.6 EOF oc wait -- for condition=Ready -n istio-system smcp/basic --timeout 300s Install bookinfo app with traffic generator git clone --branch v1.73 https: //github.com/kiali/kiali.git /tmp/kiali cd /tmp/kiali/hack/istio/ ./download-istio.sh -iv 1.20.8 ./install-bookinfo-demo.sh --traffic-generator After that, check kiali traffic generator pod logs which will be showing how many requests failed with which status oc -n bookinfo logs -f $(oc get pod -n bookinfo -l app=kiali-traffic-generator -o custom-columns=NAME:.metadata.name --no-headers) You can also look into the kiali that productpage app is in degradated state ( same as in the screenshot )

      I have OSSM 2.6 with bookinfo app (from istio 1.20.7) and kiali traffic generator. Kiali traffic generator is sending a one request per second to the productinfo application.

      With SMCP 2.5, there is not any problem and there is not any error responses. (so the Success ratio is 100%)

      However, with SMCP v2.6 (after the proxy update to v2.6), around 5% of request finish with `500 Internal Server Error`. So, potentially, there can be some bug in the 2.6 proxy.

      e.g. success ratio after 5677 requests:

      Requests [total, rate, throughput] 5677, 1.00, 0.98
      Duration [total, attack, wait] 1h34m36.021402398s, 1h34m35.999907282s, 21.495116ms
      Latencies [mean, 50, 95, 99, max] 52.914991ms, 30.119942ms, 47.099169ms, 1.018287559s, 1.026896435s
      Bytes In [total, mean] 31137432, 5484.84
      Bytes Out [total, mean] 0, 0.00
      Success [ratio] 97.80%
      Status Codes [code:count] 200:5552 500:125
      Error Set:
      500 Internal Server Error
      

      e.g. one failed request:
      on ingress proxy:

      {  "start_time": "2024-07-10T14:12:05.222Z", "method": "GET", "path":"/productpage", "protocol": "HTTP/1.1", "response_code": "500", "response_flags": "-", "response_code_details": "via_upstream", "connection_termination_details": "-", "downstream_transport_failure_reason": "-", "upstream_transport_failure_reason": "-", "bytes_received":"0", "bytes_sent": "29428", "duration": "1030", "upstream_service_time": "1029",  "x_forwarded_for": "192.169.3.238,10.131.0.2",  "user_agent": "Go-http-client/1.1", "x_request_id": "58c20a74-5ce9-43df-85c6-c61fa24cb4ca",  "x_b3_traceid": "-",  "x_b3_parentspanid": "-", "x_b3_spanid": "-", "x_b3_sampled": "-", "appd_singularityheader": "-",  "authority": "istio-ingressgateway-istio-system.apps.user-rhos-d-08.servicemesh.rhqeaws.com", "upstream_host": "10.128.3.131:9080", "upstream_cluster": "outbound|9080||productpage.bookinfo.svc.cluster.local", "upstream_local_address": "10.131.1.61:35430", "downstream_local_address": "10.131.1.61:8080", "downstream_remote_address": "10.131.0.2:37746", "requested_server_name": "-",  "route_name":"-", "virtual_cluster_name": "-", "upstream_cluster": "outbound|9080||productpage.bookinfo.svc.cluster.local", "request_server_name": "-", "content_type": "text/html; charset=utf-8", "request_content_length": "-",  "response_content_length": "29428",  "downstream_peer_uri_san": "-"  }
      

      on productpage proxy:

      {  "start_time": "2024-07-10T14:12:05.222Z", "method": "GET", "path":"/productpage", "protocol": "HTTP/1.1", "response_code": "500", "response_flags": "-", "response_code_details": "via_upstream", "connection_termination_details": "-", "downstream_transport_failure_reason": "-", "upstream_transport_failure_reason": "-", "bytes_received":"0", "bytes_sent": "29428", "duration": "1026", "upstream_service_time": "1026",  "x_forwarded_for": "192.169.3.238,10.131.0.2",  "user_agent": "Go-http-client/1.1", "x_request_id": "58c20a74-5ce9-43df-85c6-c61fa24cb4ca",  "x_b3_traceid": "-",  "x_b3_parentspanid": "-", "x_b3_spanid": "-", "x_b3_sampled": "-", "appd_singularityheader": "-",  "authority": "istio-ingressgateway-istio-system.apps.user-rhos-d-08.servicemesh.rhqeaws.com", "upstream_host": "10.128.3.131:9080", "upstream_cluster": "inbound|9080||", "upstream_local_address": "127.0.0.6:56667", "downstream_local_address": "10.128.3.131:9080", "downstream_remote_address": "10.131.0.2:0", "requested_server_name": "outbound_.9080_._.productpage.bookinfo.svc.cluster.local",  "route_name":"default", "virtual_cluster_name": "-", "upstream_cluster": "inbound|9080||", "request_server_name": "outbound_.9080_._.productpage.bookinfo.svc.cluster.local", "content_type": "text/html; charset=utf-8", "request_content_length": "-",  "response_content_length": "29428",  "downstream_peer_uri_san": "spiffe://cluster.local/ns/istio-system/sa/istio-ingressgateway-service-account"  }
      

      =======

      The `./install-bookinfo-demo.sh --traffic-generator` script from the reproducer installs bookinfo app from ${ISTIO_DIR}/samples/bookinfo/platform/kube where ISTIO_DIR is downloaded istio 1.20.7 from https://github.com/istio/istio/releases/download/1.20.7/istio-1.20.7-linux-amd64.tar.gz

      =======

      The attachments contain logs from all proxies (istio ingress and all sidecars) from the run

      Requests      [total, rate, throughput]  141, 1.01, 0.98
      Duration      [total, attack, wait]      2m20.03753311s, 2m20.000010211s, 37.522899ms
      Latencies     [mean, 50, 95, 99, max]    86.967573ms, 44.321093ms, 264.406723ms, 1.037286855s, 1.042565675s
      Bytes In      [total, mean]              798802, 5665.26
      Bytes Out     [total, mean]              0, 0.00
      Success       [ratio]                    97.16%
      Status Codes  [code:count]               200:137  500:4  
      Error Set:
      500 Internal Server Error
      

      the "500" errors are in the istio ingress and productpage proxy logs.

            jsantana@redhat.com Jonh Wendell
            mkralik@redhat.com Matej Kralik
            Votes:
            0 Vote for this issue
            Watchers:
            5 Start watching this issue

              Created:
              Updated:
              Resolved: