jina-ai · jacobowitz · May 6, 2022 · May 3, 2022 · May 3, 2022 · May 4, 2022
diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
@@ -283,6 +283,47 @@ jobs:
           flags: ${{ steps.test.outputs.codecov_flag }}
           fail_ci_if_error: false
 
+  k8s-failures-test:  # dedicated job that runs tests/k8s/test_k8s_failures.py
+    needs: [ commit-lint, lint-flake-8, code-injection ]  # NOTE(review): confirm these job ids exist in cd.yml
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.7'
+      - name: Prepare enviroment
+        run: |
+          docker build -f Dockerfiles/pip.Dockerfile -t jinaai/jina:test-pip .
+          python -m pip install --upgrade pip
+          python -m pip install wheel
+          pip install ".[all]" --no-cache-dir
+          jina
+          export JINA_LOG_LEVEL="ERROR"
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Test k8s
+        run: |
+          curl --proto '=https' --tlsv1.2 -sSfL https://run.linkerd.io/install | sh
+          curl --proto '=https' --tlsv1.2 -sSfL https://linkerd.github.io/linkerd-smi/install | sh
+          pytest -v -s --suppress-no-test-exit-code --force-flaky --min-passes 1 --max-runs 5 --cov=jina --cov-report=xml ./tests/k8s/test_k8s_failures.py
+        timeout-minutes: 30
+        env:
+          JINA_K8S_USE_TEST_PIP: 1
+      - name: Check codecov file
+        id: check_files
+        uses: andstor/file-existence-action@v1
+        with:
+          files: "coverage.xml"
+      - name: Upload coverage from test to Codecov
+        uses: codecov/codecov-action@v2
+        if: steps.check_files.outputs.files_exists == 'true'  # plain expression: upload only when coverage.xml exists
+        with:
+          file: coverage.xml
+          name: k8s-failures-test-codecov  # this job has no matrix, so the name is spelled out
+          flags: ${{ steps.test.outputs.codecov_flag }}  # NOTE(review): no step has id `test` in this job, so this is empty
+          fail_ci_if_error: false
+
   docker-compose-test:
     needs: update-schema
     runs-on: ubuntu-latest
@@ -344,7 +385,7 @@ jobs:
   # just for blocking the merge until all parallel core-test are successful
   success-all-steps:
     runs-on: ubuntu-latest
-    needs: [core-test, import-test, hub-test, k8s-test, docker-compose-test, docker-image-test, benchmark-pre-release, update-schema, update-docker]
+    needs: [core-test, import-test, hub-test, k8s-test, k8s-failures-test, docker-compose-test, docker-image-test, benchmark-pre-release, update-schema, update-docker]
     if: always()
     steps:
       - uses: technote-space/workflow-conclusion-action@v2

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -191,7 +191,48 @@ jobs:
       - name: Test k8s
         run: |
           curl --proto '=https' --tlsv1.2 -sSfL https://run.linkerd.io/install | sh
-          pytest -v -s --suppress-no-test-exit-code --force-flaky --min-passes 1 --max-runs 5 --cov=jina --cov-report=xml ./tests/k8s/test_*.py
+          pytest -v -s --suppress-no-test-exit-code --force-flaky --min-passes 1 --max-runs 5 --cov=jina --cov-report=xml ./tests/k8s/test_k8s.py ./tests/k8s/test_graceful_request_handling.py
+        timeout-minutes: 30
+        env:
+          JINA_K8S_USE_TEST_PIP: 1
+      - name: Check codecov file
+        id: check_files
+        uses: andstor/file-existence-action@v1
+        with:
+          files: "coverage.xml"
+      - name: Upload coverage from test to Codecov  # runs after the `check_files` step above
+        uses: codecov/codecov-action@v2
+        if: steps.check_files.outputs.files_exists == 'true' && ${{ matrix.python-version }} == '3.7'  # NOTE(review): mixing a bare expression with an embedded expression block looks always-true; confirm the job has a matrix and rewrite as one plain expression
+        with:
+          file: coverage.xml
+          name: ${{ matrix.test-path }}-codecov  # NOTE(review): renders as "-codecov" if this job defines no matrix — confirm
+          flags: ${{ steps.test.outputs.codecov_flag }}  # NOTE(review): needs a step with id `test` in this job — confirm
+          fail_ci_if_error: false  # a failed upload does not fail the job
+
+  k8s-failures-test:
+    needs: [ commit-lint, lint-flake-8, code-injection ]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+      - name: Prepare enviroment
+        run: |
+          docker build -f Dockerfiles/pip.Dockerfile -t jinaai/jina:test-pip .
+          python -m pip install --upgrade pip
+          python -m pip install wheel
+          pip install ".[all]" --no-cache-dir
+          jina
+          export JINA_LOG_LEVEL="ERROR"
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Test k8s
+        run: |
+          curl --proto '=https' --tlsv1.2 -sSfL https://run.linkerd.io/install | sh
+          curl --proto '=https' --tlsv1.2 -sSfL https://linkerd.github.io/linkerd-smi/install | sh
+          pytest -v -s --suppress-no-test-exit-code --force-flaky --min-passes 1 --max-runs 5 --cov=jina --cov-report=xml ./tests/k8s/test_k8s_failures.py
         timeout-minutes: 30
         env:
           JINA_K8S_USE_TEST_PIP: 1
@@ -336,7 +377,7 @@ jobs:
   # just for blocking the merge until all parallel core-test are successful
   success-all-test:
     runs-on: ubuntu-latest
-    needs: [core-test, import-test, hub-test, k8s-test, docker-compose-test, docker-image-test, check-docstring, check-black, code-injection]
+    needs: [core-test, import-test, hub-test, k8s-test, k8s-failures-test, docker-compose-test, docker-image-test, check-docstring, check-black, code-injection]
     if: always()
     steps:
       - uses: technote-space/workflow-conclusion-action@v2

diff --git a/jina/serve/networking.py b/jina/serve/networking.py
@@ -636,18 +636,7 @@ async def task_wrapper(
                         timeout=timeout,
                     )
                 except AioRpcError as e:
-                    # connection failures and cancelled requests should be retried
-                    # all other cases should not be retried and will be raised immediately
-                    # connection failures have the code grpc.StatusCode.UNAVAILABLE
-                    # cancelled requests have the code grpc.StatusCode.CANCELLED
-                    # requests usually gets cancelled when the server shuts down
-                    # retries for cancelled requests will hit another replica in K8s
-                    if (
-                        e.code() != grpc.StatusCode.UNAVAILABLE
-                        and e.code() != grpc.StatusCode.CANCELLED
-                    ):
-                        raise
-                    elif e.code() == grpc.StatusCode.UNAVAILABLE and i == 2:
+                    if i == 2:
                         self._logger.debug(f'GRPC call failed, retries exhausted')
                         raise
                     else:

diff --git a/tests/k8s/conftest.py b/tests/k8s/conftest.py
@@ -57,6 +57,39 @@ def _install_linkderd(self, kind_cluster):
 
         print(f'linkerd check yields {out.decode() if out else "nothing"}')
 
+    def install_linkderd_smi(self):
+        """Pipe `linkerd-smi install` into `kubectl apply -f -`, then run `linkerd-smi check`.
+
+        :raises Exception: if `linkerd-smi install` exits with a non-zero code
+        """
+        self._log.info('Installing Linkerd SMI to Cluster...')
+        proc = subprocess.Popen(
+            [f'{Path.home()}/.linkerd2/bin/linkerd-smi', 'install'],
+            stdout=subprocess.PIPE,
+            env={"KUBECONFIG": str(self._cluster.kubeconfig_path)},
+        )
+        kube_out = subprocess.check_output(
+            [str(self._cluster.kubectl_path), 'apply', '-f', '-'],
+            stdin=proc.stdout,
+            env=os.environ,
+        )
+        # close our pipe end, then wait(): poll() may return None and skip the exit-code check
+        proc.stdout.close()
+        returncode = proc.wait()
+        self._log.info(
+            f'Installing Linkerd SMI to Cluster returned code {returncode}, kubectl output was {kube_out}'
+        )
+        if returncode != 0:
+            raise Exception(f"Installing linkerd-smi failed with {returncode}")
+
+        self._log.info('check linkerd-smi status')
+        out = subprocess.check_output(
+            [f'{Path.home()}/.linkerd2/bin/linkerd-smi', 'check'],
+            env=os.environ,
+        )
+
+        print(f'linkerd-smi check yields {out.decode() if out else "nothing"}')
+
     def _set_kube_config(self):
         self._log.info(f'Setting KUBECONFIG to {self._kube_config_path}')
         os.environ['KUBECONFIG'] = self._kube_config_path
@@ -93,6 +126,7 @@ def image_name_tag_map():
         'test-executor': '0.13.1',
         'slow-process-executor': '0.14.1',
         'executor-merger': '0.1.1',
+        'set-text-executor': '0.1.1',
         'jinaai/jina': 'test-pip',
     }
 

diff --git a/tests/k8s/fault-inject.yml b/tests/k8s/fault-inject.yml
@@ -0,0 +1,72 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: error-injector
+  namespace: test-failure-scenarios
+data:  # nginx config: listen on 9090 and answer every request with HTTP 500
+  nginx.conf: |-
+    events {}
+    http {
+        server {
+          listen 9090;
+            location / {
+                return 500;
+            }
+        }
+    }
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: error-injector
+  namespace: test-failure-scenarios
+  labels:
+    app: error-injector
+spec:
+  selector:
+    matchLabels:
+      app: error-injector
+  replicas: 1
+  template:
+    metadata:
+      annotations:
+        linkerd.io/inject: enabled
+      labels:
+        app: error-injector
+    spec:
+      containers:
+        - name: nginx
+          image: nginx:alpine
+          volumeMounts:
+            - name: nginx-config
+              mountPath: /etc/nginx/nginx.conf  # mounts only the nginx.conf key of the ConfigMap
+              subPath: nginx.conf
+      volumes:
+        - name: nginx-config
+          configMap:
+            name: error-injector
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: error-injector
+  namespace: test-failure-scenarios
+spec:
+  ports:
+    - name: service
+      port: 9090  # same port nginx listens on
+  selector:
+    app: error-injector
+---
+apiVersion: split.smi-spec.io/v1alpha1
+kind: TrafficSplit
+metadata:
+  name: error-split
+  namespace: test-failure-scenarios
+spec:
+  service: executor0
+  backends:
+    - service: executor0
+      weight: 950m
+    - service: error-injector
+      weight: 50m  # 50m of 1000m in total: about 5% of the split goes to the always-500 backend
diff --git a/tests/k8s/set-text-executor/.dockerignore b/tests/k8s/set-text-executor/.dockerignore
@@ -0,0 +1,10 @@
+.git
+.venv
+.github
+.pytest_cache
+tests
+__pycache__
+scripts
+env
+executor-clip-deployment.yml
+gateway-deployment.yml
diff --git a/tests/k8s/set-text-executor/Dockerfile b/tests/k8s/set-text-executor/Dockerfile
@@ -0,0 +1,8 @@
+# TODO: pin a fixed jina version for deterministic execution (test-pip is a mutable tag)
+FROM jinaai/jina:test-pip
+
+# copy the build context into /workspace and make it the working directory
+COPY . /workspace
+WORKDIR /workspace
+
+ENTRYPOINT ["jina", "executor", "--uses", "config.yml"]
diff --git a/tests/k8s/set-text-executor/config.yml b/tests/k8s/set-text-executor/config.yml
@@ -0,0 +1,4 @@
+jtype: TagTextExecutor  # executor class, defined in debug_executor.py
+metas:
+  py_modules:
+    - debug_executor.py  # module listed for loading alongside this config
diff --git a/tests/k8s/set-text-executor/debug_executor.py b/tests/k8s/set-text-executor/debug_executor.py
@@ -0,0 +1,19 @@
+import os
+import time
+
+from jina import DocumentArray, Executor, requests
+
+
+class TagTextExecutor(Executor):  # stamps every document with the POD_UID env value
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.pod_uid = os.environ['POD_UID']  # raises KeyError when POD_UID is not set
+
+    @requests
+    def process(self, docs: DocumentArray, *args, **kwargs):  # mutates docs in place and returns them
+        for doc in docs:
+            doc.tags['replica_uid'] = self.pod_uid
+            doc.tags['time'] = time.time()  # time.time() value at processing
+            doc.text += f'_{self.pod_uid}'  # appends `_<pod_uid>` to the existing text
+
+        return docs
diff --git a/tests/k8s/test_k8s.py b/tests/k8s/test_k8s.py
@@ -95,7 +95,12 @@ async def create_all_flow_deployments_and_wait_ready(
                 api_response.status.ready_replicas is not None
                 and api_response.status.ready_replicas == expected_num_replicas
             ):
+                logger.info(f'Deploymnt {deployment_name} is now ready')
                 deployments_ready.append(deployment_name)
+            else:
+                logger.info(
+                    f'Deploymnt {deployment_name} is not ready yet: ready_replicas is {api_response.status.ready_replicas} not equal to {expected_num_replicas}'
+                )
 
         for deployment_name in deployments_ready:
             deployment_names.remove(deployment_name)