SOP034 - Wait for BDC to be Healthy
===================================

Blocks until the Big Data Cluster is healthy, or the specified timeout
expires.

The min\_pod\_count parameter is the minimum number of pods that must
exist in the cluster before the health check can pass. Every existing
pod, including any beyond this minimum, must also be healthy; if any
pod is unhealthy, the cluster is not healthy.

Steps
-----

### Parameters

In [None]:
timeout = 600  # maximum time, in seconds, to wait for the cluster to become healthy - default 10 minutes
check_interval = 30  # time, in seconds, between health checks - default 30 seconds
min_pod_count = 10  # minimum number of pods that must exist before the cluster can be reported healthy

### Instantiate Kubernetes client

In [None]:
# Instantiate the Python Kubernetes client into 'api' variable

import os

try:
    from kubernetes import client, config
    from kubernetes.stream import stream

    # When both Kubernetes service environment variables are set, load the
    # in-cluster configuration; otherwise load the local kube config file.
    if "KUBERNETES_SERVICE_PORT" in os.environ and "KUBERNETES_SERVICE_HOST" in os.environ:
        config.load_incluster_config()
    else:
        try:
            config.load_kube_config()
        except Exception:
            # Markdown is not imported by any earlier cell, so import it here;
            # otherwise showing the hint raises NameError and hides the real
            # configuration error.
            from IPython.display import Markdown
            display(Markdown(f'HINT: Use [TSG118 - Configure Kubernetes config](../repair/tsg118-configure-kube-config.ipynb) to resolve this issue.'))
            raise
    api = client.CoreV1Api()

    print('Kubernetes client instantiated')
except ImportError:
    # Show the install hint, then re-raise so the cell still fails
    from IPython.display import Markdown
    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))
    raise

### Get the namespace for the big data cluster

Get the namespace of the Big Data Cluster from the Kubernetes API.

**NOTE:**

If there is more than one Big Data Cluster in the target Kubernetes
cluster, then either:

-   change the \[0\] index in the code below to the index of the correct big data cluster namespace.
-   set the environment variable AZDATA\_NAMESPACE, before starting
    Azure Data Studio.

In [None]:
# Resolve the Kubernetes namespace of the BDC into the 'namespace' variable

if "AZDATA_NAMESPACE" not in os.environ:
    # No override supplied: take the first namespace returned for the
    # 'MSSQL_CLUSTER' label selector
    try:
        namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name
    except IndexError:
        # No matching namespace was returned: show the troubleshooting hints, then re-raise
        from IPython.display import Markdown
        display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))
        display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))
        display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))
        raise
else:
    # The environment variable takes precedence over the API lookup
    namespace = os.environ["AZDATA_NAMESPACE"]

print('The kubernetes namespace for your big data cluster is: ' + namespace)

### Define functions

In [None]:
import threading
import time
import sys
import os
from IPython.display import Markdown

# Flag read by waitReady(): its polling loop keeps running while this is True
isRunning = True


def all_containers_ready(pod):
    """helper method returns true if all the containers within the given pod are ready

    A pod that reports no container statuses at all (container_statuses is
    None) is treated as not ready, rather than raising a TypeError.

    Arguments:
        pod {v1Pod} -- Pod object; only pod.status.container_statuses is read.

    Returns:
        bool -- True when every reported container status has ready set to True.
    """
    statuses = pod.status.container_statuses
    if statuses is None:
        # Nothing reported yet, so readiness cannot be confirmed
        return False

    return all(container.ready is True for container in statuses)


def pod_is_ready(pod):
    """tests that the pod, and all containers are ready

    Pods carrying a "job-name" label are always counted as ready. Any other
    pod is ready only when its phase is "Running" and all_containers_ready()
    holds for it.

    Arguments:
        pod {v1Pod} -- Pod object; reads pod.metadata.labels and pod.status.

    Returns:
        bool -- True when the pod counts as ready.
    """
    # labels may be None when the pod has no labels; treat that as "no labels"
    labels = pod.metadata.labels or {}

    return "job-name" in labels or (pod.status.phase == "Running" and all_containers_ready(pod))


def waitReady():
    """Waits for all pods, and containers to become ready.

    Each pass lists the pods with get_pods(). The cluster counts as healthy
    when at least min_pod_count pods exist and pod_is_ready() holds for every
    one of them. Otherwise the failures are displayed and the check repeats
    after check_interval seconds. Exceptions raised during a check are
    displayed and the check is retried; they do not end the loop.

    Returns:
        True once the cluster is healthy. Falls out of the loop (returning
        None) if the module-level isRunning flag becomes False first.
    """
    while isRunning:
        try:
            pods = get_pods()
            if len(pods.items) >= min_pod_count and all(pod_is_ready(pod) for pod in pods.items):
                return True

            display(Markdown(get_pod_failures(pods)))
            display(Markdown(f"cluster not healthy, rechecking in {check_interval} seconds."))
        except Exception as ex:
            # Best effort: show the error and try again on the next pass
            display(Markdown(str(ex)))

        # Single pause shared by the "not healthy" and the error paths
        time.sleep(check_interval)
            


def get_pod_failures(pods=None):
    """Returns a status message for any pods that are not ready.

    Pods carrying a "job-name" label are skipped. For every other pod, one
    line is produced per container whose ready flag is False; a pod that
    reports no status or no container statuses gets a single pod-level line.

    Arguments:
        pods -- Pod list object exposing an 'items' collection. When omitted,
                the list is fetched with get_pods().

    Returns:
        str -- The messages concatenated, each ending in "<br/>"; an empty
               string when nothing is reported.
    """
    if not pods:
        pods = get_pods()

    messages = []
    for pod in pods.items:
        # labels may be None when the pod has no labels; treat that as "no labels"
        if "job-name" in (pod.metadata.labels or {}):
            continue

        if pod.status and pod.status.container_statuses:
            messages.extend(
                "Container {0} in Pod {1} is not ready. Reported status: {2} <br/>".format(container.name, pod.metadata.name, container.state)
                for container in pod.status.container_statuses
                if container.ready is False
            )
        else:
            messages.append("Pod {0} is not ready.  <br/>".format(pod.metadata.name))

    return "".join(messages)


def get_pods():
    """Lists pods for the module-level 'namespace', or for every namespace when it is None.

    A Markdown note saying which scope is being checked is displayed before
    the API call. Both API calls use a request timeout of 30.

    Returns:
        The pod list object returned by the Kubernetes API client.
    """
    if namespace is None:
        display(Markdown('Checking all namespaces'))
        return api.list_pod_for_all_namespaces(_request_timeout=30)

    display(Markdown(f'Checking namespace {namespace}'))
    return api.list_namespaced_pod(namespace, _request_timeout=30)


# Run the health check on a worker thread so the wait can be bounded by 'timeout'
mt = threading.Thread(target=waitReady)
mt.start()
mt.join(timeout=timeout)

# Thread.isAlive() was removed in Python 3.9; is_alive() is the supported name.
# A thread that is still alive after join() means the wait timed out.
timed_out = mt.is_alive()

# Clear the flag before any exception is raised, so the worker thread leaves
# its polling loop instead of rechecking forever after a timeout.
isRunning = False

if timed_out:
    from IPython.display import Markdown
    display(Markdown("Timeout waiting for cluster to become healthy after {0} seconds.".format(timeout)))
    failures = get_pod_failures()
    display(Markdown("The following pods/containers are not in a healthy state"))
    display(Markdown(failures))
    raise SystemExit("Timeout waiting for pods to become ready.", failures)
else:
    display(Markdown('Cluster is healthy'))


In [None]:
# Final cell: printed only when execution reaches the end of the notebook
print('Notebook execution complete.')