{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"SOP034 - Wait for BDC to be Healthy\n",
"===================================\n",
"\n",
"Blocks until the Big Data Cluster is healthy, or the specified timeout\n",
"expires.\n",
"\n",
"The min\\_pod\\_count parameter indicates that the health check will not\n",
"pass until at least this number of pods exists in the cluster. If any\n",
"existing pods beyond this limit are unhealthy, the cluster is not\n",
"healthy.\n",
"\n",
"Steps\n",
"-----\n",
"\n",
"### Parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"timeout = 600 # amount of time to wait before cluster is healthy: default to 10 minutes\n",
"check_interval = 30 # amount of time between health checks - default 30 seconds\n",
"min_pod_count = 10 # minimum number of healthy pods required to assert health"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Instantiate Kubernetes client"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"hide_input"
]
},
"outputs": [],
"source": [
"# Instantiate the Python Kubernetes client into 'api' variable\n",
"\n",
"import os\n",
"\n",
"try:\n",
" from kubernetes import client, config\n",
" from kubernetes.stream import stream\n",
"\n",
" if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
" config.load_incluster_config()\n",
" else:\n",
" try:\n",
" config.load_kube_config()\n",
" except:\n",
" display(Markdown(f'HINT: Use [TSG118 - Configure Kubernetes config](../repair/tsg118-configure-kube-config.ipynb) to resolve this issue.'))\n",
" raise\n",
" api = client.CoreV1Api()\n",
"\n",
" print('Kubernetes client instantiated')\n",
"except ImportError:\n",
" from IPython.display import Markdown\n",
" display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
" raise"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get the namespace for the big data cluster\n",
"\n",
"Get the namespace of the Big Data Cluster from the Kuberenetes API.\n",
"\n",
"**NOTE:**\n",
"\n",
"If there is more than one Big Data Cluster in the target Kubernetes\n",
"cluster, then either:\n",
"\n",
"- set \\[0\\] to the correct value for the big data cluster.\n",
"- set the environment variable AZDATA\\_NAMESPACE, before starting\n",
" Azure Data Studio."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"hide_input"
]
},
"outputs": [],
"source": [
"# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
"\n",
"if \"AZDATA_NAMESPACE\" in os.environ:\n",
" namespace = os.environ[\"AZDATA_NAMESPACE\"]\n",
"else:\n",
" try:\n",
" namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
" except IndexError:\n",
" from IPython.display import Markdown\n",
" display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
" display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
" display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
" raise\n",
"\n",
"print('The kubernetes namespace for your big data cluster is: ' + namespace)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import threading\n",
"import time\n",
"import sys\n",
"import os\n",
"from IPython.display import Markdown\n",
"\n",
"isRunning = True\n",
"\n",
"\n",
"def all_containers_ready(pod):\n",
" \"\"\"helper method returns true if all the containers within the given pod are ready\n",
"\n",
" Arguments:\n",
" pod {v1Pod} -- Metadata retrieved from the api call to.\n",
" \"\"\"\n",
" \n",
" return all(map(lambda c: c.ready is True, pod.status.container_statuses))\n",
"\n",
"\n",
"def pod_is_ready(pod):\n",
" \"\"\"tests that the pod, and all containers are ready\n",
"\n",
" Arguments:\n",
" pod {v1Pod} -- Metadata retrieved from api call.\n",
" \"\"\"\n",
"\n",
" return \"job-name\" in pod.metadata.labels or (pod.status.phase == \"Running\" and all_containers_ready(pod))\n",
"\n",
"\n",
"def waitReady():\n",
" \"\"\"Waits for all pods, and containers to become ready.\n",
" \"\"\"\n",
" while isRunning:\n",
" try:\n",
" pods = get_pods()\n",
" allReady = len(pods.items) >= min_pod_count and all(map(pod_is_ready, pods.items))\n",
"\n",
" if allReady:\n",
" return True\n",
" else:\n",
" display(Markdown(get_pod_failures(pods)))\n",
" display(Markdown(f\"cluster not healthy, rechecking in {check_interval} seconds.\"))\n",
"\n",
" time.sleep(check_interval)\n",
" except Exception as ex:\n",
" last_error_message = str(ex)\n",
" display(Markdown(last_error_message))\n",
" time.sleep(check_interval)\n",
" \n",
"\n",
"\n",
"def get_pod_failures(pods=None):\n",
" \"\"\"Returns a status message for any pods that are not ready.\n",
" \"\"\"\n",
" results = \"\"\n",
" if not pods:\n",
" pods = get_pods()\n",
"\n",
" for pod in pods.items:\n",
" if \"job-name\" not in pod.metadata.labels:\n",
" if pod.status and pod.status.container_statuses:\n",
" for container in filter(lambda c: c.ready is False, pod.status.container_statuses):\n",
" results = results + \"Container {0} in Pod {1} is not ready. Reported status: {2}
\".format(container.name, pod.metadata.name, container.state) \n",
" else:\n",
" results = results + \"Pod {0} is not ready.
\".format(pod.metadata.name)\n",
" return results\n",
"\n",
"\n",
"def get_pods():\n",
" \"\"\"Returns a list of pods by namespace, or all namespaces if no namespace is specified\n",
" \"\"\"\n",
" pods = None\n",
" if namespace is not None:\n",
" display(Markdown(f'Checking namespace {namespace}'))\n",
" pods = api.list_namespaced_pod(namespace, _request_timeout=30) \n",
" else:\n",
" display(Markdown('Checking all namespaces'))\n",
" pods = api.list_pod_for_all_namespaces(_request_timeout=30)\n",
" return pods\n",
"\n",
"\n",
"mt = threading.Thread(target=waitReady)\n",
"mt.start()\n",
"mt.join(timeout=timeout)\n",
"\n",
"if mt.isAlive():\n",
" from IPython.display import Markdown\n",
" display(Markdown(\"Timeout waiting for cluster to become healthy after {0} seconds.\".format(timeout)))\n",
" failures = get_pod_failures()\n",
" display(Markdown(\"The following pods/containers are not in a healthy state\"))\n",
" display(Markdown(failures))\n",
" raise SystemExit(\"Timeout waiting for pods to become ready.\", failures)\n",
"else:\n",
" display(Markdown('Cluster is healthy'))\n",
"\n",
"isRunning = False\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('Notebook execution complete.')"
]
}
],
"nbformat": 4,
"nbformat_minor": 5,
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"azdata": {
"side_effects": false
}
}
}