{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"CER050 - Wait for BDC to be Healthy\n",
"===================================\n",
"\n",
"This notebook will wait until the Big Data Cluster has returned to a\n",
"healthy state, after the `Controller` pod and pods that use `PolyBase`\n",
"have been restarted to load the new certificates.\n",
"\n",
"Steps\n",
"-----\n",
"\n",
"### Parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"timeout = 600 # amount of time to wait before cluster is healthy: default to 10 minutes\n",
"check_interval = 30 # amount of time between health checks - default 30 seconds\n",
"min_pod_count = 10 # minimum number of healthy pods required to assert health"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Instantiate Kubernetes client"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"hide_input"
]
},
"outputs": [],
"source": [
"# Instantiate the Python Kubernetes client into 'api' variable\n",
"\n",
"import os\n",
"\n",
"try:\n",
" from kubernetes import client, config\n",
" from kubernetes.stream import stream\n",
"\n",
" if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
" config.load_incluster_config()\n",
" else:\n",
" try:\n",
" config.load_kube_config()\n",
" except:\n",
" display(Markdown(f'HINT: Use [TSG118 - Configure Kubernetes config](../repair/tsg118-configure-kube-config.ipynb) to resolve this issue.'))\n",
" raise\n",
" api = client.CoreV1Api()\n",
"\n",
" print('Kubernetes client instantiated')\n",
"except ImportError:\n",
" from IPython.display import Markdown\n",
" display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
" raise"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get the namespace for the big data cluster\n",
"\n",
"Get the namespace of the Big Data Cluster from the Kuberenetes API.\n",
"\n",
"**NOTE:**\n",
"\n",
"If there is more than one Big Data Cluster in the target Kubernetes\n",
"cluster, then either:\n",
"\n",
"- set \\[0\\] to the correct value for the big data cluster.\n",
"- set the environment variable AZDATA\\_NAMESPACE, before starting\n",
" Azure Data Studio."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"hide_input"
]
},
"outputs": [],
"source": [
"# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
"\n",
"if \"AZDATA_NAMESPACE\" in os.environ:\n",
" namespace = os.environ[\"AZDATA_NAMESPACE\"]\n",
"else:\n",
" try:\n",
" namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
" except IndexError:\n",
" from IPython.display import Markdown\n",
" display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
" display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
" display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
" raise\n",
"\n",
"print('The kubernetes namespace for your big data cluster is: ' + namespace)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import threading\n",
"import time\n",
"import sys\n",
"import os\n",
"from IPython.display import Markdown\n",
"\n",
"isRunning = True\n",
"\n",
"\n",
"def all_containers_ready(pod):\n",
" \"\"\"helper method returns true if all the containers within the given pod are ready\n",
"\n",
" Arguments:\n",
" pod {v1Pod} -- Metadata retrieved from the api call to.\n",
" \"\"\"\n",
" \n",
" return all(map(lambda c: c.ready is True, pod.status.container_statuses))\n",
"\n",
"\n",
"def pod_is_ready(pod):\n",
" \"\"\"tests that the pod, and all containers are ready\n",
"\n",
" Arguments:\n",
" pod {v1Pod} -- Metadata retrieved from api call.\n",
" \"\"\"\n",
"\n",
" return \"job-name\" in pod.metadata.labels or (pod.status.phase == \"Running\" and all_containers_ready(pod))\n",
"\n",
"\n",
"def waitReady():\n",
" \"\"\"Waits for all pods, and containers to become ready.\n",
" \"\"\"\n",
" while isRunning:\n",
" try:\n",
" pods = get_pods()\n",
" allReady = len(pods.items) >= min_pod_count and all(map(pod_is_ready, pods.items))\n",
"\n",
" if allReady:\n",
" return True\n",
" else:\n",
" display(Markdown(get_pod_failures(pods)))\n",
" display(Markdown(f\"cluster not healthy, rechecking in {check_interval} seconds.\"))\n",
"\n",
" time.sleep(check_interval)\n",
" except Exception as ex:\n",
" last_error_message = str(ex)\n",
" display(Markdown(last_error_message))\n",
" time.sleep(check_interval)\n",
" \n",
"\n",
"\n",
"def get_pod_failures(pods=None):\n",
" \"\"\"Returns a status message for any pods that are not ready.\n",
" \"\"\"\n",
" results = \"\"\n",
" if not pods:\n",
" pods = get_pods()\n",
"\n",
" for pod in pods.items:\n",
" if \"job-name\" not in pod.metadata.labels:\n",
" if pod.status and pod.status.container_statuses:\n",
" for container in filter(lambda c: c.ready is False, pod.status.container_statuses):\n",
" results = results + \"Container {0} in Pod {1} is not ready. Reported status: {2}
\".format(container.name, pod.metadata.name, container.state) \n",
" else:\n",
" results = results + \"Pod {0} is not ready.
\".format(pod.metadata.name)\n",
" return results\n",
"\n",
"\n",
"def get_pods():\n",
" \"\"\"Returns a list of pods by namespace, or all namespaces if no namespace is specified\n",
" \"\"\"\n",
" pods = None\n",
" if namespace is not None:\n",
" display(Markdown(f'Checking namespace {namespace}'))\n",
" pods = api.list_namespaced_pod(namespace, _request_timeout=30) \n",
" else:\n",
" display(Markdown('Checking all namespaces'))\n",
" pods = api.list_pod_for_all_namespaces(_request_timeout=30)\n",
" return pods\n",
"\n",
"\n",
"mt = threading.Thread(target=waitReady)\n",
"mt.start()\n",
"mt.join(timeout=timeout)\n",
"\n",
"if mt.isAlive():\n",
" from IPython.display import Markdown\n",
" display(Markdown(\"Timeout waiting for cluster to become healthy after {0} seconds.\".format(timeout)))\n",
" failures = get_pod_failures()\n",
" display(Markdown(\"The following pods/containers are not in a healthy state\"))\n",
" display(Markdown(failures))\n",
" raise SystemExit(\"Timeout waiting for pods to become ready.\", failures)\n",
"else:\n",
" display(Markdown('Cluster is healthy'))\n",
"\n",
"isRunning = False\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('Notebook execution complete.')"
]
}
],
"nbformat": 4,
"nbformat_minor": 5,
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"azdata": {
"side_effects": true
}
}
}