# Parameters for this notebook (the cell carries the 'parameters' tag).
#
# Defines which pod/container to read, which log files to tail and which
# log lines are interesting enough to keep for later analysis.
import re
from datetime import datetime, timezone

# Number of lines to read from the end of each log file.
tail_lines = 500

# The controller log files are kept in a yyyy-mm-dd folder structure
#
# `datetime.utcnow()` is deprecated (Python 3.12+) and returns a naive value;
# an aware UTC timestamp gives the same calendar date without the warning.
d = datetime.now(timezone.utc)
date = "{0}-{1:02d}-{2:02d}".format(d.year, d.month, d.day)
folder = f"/var/log/controller/{date}"

pod = None # All
container = 'controller'
log_files = [
    f'{folder}/controller.log',
    f'{folder}/kube.log',
    f'{folder}/controller.out',
    f'{folder}/access.log',
]

# A line is kept for analysis when, after its first 26 characters, it
# continues with " WARN " or " ERROR ".
expressions_to_analyze = [
    re.compile(".{26} WARN "),
    re.compile(".{26} ERROR ")
]

print("Log files to get:")
print(log_files)
# --- Cell: Instantiate Kubernetes client -----------------------------------
# Instantiate the Python Kubernetes client into 'api' variable

import os
from IPython.display import Markdown

try:
    from kubernetes import client, config
    from kubernetes.stream import stream

    # Both service variables present -> use the in-cluster configuration,
    # otherwise fall back to the local kube config file.
    in_cluster_variables = ("KUBERNETES_SERVICE_PORT", "KUBERNETES_SERVICE_HOST")
    if all(variable in os.environ for variable in in_cluster_variables):
        config.load_incluster_config()
    else:
        try:
            config.load_kube_config()
        except:
            # Show the repair hint, then let the original error propagate.
            kube_config_hint = 'HINT: Use [TSG118 - Configure Kubernetes config](../repair/tsg118-configure-kube-config.ipynb) to resolve this issue.'
            display(Markdown(kube_config_hint))
            raise

    api = client.CoreV1Api()
    print('Kubernetes client instantiated')
except ImportError:
    # Show the install hint, then let the ImportError propagate.
    install_module_hint = 'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'
    display(Markdown(install_module_hint))
    raise

# --- Cell: Get the namespace for the big data cluster ----------------------
# Place Kubernetes namespace name for BDC into 'namespace' variable

# The AZDATA_NAMESPACE environment variable wins when it is set; otherwise
# take the first namespace returned for the 'MSSQL_CLUSTER' label selector.
namespace = os.environ.get("AZDATA_NAMESPACE")

if namespace is None:
    try:
        labelled_namespaces = api.list_namespace(label_selector='MSSQL_CLUSTER')
        namespace = labelled_namespaces.items[0].metadata.name
    except IndexError:
        from IPython.display import Markdown

        # No labelled namespace found: point at the guides that help, then
        # re-raise the IndexError.
        namespace_hints = (
            'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.',
            'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.',
            'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.',
        )
        for hint_text in namespace_hints:
            display(Markdown(hint_text))
        raise

print('The kubernetes namespace for your big data cluster is: ' + namespace)
# Display the last 'tail_lines' of files in 'log_files' list
#
# Every retrieved line is echoed. Lines matching any pattern in
# 'expressions_to_analyze' are also collected in 'entries_for_analysis'.
# Relies on 'api', 'stream', 'namespace', 'pod', 'container', 'log_files',
# 'tail_lines' and 'expressions_to_analyze' defined by earlier cells.

pods = api.list_namespaced_pod(namespace)

entries_for_analysis = []

for p in pods.items:
    # 'pod' of None means "all pods"; otherwise only the named pod.
    if pod is not None and p.metadata.name != pod:
        continue

    for c in p.spec.containers:
        # 'container' of None means "all containers" in the pod.
        if container is not None and c.name != container:
            continue

        for log_file in log_files:
            print (f"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'")
            try:
                output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)
            except Exception as error:
                # Best effort: keep going with the other files, but say why
                # this one failed instead of discarding the reason.
                print (f"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}")
                print (f"  REASON: {type(error).__name__}: {error}")
            else:
                for line in output.split('\n'):
                    # any() records a line once even if several patterns
                    # match it, so later analysis does not emit duplicate
                    # hints for the same log entry.
                    if any(expression.match(line) for expression in expressions_to_analyze):
                        entries_for_analysis.append(line)
                    print(line)

print("")
print(f"{len(entries_for_analysis)} log entries found for further analysis.")
def get_notebook_name():
    """Return the path of the running notebook, or None if it is unknown.

    Some runtimes (e.g. ADS) have the kernel_id in the filename of the
    connection file. If so, the notebook name at runtime can be determined
    using `list_running_servers`. Other runtimes (e.g. azdata) do not have
    the kernel_id in the filename of the connection file, therefore we are
    unable to establish the filename.

    Returns:
        The 'path' value of the session (from a running server's
        'api/sessions' endpoint) whose kernel id equals the id taken from
        the connection file name, or None when no such session is found.
    """
    connection_file = os.path.basename(ipykernel.get_connection_file())

    # The kernel id is the text between the first '-' and the next '.' of
    # the connection file name. No '-' in the name means no kernel id.
    try:
        kernel_id = connection_file.split('-', 1)[1].split('.')[0]
    except IndexError:
        return None

    for server in list(notebookapp.list_running_servers()):
        # Best-effort probe: a server that cannot be reached, or that does
        # not answer with JSON, is skipped. `Exception` (not a bare except)
        # keeps KeyboardInterrupt/SystemExit working.
        # NOTE(review): the 10 ms timeout presumably assumes local servers -
        # confirm before changing.
        try:
            response = requests.get(urljoin(server['url'], 'api/sessions'), params={'token': server.get('token', '')}, timeout=.01)
            sessions = json.loads(response.text)
        except Exception:
            continue

        # An error payload (e.g. a JSON object instead of a session list)
        # carries no sessions to inspect.
        if not isinstance(sessions, list):
            continue

        for session in sessions:
            if not isinstance(session, dict):
                continue
            kernel = session.get('kernel')
            if isinstance(kernel, dict) and kernel.get('id') == kernel_id:
                return session.get('path')

    return None

def load_json(filename):
    """Open `filename` as UTF-8 text and return its parsed JSON content."""
    with open(filename, encoding="utf8") as json_file:
        return json.load(json_file)

def get_notebook_rules():
    """Load the notebook rules from the metadata of this notebook (in the .ipynb file)

    Returns:
        None when the notebook file name cannot be determined; otherwise the
        value stored at metadata -> azdata -> expert -> log_analyzer_rules,
        or an empty list when any key on that path is missing.
    """
    file_name = get_notebook_name()

    if file_name is None:
        return None

    node = load_json(file_name)

    # Walk the key path one level at a time so that a notebook lacking any
    # level (including 'metadata' itself) yields "no rules", not a KeyError.
    for key in ("metadata", "azdata", "expert", "log_analyzer_rules"):
        if not isinstance(node, dict) or key not in node:
            return []
        node = node[key]

    return node
# Apply the notebook's log analyzer rules to the collected log entries and
# show a HINT (a link to a troubleshooting guide) for every rule that matches.
# Each rule is a sequence: the text to look for is at index 0, the link title
# at index 2 and the link target at index 3.

def _squash_whitespace(text):
    """Collapse every run of whitespace (newlines included) into one space."""
    return " ".join(text.split())

rules = get_notebook_rules()

if rules is None:
    print("")
    print(f"Log Analysis only available when run in Azure Data Studio. Not available when run in azdata.")
else:
    print(f"Applying the following {len(rules)} rules to {len(entries_for_analysis)} log entries for analysis, looking for HINTs to further troubleshooting.")
    print(rules)
    hints = 0
    if len(rules) > 0:
        # Entries are single log lines, so rule text that contains a line
        # break could never be found with a plain substring search. Compare
        # whitespace-normalised text on both sides instead.
        searchable_rules = [(_squash_whitespace(rule[0]), rule) for rule in rules]

        for entry in entries_for_analysis:
            searchable_entry = _squash_whitespace(entry)
            for rule_text, rule in searchable_rules:
                if rule_text in searchable_entry:
                    print (entry)

                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))
                    hints = hints + 1

    print("")
    print(f"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.")

# --- Cell: completion marker ------------------------------------------------
print('Notebook execution complete.')