@@ -0,0 +1,583 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "TSG024 - Namenode is in safe mode\n",
+    "=================================\n",
+    "\n",
+    "HDFS can get itself into Safe mode. For example, if too many Pods are\n",
+    "recycled too quickly in the Storage Pool, then Safe mode may be\n",
+    "enabled automatically.\n",
+    "\n",
+    "When starting a Spark session, the user may see the following (for\n",
+    "example, when trying to start a PySpark or PySpark3 session in a\n",
+    "notebook from Azure Data Studio):\n",
+    "\n",
+    "> The code failed because of a fatal error: Error sending http request\n",
+    "> and maximum retry encountered..\n",
+    ">\n",
+    "> Some things to try: a) Make sure Spark has enough available resources\n",
+    "> for Jupyter to create a Spark context. b) Contact your Jupyter\n",
+    "> administrator to make sure the Spark magics library is configured\n",
+    "> correctly. c) Restart the kernel.\n",
+    "\n",
+    "Use this notebook to run a report to understand more about HDFS, and\n",
+    "optionally move the cluster out of Safe mode if it is safe to do so.\n",
+    "\n",
+    "Steps\n",
+    "-----\n",
+    "\n",
+    "### Common functions\n",
+    "\n",
+    "Define helper functions used in this notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "hide_input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+    "import sys\n",
+    "import os\n",
+    "import re\n",
+    "import json\n",
+    "import platform\n",
+    "import shlex\n",
+    "import shutil\n",
+    "import datetime\n",
+    "\n",
+    "from subprocess import Popen, PIPE\n",
+    "from IPython.display import Markdown\n",
+    "\n",
+    "retry_hints = {}\n",
+    "error_hints = {}\n",
+    "install_hint = {}\n",
+    "\n",
+    "first_run = True\n",
+    "rules = None\n",
+    "\n",
+    "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+    "    \"\"\"\n",
+    "    Run a shell command, stream stdout, print stderr, and optionally return the output\n",
+    "    \"\"\"\n",
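+    "    # Example usage (illustrative): run('kubectl get pods', return_output=True)\n",
+    "    #\n",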
+    "    MAX_RETRIES = 5\n",
+    "    output = \"\"\n",
+    "    retry = False\n",
+    "\n",
+    "    global first_run\n",
+    "    global rules\n",
+    "\n",
+    "    if first_run:\n",
+    "        first_run = False\n",
+    "        rules = load_rules()\n",
+    "\n",
+    "    # shlex.split is required on bash and for Windows paths with spaces\n",
+    "    #\n",
+    "    cmd_actual = shlex.split(cmd)\n",
+    "\n",
+    "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+    "    #\n",
+    "    user_provided_exe_name = cmd_actual[0].lower()\n",
+    "\n",
+    "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+    "    #\n",
+    "    if cmd.startswith(\"python \"):\n",
+    "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+    "\n",
+    "    # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+    "    # with:\n",
+    "    #\n",
+    "    #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+    "    #\n",
+    "    # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+    "    #\n",
+    "    if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+    "        os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+    "\n",
+    "    # To aid supportability, determine which binary file will actually be executed on the machine\n",
+    "    #\n",
+    "    which_binary = None\n",
+    "\n",
+    "    # Special case for CURL on Windows. The version of CURL in Windows System32 does not work to\n",
+    "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\". If another instance\n",
+    "    # of CURL exists on the machine use that one. (Unfortunately the curl.exe in System32 is almost\n",
+    "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+    "    # look for the 2nd installation of CURL in the path)\n",
+    "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+    "        path = os.getenv('PATH')\n",
+    "        for p in path.split(os.path.pathsep):\n",
+    "            p = os.path.join(p, \"curl.exe\")\n",
+    "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+    "                if p.lower().find(\"system32\") == -1:\n",
+    "                    cmd_actual[0] = p\n",
+    "                    which_binary = p\n",
+    "                    break\n",
+    "\n",
+    "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+    "    # seems to be required for .msi installs of azdata.cmd/az.cmd. (otherwise Popen returns FileNotFound)\n",
+    "    #\n",
+    "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+    "    #\n",
+    "    if which_binary is None:\n",
+    "        which_binary = shutil.which(cmd_actual[0])\n",
+    "\n",
+    "    if which_binary is None:\n",
+    "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+    "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+    "\n",
+    "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+    "    else:\n",
+    "        cmd_actual[0] = which_binary\n",
+    "\n",
+    "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+    "\n",
+    "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+    "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+    "    print(f\"       cwd: {os.getcwd()}\")\n",
+    "\n",
+    "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+    "    # scrolling progress bars, which causes Jupyter to hang forever; to\n",
+    "    # work around this, use no_output=True\n",
+    "    #\n",
+    "\n",
+    "    # Work around an infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+    "    #\n",
+    "    wait = True\n",
+    "\n",
+    "    try:\n",
+    "        if no_output:\n",
+    "            p = Popen(cmd_actual)\n",
+    "        else:\n",
+    "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+    "            with p.stdout:\n",
+    "                for line in iter(p.stdout.readline, b''):\n",
+    "                    line = line.decode()\n",
+    "                    if return_output:\n",
+    "                        output = output + line\n",
+    "                    else:\n",
+    "                        if cmd.startswith(\"azdata notebook run\"):  # Hyperlink the .ipynb file\n",
+    "                            regex = re.compile(' \"(.*)\"\\\\: \"(.*)\"')\n",
+    "                            match = regex.match(line)\n",
+    "                            if match:\n",
+    "                                if match.group(1).find(\"HTML\") != -1:\n",
+    "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+    "                                else:\n",
+    "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+    "\n",
+    "                                    wait = False\n",
+    "                                    break  # otherwise infinite hang, have not worked out why yet.\n",
+    "                        else:\n",
+    "                            print(line, end='')\n",
+    "                            if rules is not None:\n",
+    "                                apply_expert_rules(line)\n",
+    "\n",
+    "        if wait:\n",
+    "            p.wait()\n",
+    "    except FileNotFoundError as e:\n",
+    "        if install_hint is not None:\n",
+    "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+    "\n",
+    "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+    "\n",
+    "    exit_code_workaround = 0  # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+    "\n",
+    "    if not no_output:\n",
+    "        for line in iter(p.stderr.readline, b''):\n",
+    "            line_decoded = line.decode()\n",
+    "\n",
+    "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+    "            # print this as an empty \"STDERR:\" line as it confuses.\n",
+    "            #\n",
+    "            if line_decoded == \"\":\n",
+    "                continue\n",
+    "\n",
+    "            print(f\"STDERR: {line_decoded}\", end='')\n",
+    "\n",
+    "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+    "                exit_code_workaround = 1\n",
+    "\n",
+    "            if user_provided_exe_name in error_hints:\n",
+    "                for error_hint in error_hints[user_provided_exe_name]:\n",
+    "                    if line_decoded.find(error_hint[0]) != -1:\n",
+    "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+    "\n",
+    "            if rules is not None:\n",
+    "                apply_expert_rules(line_decoded)\n",
+    "\n",
+    "            if user_provided_exe_name in retry_hints:\n",
+    "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+    "                    if line_decoded.find(retry_hint) != -1:\n",
+    "                        if retry_count < MAX_RETRIES:\n",
+    "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+    "                            retry_count = retry_count + 1\n",
+    "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+    "\n",
+    "                            if return_output:\n",
+    "                                return output\n",
+    "                            else:\n",
+    "                                return\n",
+    "\n",
+    "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+    "\n",
+    "    # WORKAROUND: We avoid the infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+    "    # don't wait here, if success is known from above\n",
+    "    #\n",
+    "    if wait:\n",
+    "        if p.returncode != 0:\n",
+    "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+    "        else:\n",
+    "            if exit_code_workaround != 0:\n",
+    "                raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+    "\n",
+    "\n",
+    "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+    "\n",
+    "    if return_output:\n",
+    "        return output\n",
+    "\n",
+    "def load_json(filename):\n",
+    "    with open(filename, encoding=\"utf8\") as json_file:\n",
+    "        return json.load(json_file)\n",
+    "\n",
+    "def load_rules():\n",
+    "\n",
+    "    try:\n",
+    "\n",
+    "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+    "        #\n",
+    "        j = load_json(\"tsg024-name-node-is-in-safe-mode.ipynb\")\n",
+    "\n",
+    "    except:\n",
+    "        pass  # If the user has renamed the notebook, we can't load ourselves. NOTE: Is there a way in Jupyter to know your own filename?\n",
+    "\n",
+    "    else:\n",
+    "        if \"metadata\" in j and \\\n",
+    "           \"azdata\" in j[\"metadata\"] and \\\n",
+    "           \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+    "           \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+    "\n",
+    "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+    "\n",
+    "            rules.sort()  # Sort rules, so they run in priority order (the [0] element). Lowest value first.\n",
+    "\n",
+    "            # print(f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+    "\n",
+    "            return rules\n",
+    "\n",
+    "def apply_expert_rules(line):\n",
+    "\n",
+    "    global rules\n",
+    "\n",
+    "    for rule in rules:\n",
+    "\n",
+    "        # Rules that have 9 elements are the injected (output) rules (the ones we want). Rules\n",
+    "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+    "        # not ../repair/tsg029-nb-name.ipynb)\n",
+    "        if len(rule) == 9:\n",
+    "            notebook = rule[1]\n",
+    "            cell_type = rule[2]\n",
+    "            output_type = rule[3]  # i.e. stream or error\n",
+    "            output_type_name = rule[4]  # i.e. ename or name\n",
+    "            output_type_value = rule[5]  # i.e. SystemExit or stdout\n",
+    "            details_name = rule[6]  # i.e. evalue or text\n",
+    "            expression = rule[7].replace(\"\\\\*\", \"*\")  # Something escaped *, and put a \\\\ in front of it!\n",
+    "\n",
+    "            # print(f\"EXPERT: If rule '{expression}' is satisfied, run '{notebook}'.\")\n",
+    "\n",
+    "            if re.match(expression, line, re.DOTALL):\n",
+    "\n",
+    "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{3}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+    "\n",
+    "                match_found = True\n",
+    "\n",
+    "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+    "\n",
+    "\n",
+    "\n",
+    "print('Common functions defined successfully.')\n",
+    "\n",
+    "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+    "#\n",
+    "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+    "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+    "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Instantiate Kubernetes client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "hide_input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Instantiate the Python Kubernetes client into the 'api' variable\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "try:\n",
+    "    from kubernetes import client, config\n",
+    "    from kubernetes.stream import stream\n",
+    "\n",
+    "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+    "        config.load_incluster_config()\n",
+    "    else:\n",
+    "        config.load_kube_config()\n",
+    "\n",
+    "    api = client.CoreV1Api()\n",
+    "\n",
+    "    print('Kubernetes client instantiated')\n",
+    "except ImportError:\n",
+    "    from IPython.display import Markdown\n",
+    "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+    "    raise"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Get the namespace for the big data cluster\n",
+    "\n",
+    "Get the namespace of the big data cluster from the Kubernetes API.\n",
+    "\n",
+    "NOTE: If there is more than one big data cluster in the target\n",
+    "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+    "cluster."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "hide_input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Place the Kubernetes namespace name for the BDC into the 'namespace' variable\n",
+    "\n",
+    "try:\n",
+    "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+    "except IndexError:\n",
+    "    from IPython.display import Markdown\n",
+    "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+    "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+    "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+    "    raise\n",
+    "\n",
+    "print('The Kubernetes namespace for your big data cluster is: ' + namespace)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Get the name of the namenode pod"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
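+    "# Look up the namenode pod name using the role=namenode label selector\n",
+    "\n",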
+    "namenode_pod = run(f'kubectl get pod --selector=role=namenode -n {namespace} -o jsonpath={{.items[0].metadata.name}}', return_output=True)\n",
+    "\n",
+    "print('Namenode pod name: ' + namenode_pod)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Get the `hdfs dfsadmin` report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
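+    "# Exec `hdfs dfsadmin -report` in the namenode pod's 'hadoop' container\n",
+    "\n",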
+    "name = namenode_pod\n",
+    "container = 'hadoop'\n",
+    "\n",
+    "command = 'hdfs dfsadmin -report'\n",
+    "\n",
+    "string = stream(api.connect_get_namespaced_pod_exec, name, namespace, command=['/bin/sh', '-c', command], container=container, stderr=True, stdout=True)\n",
+    "\n",
+    "print(string)"
+   ]
+  },
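+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Optionally, check block health with `hdfs fsck`\n",
+    "\n",
+    "As an optional, illustrative step (not part of the original TSG), the\n",
+    "cell below runs `hdfs fsck /` through the same Kubernetes exec\n",
+    "mechanism, to help identify missing, corrupt or under-replicated\n",
+    "blocks before deciding whether it is safe to leave safe mode. It reuses\n",
+    "the `name`, `namespace` and `container` variables set above, and stores\n",
+    "its output in a new `fsck_string` variable so the report output in\n",
+    "`string` is left intact for the precondition check below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative: run a filesystem check from the HDFS root. This can take\n",
+    "# a while on a large cluster.\n",
+    "command = 'hdfs fsck /'\n",
+    "\n",
+    "fsck_string = stream(api.connect_get_namespaced_pod_exec, name, namespace, command=['/bin/sh', '-c', command], container=container, stderr=True, stdout=True)\n",
+    "\n",
+    "print(fsck_string)"
+   ]
+  },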
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Set the text that identifies this issue"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
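+    "# The text that `hdfs dfsadmin -report` emits while the namenode is in safe mode\n",
+    "\n",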
+    "precondition_text = \"Safe mode is ON\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### PRECONDITION CHECK"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if precondition_text not in string:\n",
+    "    raise Exception(\"PRECONDITION NON-MATCH: 'tsg024-name-node-is-in-safe-mode' is not a match for an active problem\")\n",
+    "\n",
+    "print(\"PRECONDITION MATCH: 'tsg024-name-node-is-in-safe-mode' is a match for an active problem in this cluster\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Resolution\n",
+    "----------\n",
+    "\n",
+    "NOTE: It is only safe to take the namenode out of safe mode once it is\n",
+    "determined that there are no missing, corrupt or under-replicated blocks\n",
+    "that should not be ignored. Use `hdfs dfsadmin -report` and `hdfs fsck`\n",
+    "to understand more about missing, corrupt or under-replicated blocks.\n",
+    "\n",
+    "### Move the namenode out of safe mode"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
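+    "# Ask the namenode to leave safe mode; only run this after reviewing the NOTE above\n",
+    "\n",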
+    "command = 'hdfs dfsadmin -safemode leave'\n",
+    "\n",
+    "string = stream(api.connect_get_namespaced_pod_exec, name, namespace, command=['/bin/sh', '-c', command], container=container, stderr=True, stdout=True)\n",
+    "\n",
+    "print(string)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Validate - verify the namenode is no longer in safe mode\n",
+    "\n",
+    "Validate that the text \u2018Safe mode is ON\u2019 is no longer in the\n",
+    "`hdfs dfsadmin -report` output."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
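+    "# Re-run the report and confirm that 'Safe mode is ON' no longer appears\n",
+    "\n",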
+    "command = 'hdfs dfsadmin -report'\n",
+    "\n",
+    "string = stream(api.connect_get_namespaced_pod_exec, name, namespace, command=['/bin/sh', '-c', command], container=container, stderr=True, stdout=True)\n",
+    "\n",
+    "if precondition_text in string:\n",
+    "    raise SystemExit('FAILED - hdfs dfsadmin -report output still contains: ' + precondition_text)\n",
+    "\n",
+    "print('SUCCESS - hdfs dfsadmin -report output no longer contains: ' + precondition_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('Notebook execution complete.')"
+   ]
+  }
+ ],
+ "nbformat": 4,
+ "nbformat_minor": 5,
+ "metadata": {
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3"
+  },
+  "azdata": {
+   "side_effects": false
+  }
+ }
+}