Add tsg books

barbaravaldez committed 5 years ago
commit b5c29867d6
100 changed files with 28382 additions and 0 deletions
  1. BIN       Troubleshooting-Notebooks/Big-Data-Clusters/CU1/.DS_Store
  2. +2 -0     Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/_config.yml
  3. +280 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/_data/toc.yml
  4. +21 -0    Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/readme.md
  5. +335 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop005-az-login.ipynb
  6. +335 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop006-az-logout.ipynb
  7. +291 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop007-get-key-version-information.ipynb
  8. +321 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop011-set-kubernetes-context.ipynb
  9. +293 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop028-azdata-login.ipynb
  10. +344 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop033-azdata-logout.ipynb
  11. +213 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop034-wait-cluster-healthly.ipynb
  12. +29 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/readme.md
  13. +317 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg027-observe-bdc-create.ipynb
  14. +169 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg029-find-dumps-in-the-cluster.ipynb
  15. +147 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg032-get-cpu-and-memory-for-all-containers.ipynb
  16. +319 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg037-determine-primary-master-replica.ipynb
  17. +469 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg055-time-curl-to-sparkhead.ipynb
  18. +441 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg060-get-disk-space-for-all-pvcs.ipynb
  19. +293 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg078-is-cluster-healthy.ipynb
  20. +293 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg079-generate-controller-core-dump.ipynb
  21. +135 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg086-run-top-for-all-containers.ipynb
  22. +414 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg087-use-hadoop-fs.ipynb
  23. +417 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg108-controller-failed-to-upgrade.ipynb
  24. +33 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/readme.md
  25. +332 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop008-brew-install-odbc-for-sql-server.ipynb
  26. +318 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop010-upgrade-bdc.ipynb
  27. +351 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop036-install-kubectl.ipynb
  28. +344 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop037-uninstall-kubectl.ipynb
  29. +341 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop038-install-az.ipynb
  30. +332 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop039-uninstall-az.ipynb
  31. +334 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop040-upgrade-pip.ipynb
  32. +352 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop054-uninstall-azdata.ipynb
  33. +291 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop055-install-azdata.ipynb
  34. +350 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop059-install-kubernetes-module.ipynb
  35. +350 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop060-uninstall-kubernetes-module.ipynb
  36. +364 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop062-install-ipython-sql-module.ipynb
  37. +62 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop069-install-odbc-driver-for-sql-server.ipynb
  38. +39 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/readme.md
  39. +281 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg030-get-errorlog-from-all-pods.ipynb
  40. +279 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg031-get-polybase-logs-for-all-pods.ipynb
  41. +283 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg034-get-livy-logs.ipynb
  42. +283 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg035-get-sparkhistory-logs.ipynb
  43. +308 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg036-get-controller-logs.ipynb
  44. +320 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg046-get-knox-logs.ipynb
  45. +277 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg073-get-influxdb-logs.ipynb
  46. +280 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg076-get-elastic-search-logs.ipynb
  47. +277 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg077-get-kibana-logs.ipynb
  48. +282 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg088-get-datanode-logs.ipynb
  49. +280 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg090-get-nodemanager-logs.ipynb
  50. +280 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg092-get-all-supervisord-log-tails.ipynb
  51. +254 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg093-get-all-agent-log-tails.ipynb
  52. +277 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg094-get-grafana-logs.ipynb
  53. +280 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg095-get-namenode-logs.ipynb
  54. +280 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg096-get-zookeeper-logs.ipynb
  55. +19 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/readme.md
  56. +291 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/tsg001-copy-logs.ipynb
  57. +193 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/tsg061-tail-bdc-container-logs.ipynb
  58. +188 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/tsg062-tail-bdc-previous-container-logs.ipynb
  59. +337 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/tsg083-run-kubectl-cluster-info-dump.ipynb
  60. +66 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/tsg084-internal-query-process-error.ipynb
  61. +58 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/tsg091-get-azdata-logs.ipynb
  62. +29 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/readme.md
  63. +291 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg003-show-spark-sessions.ipynb
  64. +332 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg004-show-app-list.ipynb
  65. +332 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg012-azdata-bdc-status.ipynb
  66. +311 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg013-azdata-bdc-hdfs-ls.ipynb
  67. +332 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg014-azdata-bdc-endpoint-list.ipynb
  68. +332 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg017-azdata-bdc-config-show.ipynb
  69. +332 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg033-azdata-bdc-sql-status.ipynb
  70. +332 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg049-azdata-bdc-control-status.ipynb
  71. +332 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg068-azdata-bdc-hdfs-status.ipynb
  72. +332 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg069-azdata-bdc-gateway-status.ipynb
  73. +311 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg070-use-azdata-sql-query.ipynb
  74. +51 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/readme.md
  75. +351 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg006-view-system-pod-status.ipynb
  76. +390 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg007-view-bdc-pod-status.ipynb
  77. +337 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg008-get-k8s-version-info.ipynb
  78. +351 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg009-get-nodes.ipynb
  79. +348 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg010-get-kubernetes-contexts.ipynb
  80. +371 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg015-view-k8s-services-for-bdc.ipynb
  81. +418 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg016-describe-all-pods-in-bdc-namespace.ipynb
  82. +371 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg020-describe-all-nodes.ipynb
  83. +337 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg021-get-k8s-cluster-info.ipynb
  84. +337 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg022-get-external-ip-of-kubeadm-host.ipynb
  85. +393 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg023-run-kubectl-get-all.ipynb
  86. +485 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg042-get-hosting-node-and-data-log-mount.ipynb
  87. +337 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg063-get-storage-classes.ipynb
  88. +376 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg064-get-persistent-volume-claims.ipynb
  89. +376 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg065-get-secrets-for-bdc-namespace.ipynb
  90. +408 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg066-get-kubernetes-events.ipynb
  91. +338 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg072-get-persistent-volumes.ipynb
  92. +356 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb
  93. +436 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg089-describe-non-running-pods-in-bdc.ipynb
  94. +374 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg097-get-statefulsets.ipynb
  95. +374 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg098-get-replicasets.ipynb
  96. +377 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg099-get-daemonsets.ipynb
  97. +20 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/readme.md
  98. +33 -0   Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/repair/readme.md
  99. +541 -0  Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/repair/tsg024-name-node-is-in-safe-mode.ipynb
  100. +154 -0 Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/repair/tsg028-restart-nodemanager-in-storage-pool.ipynb

BIN
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/.DS_Store


+ 2 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/_config.yml

@@ -0,0 +1,2 @@
+title: Operations and Support - SQL Server 2019 Big Data Clusters
+description: A collection of notebooks to help operate and support SQL Server Big Data Clusters.

+ 280 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/_data/toc.yml

@@ -0,0 +1,280 @@
+- title: Welcome
+  url: /readme
+  not_numbered: true
+- title: Search
+  search: true
+
+- title: Troubleshooters
+  url: /troubleshooters/readme
+  not_numbered: true
+  expand_sections: true
+  sections:
+  - title: TSG100 - The Big Data Cluster troubleshooter
+    url: troubleshooters/tsg100-troubleshoot-bdc
+  - title: TSG101 - SQL Server troubleshooter
+    url: troubleshooters/tsg101-troubleshoot-sql-server
+  - title: TSG102 - HDFS troubleshooter
+    url: troubleshooters/tsg102-troubleshoot-hdfs
+  - title: TSG103 - Spark troubleshooter
+    url: troubleshooters/tsg103-troubleshoot-spark
+  - title: TSG104 - Control troubleshooter
+    url: troubleshooters/tsg104-troubleshoot-control
+  - title: TSG105 - Gateway troubleshooter
+    url: troubleshooters/tsg105-troubleshoot-gateway
+  - title: TSG106 - App troubleshooter
+    url: troubleshooters/tsg106-troubleshoot-app
+- title: Log Analyzers
+  url: /log-analyzers/readme
+  not_numbered: true
+  expand_sections: true
+  sections:
+  - title: TSG046 - Knox gateway logs
+    url: log-analyzers/tsg046-get-knox-logs
+  - title: TSG036 - Controller logs
+    url: log-analyzers/tsg036-get-controller-logs
+  - title: TSG034 - Livy logs
+    url: log-analyzers/tsg034-get-livy-logs
+  - title: TSG035 - Spark History logs
+    url: log-analyzers/tsg035-get-sparkhistory-logs
+  - title: TSG030 - SQL Server errorlog files
+    url: log-analyzers/tsg030-get-errorlog-from-all-pods
+  - title: TSG031 - SQL Server PolyBase logs
+    url: log-analyzers/tsg031-get-polybase-logs-for-all-pods
+  - title: TSG095 - Hadoop namenode logs
+    url: log-analyzers/tsg095-get-namenode-logs
+  - title: TSG090 - Yarn nodemanager logs
+    url: log-analyzers/tsg090-get-nodemanager-logs
+  - title: TSG088 - Hadoop datanode logs
+    url: log-analyzers/tsg088-get-datanode-logs
+  - title: TSG096 - Zookeeper logs
+    url: log-analyzers/tsg096-get-zookeeper-logs
+  - title: TSG073 - InfluxDB logs
+    url: log-analyzers/tsg073-get-influxdb-logs
+  - title: TSG076 - Elastic Search logs
+    url: log-analyzers/tsg076-get-elastic-search-logs
+  - title: TSG077 - Kibana logs
+    url: log-analyzers/tsg077-get-kibana-logs
+  - title: TSG092 - Supervisord log tail for all containers in BDC
+    url: log-analyzers/tsg092-get-all-supervisord-log-tails
+  - title: TSG093 - Agent log tail for all containers in BDC
+    url: log-analyzers/tsg093-get-all-agent-log-tails
+  - title: TSG094 - Grafana logs
+    url: log-analyzers/tsg094-get-grafana-logs
+- title: Diagnose
+  url: /diagnose/readme
+  not_numbered: true
+  expand_sections: true
+  sections:
+  - title: TSG027 - Observe cluster deployment
+    url: diagnose/tsg027-observe-bdc-create
+  - title: TSG078 - Is cluster healthy
+    url: diagnose/tsg078-is-cluster-healthy
+  - title: TSG029 - Find dumps in the cluster
+    url: diagnose/tsg029-find-dumps-in-the-cluster
+  - title: TSG032 - CPU and Memory usage for all containers
+    url: diagnose/tsg032-get-cpu-and-memory-for-all-containers
+  - title: TSG060 - Persistent Volume disk space for all BDC PVCs
+    url: diagnose/tsg060-get-disk-space-for-all-pvcs
+  - title: TSG087 - Use hadoop fs CLI on nmnode pod
+    url: diagnose/tsg087-use-hadoop-fs
+  - title: TSG037 - Determine master pool pod hosting primary replica
+    url: diagnose/tsg037-determine-primary-master-replica
+  - title: TSG055 - Time Curl to Sparkhead
+    url: diagnose/tsg055-time-curl-to-sparkhead
+  - title: TSG079 - Generate `controller` core dump
+    url: diagnose/tsg079-generate-controller-core-dump
+  - title: TSG086 - Run `top` in all containers
+    url: diagnose/tsg086-run-top-for-all-containers
+  - title: TSG108 - View the controller upgrade config map
+    url: diagnose/tsg108-controller-failed-to-upgrade
+- title: Repair 
+  url: /repair/readme
+  not_numbered: true
+  expand_sections: false
+  sections:
+  - title: TSG024 - Namenode is in safe mode
+    url: repair/tsg024-name-node-is-in-safe-mode
+  - title: TSG041 - Unable to create a new asynchronous I/O context (increase sysctl fs.aio-max-nr)
+    url: repair/tsg041-increase-fs-aio-max-nr
+  - title: TSG048 - Deployment stuck at "Waiting for controller pod to be up"
+    url: repair/tsg048-create-stuck-waiting-for-controller
+  - title: TSG038 - BDC create failures due to - doc is missing key
+    url: repair/tsg038-doc-is-missing-key-error
+  - title: TSG047 - ConfigException - Expected only one object with name
+    url: repair/tsg047-expected-only-one-object-with-name
+  - title: TSG050 - Cluster create hangs with "timeout expired waiting for volumes to attach or mount for pod"
+    url: repair/tsg050-timeout-expired-waiting-for-volumes
+  - title: TSG057 - Failed when starting controller service. System.TimeoutException
+    url: repair/tsg057-failed-when-starting-controller
+  - title: TSG067 - Failed to complete kube config setup
+    url: repair/tsg067-failed-to-complete-kube-config-setup
+  - title: TSG075 - FailedCreatePodSandBox due to NetworkPlugin cni failed to set up pod
+    url: repair/tsg075-networkplugin-cni-failed-to-setup-pod
+  - title: TSG110 - Azdata returns ApiError
+    url: repair/tsg110-azdata-returns-apierror
+  - title: TSG028 - Restart node manager on all storage pool nodes
+    url: repair/tsg028-restart-nodemanager-in-storage-pool
+  - title: TSG045 - The maximum number of data disks allowed to be attached to a VM of this size (AKS)
+    url: repair/tsg045-max-number-data-disks-allowed
+  - title: TSG109 - Set upgrade timeouts
+    url: repair/tsg109-upgrade-stalled
+- title: Monitor - Big Data Cluster
+  url: /monitor-bdc/readme
+  not_numbered: true
+  expand_sections: true
+  sections:
+  - title: TSG014 - Show BDC endpoints
+    url: monitor-bdc/tsg014-azdata-bdc-endpoint-list
+  - title: TSG012 - Show BDC Status
+    url: monitor-bdc/tsg012-azdata-bdc-status
+  - title: TSG069 - Show Big Data Cluster Gateway status
+    url: monitor-bdc/tsg069-azdata-bdc-gateway-status
+  - title: TSG049 - Show BDC Controller status
+    url: monitor-bdc/tsg049-azdata-bdc-control-status
+  - title: TSG033 - Show BDC SQL status
+    url: monitor-bdc/tsg033-azdata-bdc-sql-status
+  - title: TSG068 - Show BDC HDFS status
+    url: monitor-bdc/tsg068-azdata-bdc-hdfs-status
+  - title: TSG017 - Show BDC Configuration
+    url: monitor-bdc/tsg017-azdata-bdc-config-show
+  - title: TSG004 - Show BDC Apps
+    url: monitor-bdc/tsg004-show-app-list
+  - title: TSG003 - Show BDC Spark sessions
+    url: monitor-bdc/tsg003-show-spark-sessions
+  - title: TSG013 - Show file list in Storage Pool (HDFS)
+    url: monitor-bdc/tsg013-azdata-bdc-hdfs-ls
+  - title: TSG070 - Query SQL master pool
+    url: monitor-bdc/tsg070-use-azdata-sql-query
+- title: Monitor - Kubernetes
+  url: /monitor-k8s/readme
+  not_numbered: true
+  expand_sections: false
+  sections:
+  - title: TSG021 - Get cluster info (Kubernetes)
+    url: monitor-k8s/tsg021-get-k8s-cluster-info
+  - title: TSG008 - Get version information (Kubernetes)
+    url: monitor-k8s/tsg008-get-k8s-version-info
+  - title: TSG081 - Get namespaces (Kubernetes)
+    url: monitor-k8s/tsg081-get-kubernetes-namespaces
+  - title: TSG009 - Get nodes (Kubernetes)
+    url: monitor-k8s/tsg009-get-nodes
+  - title: TSG006 - Get system pod status
+    url: monitor-k8s/tsg006-view-system-pod-status
+  - title: TSG007 - Get BDC pod status
+    url: monitor-k8s/tsg007-view-bdc-pod-status
+  - title: TSG015 - View BDC services (Kubernetes)
+    url: monitor-k8s/tsg015-view-k8s-services-for-bdc
+  - title: TSG097 - Get BDC stateful sets (Kubernetes)
+    url: monitor-k8s/tsg097-get-statefulsets
+  - title: TSG098 - Get BDC replicasets (Kubernetes)
+    url: monitor-k8s/tsg098-get-replicasets
+  - title: TSG099 - Get BDC daemonsets (Kubernetes)
+    url: monitor-k8s/tsg099-get-daemonsets
+  - title: TSG023 - Get all BDC objects (Kubernetes)
+    url: monitor-k8s/tsg023-run-kubectl-get-all
+  - title: TSG063 - Get storage classes (Kubernetes)
+    url: monitor-k8s/tsg063-get-storage-classes
+  - title: TSG072 - Get Persistent Volumes (Kubernetes)
+    url: monitor-k8s/tsg072-get-persistent-volumes
+  - title: TSG064 - Get BDC Persistent Volume Claims
+    url: monitor-k8s/tsg064-get-persistent-volume-claims
+  - title: TSG065 - Get BDC secrets (Kubernetes)
+    url: monitor-k8s/tsg065-get-secrets-for-bdc-namespace
+  - title: TSG066 - Get BDC events (Kubernetes)
+    url: monitor-k8s/tsg066-get-kubernetes-events
+  - title: TSG020 - Describe nodes (Kubernetes)
+    url: monitor-k8s/tsg020-describe-all-nodes
+  - title: TSG016 - Describe BDC pods
+    url: monitor-k8s/tsg016-describe-all-pods-in-bdc-namespace
+  - title: TSG089 - Describe BDC non-running pods
+    url: monitor-k8s/tsg089-describe-non-running-pods-in-bdc
+  - title: TSG010 - Get configuration contexts
+    url: monitor-k8s/tsg010-get-kubernetes-contexts
+  - title: TSG022 - Get external IP address for kubeadm host
+    url: monitor-k8s/tsg022-get-external-ip-of-kubeadm-host
+  - title: TSG042 - Get `node name` and external mounts for `Data` and `Logs` `PVCs`
+    url: monitor-k8s/tsg042-get-hosting-node-and-data-log-mount
+- title: Logs
+  url: /log-files/readme
+  not_numbered: true
+  expand_sections: false
+  sections:
+  - title: TSG001 - Run azdata copy-logs
+    url: log-files/tsg001-copy-logs
+  - title: TSG091 - Get the azdata CLI logs
+    url: log-files/tsg091-get-azdata-logs
+  - title: TSG083 - Run kubectl cluster-info dump
+    url: log-files/tsg083-run-kubectl-cluster-info-dump
+  - title: TSG061 - Get tail of all container logs for pods in BDC namespace
+    url: log-files/tsg061-tail-bdc-container-logs
+  - title: TSG062 - Get tail of all previous container logs for pods in BDC namespace
+    url: log-files/tsg062-tail-bdc-previous-container-logs
+  - title: TSG084 - Internal Query Processor Error
+    url: log-files/tsg084-internal-query-process-error
+- title: Samples 
+  url: /sample/readme
+  not_numbered: true
+  expand_sections: false
+  sections:
+  - title: SAM001 - Storage Pool (1 of 2) - Load sample data
+    url: sample/sam001-load-sample-data-into-bdc
+  - title: SAM002 - Storage Pool (2 of 2) - Query HDFS
+    url: sample/sam002-query-hdfs-in-sql-server
+  - title: SAM003 - Data Pool Example
+    url: sample/sam003-data-pool
+  - title: SAM008 - Spark using azdata
+    url: sample/sam008-spark-using-azdata
+  - title: SAM009 - HDFS using azdata
+    url: sample/sam009-hdfs-using-azdata
+  - title: SAM010 - App using azdata
+    url: sample/sam010-app-using-azdata
+- title: Install 
+  url: /install/readme
+  not_numbered: true
+  expand_sections: false
+  sections:
+  - title: SOP036 - Install kubectl command line interface
+    url: install/sop036-install-kubectl
+  - title: SOP037 - Uninstall kubectl command line interface
+    url: install/sop037-uninstall-kubectl
+  - title: SOP059 - Install Kubernetes Python module
+    url: install/sop059-install-kubernetes-module
+  - title: SOP060 - Uninstall kubernetes module
+    url: install/sop060-uninstall-kubernetes-module
+  - title: SOP062 - Install ipython-sql and pyodbc modules
+    url: install/sop062-install-ipython-sql-module
+  - title: SOP055 - Install azdata command line interface
+    url: install/sop055-install-azdata
+  - title: SOP054 - Uninstall azdata command line interface
+    url: install/sop054-uninstall-azdata
+  - title: SOP038 - Install azure command line interface
+    url: install/sop038-install-az
+  - title: SOP039 - Uninstall azure command line interface
+    url: install/sop039-uninstall-az
+  - title: SOP040 - Upgrade pip in ADS Python sandbox
+    url: install/sop040-upgrade-pip
+  - title: SOP069 - Install ODBC for SQL Server
+    url: install/sop069-install-odbc-driver-for-sql-server
+  - title: SOP008 - Install unixodbc for Mac
+    url: install/sop008-brew-install-odbc-for-sql-server
+  - title: SOP010 - Upgrade a big data cluster
+    url: install/sop010-upgrade-bdc
+- title: Common 
+  url: /common/readme
+  not_numbered: true
+  expand_sections: false
+  sections:
+  - title: SOP028 - azdata login
+    url: common/sop028-azdata-login
+  - title: SOP033 - azdata logout
+    url: common/sop033-azdata-logout
+  - title: SOP005 - az login
+    url: common/sop005-az-login
+  - title: SOP006 - az logout
+    url: common/sop006-az-logout
+  - title: SOP007 - Version information (azdata, bdc, kubernetes)
+    url: common/sop007-get-key-version-information
+  - title: SOP011 - Set kubernetes configuration context
+    url: common/sop011-set-kubernetes-context
+  - title: SOP034 - Wait for BDC to be Healthy
+    url: common/sop034-wait-cluster-healthly
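
The `url` values above resolve against the book's `content/` folder (note the toc also references `troubleshooters/` and `sample/` chapters that are not in this commit's file list). As a minimal sketch, assuming the `CU1/Public` layout shown above and that PyYAML is available, the following illustrative helper (`check_toc.py` is not part of this commit) confirms that each `url` maps to a `.md` or `.ipynb` file:

```python
# check_toc.py -- illustrative helper: verify toc.yml urls resolve to content files.
import pathlib
import yaml

ROOT = pathlib.Path("Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public")

def urls(entries):
    """Yield every url in the toc, recursing into nested sections."""
    for entry in entries:
        if "url" in entry:
            yield entry["url"]
        yield from urls(entry.get("sections", []))

toc = yaml.safe_load((ROOT / "_data" / "toc.yml").read_text())

for url in urls(toc):
    target = ROOT / "content" / url.lstrip("/")
    if not (target.with_suffix(".md").exists() or target.with_suffix(".ipynb").exists()):
        print(f"MISSING: {url}")
```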

+ 21 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/readme.md

@@ -0,0 +1,21 @@
+# A set of notebooks used for common scenarios
+
+- The notebooks in this chapter are prerequisites for other notebooks, covering tasks such as logging in to and out of a cluster.
+
+[Home](../readme.md)
+
+## Notebooks in this Chapter
+- [SOP028 - azdata login](sop028-azdata-login.ipynb)
+
+- [SOP033 - azdata logout](sop033-azdata-logout.ipynb)
+
+- [SOP005 - az login](sop005-az-login.ipynb)
+
+- [SOP006 - az logout](sop006-az-logout.ipynb)
+
+- [SOP007 - Version information (azdata, bdc, kubernetes)](sop007-get-key-version-information.ipynb)
+
+- [SOP011 - Set kubernetes configuration context](sop011-set-kubernetes-context.ipynb)
+
+- [SOP034 - Wait for BDC to be Healthy](sop034-wait-cluster-healthly.ipynb)
+
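
Since these SOPs are prerequisites, other notebooks can chain them before their own steps. A minimal sketch using the shared `run` helper defined in each notebook's "Common functions" cell (shown in full in SOP005 below); the `--path` and `--output-path` flags are assumptions about `azdata notebook run`, not verified against this release:

```python
# Illustrative: run the login SOP before a dependent notebook.
# `run` streams output and special-cases `azdata notebook run` to
# hyperlink the resulting .ipynb file (see the helper below).
# The flags shown here are assumed, not confirmed by this commit.
run('azdata notebook run --path "sop028-azdata-login.ipynb" --output-path .')
```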

+ 335 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop005-az-login.ipynb

@@ -0,0 +1,335 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP005 - az login\n",
+                "=================\n",
+                "\n",
+                "Use the az command line interface to login to Azure.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"sop005-az-login.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Login to azure"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(\"az login\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Related\n",
+                "-------\n",
+                "\n",
+                "-   [SOP006 - az logout](../common/sop006-az-logout.ipynb)"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}
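
For reference, a short sketch of how the `run` helper above is driven. The hint entries below are illustrative values matching the shapes `run` reads (`retry_hints[exe]` is a list of stderr substrings, `error_hints[exe]` a list of `[substring, title, link]` triples, and `install_hint[exe]` a `[title, link]` pair); as committed, all three dictionaries are initialized empty, so retries and hints are effectively disabled in these notebooks:

```python
# Illustrative wiring of the hint dictionaries consumed by run().
# Substrings, titles, and links below are examples, not values shipped here.
retry_hints["kubectl"] = ["A connection attempt failed"]
error_hints["kubectl"] = [["no such host",
                           "TSG010 - Get configuration contexts",
                           "../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb"]]
install_hint["kubectl"] = ["SOP036 - Install kubectl command line interface",
                           "../install/sop036-install-kubectl.ipynb"]

run("kubectl get nodes")                              # stream output; SystemExit on non-zero exit
nodes = run("kubectl get nodes", return_output=True)  # capture stdout as a string instead
```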

+ 335 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop006-az-logout.ipynb

@@ -0,0 +1,335 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP006 - az logout\n",
+                "==================\n",
+                "\n",
+                "Use the az command line interface to logout of Azure.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"sop006-az-logout.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Logout of azure"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(\"az logout\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Related\n",
+                "-------\n",
+                "\n",
+                "-   [SOP005 - az login](../common/sop005-az-login.ipynb)"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}

File diff suppressed because it is too large
+ 291 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop007-get-key-version-information.ipynb


File diff suppressed because it is too large
+ 321 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop011-set-kubernetes-context.ipynb


File diff suppressed because it is too large
+ 293 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop028-azdata-login.ipynb


+ 344 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop033-azdata-logout.ipynb

@@ -0,0 +1,344 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP033 - azdata logout\n",
+                "======================\n",
+                "\n",
+                "Use the azdata command line interface to logout of a Big Data Cluster.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"sop033-azdata-logout.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'azdata': ['Endpoint sql-server-master does not exist', 'Endpoint livy does not exist', 'Failed to get state for cluster', 'Endpoint webhdfs does not exist', 'Adaptive Server is unavailable or does not exist', 'Error: Address already in use']}\n",
+                "error_hints = {'azdata': [['azdata login', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['The token is expired', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Reason: Unauthorized', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Max retries exceeded with url: /api/v1/bdc/endpoints', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Look at the controller logs for more details', 'TSG027 - Observe cluster deployment', '../diagnose/tsg027-observe-bdc-create.ipynb'], ['provided port is already allocated', 'TSG062 - Get tail of all previous container logs for pods in BDC namespace', '../log-files/tsg062-tail-bdc-previous-container-logs.ipynb'], ['Create cluster failed since the existing namespace', 'SOP061 - Delete a big data cluster', '../install/sop061-delete-bdc.ipynb'], ['Failed to complete kube config setup', 'TSG067 - Failed to complete kube config setup', '../repair/tsg067-failed-to-complete-kube-config-setup.ipynb'], ['Error processing command: \"ApiError', 'TSG110 - Azdata returns ApiError', '../repair/tsg110-azdata-returns-apierror.ipynb'], ['Error processing command: \"ControllerError', 'TSG036 - Controller logs', '../log-analyzers/tsg036-get-controller-logs.ipynb'], ['ERROR: 500', 'TSG046 - Knox gateway logs', '../log-analyzers/tsg046-get-knox-logs.ipynb'], ['Data source name not found and no default driver specified', 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], [\"Can't open lib 'ODBC Driver 17 for SQL Server\", 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], ['Control plane upgrade failed. Failed to upgrade controller.', 'TSG108 - View the controller upgrade config map', '../diagnose/tsg108-controller-failed-to-upgrade.ipynb']]}\n",
+                "install_hint = {'azdata': ['SOP055 - Install azdata command line interface', '../install/sop055-install-azdata.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Use azdata to log out"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('azdata logout')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Related\n",
+                "-------\n",
+                "\n",
+                "-   [SOP028 - azdata login](../common/sop028-azdata-login.ipynb)"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}

+ 213 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/common/sop034-wait-cluster-healthly.ipynb

@@ -0,0 +1,213 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP034 - Wait for BDC to be Healthy\n",
+                "===================================\n",
+                "\n",
+                "Blocks until the Big Data Cluster is healthy, or the specified timeout\n",
+                "expires.\n",
+                "\n",
+                "The min\\_pod\\_count parameter indicates that the health check will not\n",
+                "pass until at least this number of pods exists in the cluster. If any\n",
+                "existing pods beyond this limit are unhealthy, the cluster is not\n",
+                "healthy.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "timeout = 600  # amount of time to wait before cluster is healthy:  default to 10 minutes\n",
+                "check_interval = 5  # amount of time between health checks - default 5 seconds\n",
+                "min_pod_count = 10  # minimum number of healthy pods required to assert health"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Wait for cluster to become healthy"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import threading\n",
+                "import time\n",
+                "import sys\n",
+                "\n",
+                "isRunning = True\n",
+                "\n",
+                "def all_containers_ready(pod):\n",
+                "    \"\"\"helper method returns true if all the containers within the given pod are ready\n",
+                "\n",
+                "    Arguments:\n",
+                "        pod {v1Pod} -- Metadata retrieved from the api call to.\n",
+                "    \"\"\"\n",
+                "    return all(map(lambda c: c.ready is True, pod.status.container_statuses))\n",
+                "\n",
+                "def pod_is_ready(pod):\n",
+                "    \"\"\"tests that the pod, and all containers are ready\n",
+                "\n",
+                "    Arguments:\n",
+                "        pod {v1Pod} -- Metadata retrieved from api call.\n",
+                "    \"\"\"\n",
+                "\n",
+                "    return pod.status.phase == \"Running\" and all_containers_ready(pod)\n",
+                "\n",
+                "def waitReady():\n",
+                "    \"\"\"Waits for all pods, and containers to become ready.\n",
+                "    \"\"\"\n",
+                "\n",
+                "    while isRunning:\n",
+                "        try:\n",
+                "            pods = None\n",
+                "\n",
+                "            if namespace is not None:\n",
+                "                display(\"Checking namespace {0}\".format(namespace))\n",
+                "                pods = api.list_namespaced_pod(namespace, _request_timeout=30) \n",
+                "            else:\n",
+                "                display(\"Checking all namespaces\".format(namespace))\n",
+                "                pods = api.list_pod_for_all_namespaces(_request_timeout=30)\n",
+                "\n",
+                "            allReady = len(pods.items) > min_pod_count and all(map(pod_is_ready, pods.items))\n",
+                "\n",
+                "            if allReady:\n",
+                "                cluster_healthy = True\n",
+                "                return True\n",
+                "            else:\n",
+                "                display(\"cluster not healthy, rechecking in {0} seconds.\".format(check_interval))\n",
+                "\n",
+                "            time.sleep(check_interval)\n",
+                "        except:\n",
+                "            last_error_message = str(sys.exc_info())\n",
+                "            display(last_error_message)\n",
+                "            time.sleep(check_interval)\n",
+                "\n",
+                "mt = threading.Thread(target=waitReady)\n",
+                "mt.start()\n",
+                "mt.join(timeout=timeout)\n",
+                "\n",
+                "if mt.isAlive():\n",
+                "    raise SystemExit(\"Timeout waiting for pods to become ready.\")\n",
+                "else:\n",
+                "    display(\"Cluster is healthy\")\n",
+                "\n",
+                "isRunning = False"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 29 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/readme.md

@@ -0,0 +1,29 @@
+# Diagnose notebooks
+
+- A collection of notebooks for diagnosing situations and states with a Big Data Cluster.
+
+[Home](../readme.md)
+
+## Notebooks in this Chapter
+- [TSG027 - Observe cluster deployment](tsg027-observe-bdc-create.ipynb)
+
+- [TSG078 - Is cluster healthy](tsg078-is-cluster-healthy.ipynb)
+
+- [TSG029 - Find dumps in the cluster](tsg029-find-dumps-in-the-cluster.ipynb)
+
+- [TSG032 - CPU and Memory usage for all containers](tsg032-get-cpu-and-memory-for-all-containers.ipynb)
+
+- [TSG060 - Persistent Volume disk space for all BDC PVCs](tsg060-get-disk-space-for-all-pvcs.ipynb)
+
+- [TSG087 - Use hadoop fs CLI on nmnode pod](tsg087-use-hadoop-fs.ipynb)
+
+- [TSG037 - Determine master pool pod hosting primary replica](tsg037-determine-primary-master-replica.ipynb)
+
+- [TSG055 - Time Curl to Sparkhead](tsg055-time-curl-to-sparkhead.ipynb)
+
+- [TSG079 - Generate `controller` core dump](tsg079-generate-controller-core-dump.ipynb)
+
+- [TSG086 - Run `top` in all containers](tsg086-run-top-for-all-containers.ipynb)
+
+- [TSG108 - View the controller upgrade config map](tsg108-controller-failed-to-upgrade.ipynb)
+

File diff suppressed because it is too large
+ 317 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg027-observe-bdc-create.ipynb


+ 169 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg029-find-dumps-in-the-cluster.ipynb

@@ -0,0 +1,169 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG029 - Find dumps in the cluster\n",
+                "==================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Look for coredumps and minidumps from processes like SQL Server or\n",
+                "controller in a big data cluster.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get all relevant pods"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "pod_list = api.list_namespaced_pod(namespace, label_selector='app in (compute-0, data-0, storage-0, master, controller, controldb)', field_selector='status.phase==Running')\n",
+                "pod_names = [pod.metadata.name for pod in pod_list.items]\n",
+                "print('Scanning pods: ' + ', '.join(pod_names))\n",
+                "\n",
+                "command = 'find /var/opt /var/log | grep -E \"core\\\\.sqlservr|core\\\\.controller|SQLD|\\\\.mdmp$|\\\\.dmp$|\\\\.gdmp$\"'\n",
+                "all_dumps = ''\n",
+                "\n",
+                "for name in pod_names:\n",
+                "    print('Searching pod: ' + name)\n",
+                "    container = 'mssql-server'\n",
+                "    if 'control-' in name:\n",
+                "        container = 'controller'\n",
+                "\n",
+                "    try:\n",
+                "        dumps=stream(api.connect_get_namespaced_pod_exec, name, namespace, command=['/bin/sh', '-c', command], container=container, stderr=True, stdout=True)\n",
+                "    except Exception as e:\n",
+                "        print(f'Unable to connect to pod: {name} due to {str(e.__class__)}. Skipping dump check for this pod...')\n",
+                "    else:\n",
+                "        if dumps:\n",
+                "            all_dumps += '*Pod: ' + name + '*\\n'\n",
+                "            all_dumps += dumps + '\\n'"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Validate\n",
+                "\n",
+                "Validate no dump files were found."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "if len(all_dumps) > 0:\n",
+                "    raise SystemExit('FAIL - dump files found:\\n' + all_dumps)\n",
+                "\n",
+                "print('SUCCESS - no dump files were found.')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 147 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg032-get-cpu-and-memory-for-all-containers.ipynb

@@ -0,0 +1,147 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG032 - CPU and Memory usage for all containers\n",
+                "================================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get per process usage stats"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "cmd = \"\"\"echo \"CPU %\\t MEM %\\t MEM\\t PROCESS\" &&\n",
+                "ps aux |\n",
+                "awk '\n",
+                "    {mem[$11] += int($6/1024)};\n",
+                "    {cpuper[$11] += $3};\n",
+                "    {memper[$11] += $4};\n",
+                "END {\n",
+                "    for (i in mem) {\n",
+                "        print cpuper[i] \"%\\t\", memper[i] \"%\\t\", mem[i] \"MB\\t\", i\n",
+                "    }\n",
+                "}' |\n",
+                "sort -k3nr\n",
+                "\"\"\"\n",
+                "\n",
+                "pod_list = api.list_namespaced_pod(namespace)\n",
+                "pod_names = [pod.metadata.name for pod in pod_list.items]\n",
+                "\n",
+                "for pod in pod_list.items:\n",
+                "    container_names = [container.name for container in pod.spec.containers]\n",
+                "\n",
+                "    for container in container_names:\n",
+                "        print (f\"CONTAINER: {container} / POD: {pod.metadata.name}\")\n",
+                "        try:\n",
+                "            print(stream(api.connect_get_namespaced_pod_exec, pod.metadata.name, namespace, command=['/bin/sh', '-c', cmd], container=container, stderr=True, stdout=True))\n",
+                "        except Exception:\n",
+                "            print (f\"Failed to get CPU/Memory for container: {container} in POD: {pod.metadata.name}\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

File diff suppressed because it is too large
+ 319 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg037-determine-primary-master-replica.ipynb


+ 469 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg055-time-curl-to-sparkhead.ipynb

@@ -0,0 +1,469 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG055 - Time Curl to Sparkhead\n",
+                "===============================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "If `azdata bdc status show` fails with:\n",
+                "\n",
+                "> StatefulSet sparkhead is not healthy: {{Pod sparkhead-0 is not\n",
+                "> healthy: {Container hadoop-livy-sparkhistory is unhealthy: {Found\n",
+                "> error properties: {Property: sparkhistory.readiness, Details: \u2018Timed\n",
+                "> out getting health status after 5000 milliseconds.\u2019}}}}}: unhealthy\n",
+                "> Pod sparkhead-0 is not healthy: {Container hadoop-livy-sparkhistory is\n",
+                "> unhealthy: {Found error properties: {Property: sparkhistory.readiness,\n",
+                "> Details: \u2018Timed out getting health status after 5000\n",
+                "> milliseconds.\u2019}}}: unhealthy spark: unhealthy\" StatefulSet sparkhead\n",
+                "> is not healthy: {{Pod sparkhead-0 is not healthy: {Container\n",
+                "> hadoop-livy-sparkhistory is unhealthy: {Found error properties:\n",
+                "> {Property: sparkhistory.readiness, Details: \u2018Timed out getting health\n",
+                "> status after 5000 milliseconds.\u2019}}}}}: unhealthy Pod sparkhead-0 is\n",
+                "> not healthy: {Container hadoop-livy-sparkhistory is unhealthy: {Found\n",
+                "> error properties: {Property: sparkhistory.readiness, Details: \u2018Timed\n",
+                "> out getting health status after 5000 milliseconds.\u2019}}}: unhealthy\n",
+                "\n",
+                "It can be a useful diagnosis step to understand what the Curl response\n",
+                "time is from the `controller` pod to the `sparkhead` pod.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg055-time-curl-to-sparkhead.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get name of the \u2018Running\u2019 `controller` `pod`"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place the name  of the 'Running' controller pod in variable `controller`\n",
+                "\n",
+                "controller = run(f'kubectl get pod --selector=app=controller -n {namespace} -o jsonpath={{.items[0].metadata.name}} --field-selector=status.phase=Running', return_output=True)\n",
+                "\n",
+                "print(f\"Controller pod name: {controller}\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Time `curl` in `controller` `pod` to `sparkhead`"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl exec {controller} -n {namespace} -c controller -- bash -c \"time curl --cacert /run/secrets/certificates/rootca/cluster-ca-certificate.crt https://sparkhead-svc:18480\"')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false,
+            "expert": {
+                "rules": [
+                    [
+                        "TSG078",
+                        "code",
+                        "stream",
+                        "name",
+                        "stdout",
+                        "text",
+                        ".*StatefulSet sparkhead is not healthy.*Timed out getting health status"
+                    ]
+                ]
+            }
+        }
+    }
+}

+ 441 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg060-get-disk-space-for-all-pvcs.ipynb

@@ -0,0 +1,441 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG060 - Persistent Volume disk space for all BDC PVCs\n",
+                "======================================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Connect to each container and get the disk space used/available for each\n",
+                "Persisted Volume (PV) mapped to each Persisted Volume Claim (PVC) of a\n",
+                "Big Data Cluster (BDC)\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg060-get-disk-space-for-all-pvcs.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Connect to each container that mounts a PVC and run the `df` linux command line tool\n",
+                "\n",
+                "For each pod:\n",
+                "\n",
+                "1.  Get the claim\\_names from the volumes which have a PVC\n",
+                "2.  Join that to the containers who volume\\_mount that claim\\_name\n",
+                "3.  Get the \u2018mount\\_path\u2019 from the \u2018volume\\_mount\u2019\n",
+                "4.  Exec into the container and run the \u2018df\u2019 tool.\n",
+                "\n",
+                "This technique seems to work across kubeadm and AKS, but does require\n",
+                "\u2018kubectl exec\u2019 into each container (which requires permission and some\n",
+                "time)."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "for pod in pods.items:\n",
+                "    for volume in pod.spec.volumes:\n",
+                "        if volume.persistent_volume_claim is not None:\n",
+                "            for container in pod.spec.containers:\n",
+                "                for volume_mount in container.volume_mounts:\n",
+                "                    if volume_mount.name == volume.name:\n",
+                "                        pvc = api.read_namespaced_persistent_volume_claim(name=volume.persistent_volume_claim.claim_name, namespace=namespace)\n",
+                "                        print (f\"Disk Space for {pod.metadata.name}/{container.name} PVC: {volume.persistent_volume_claim.claim_name} bound to PV: {pvc.spec.volume_name} ({pvc.status.capacity}) Storage Class: {pvc.spec.storage_class_name}\")\n",
+                "                        try:\n",
+                "                            output=stream(api.connect_get_namespaced_pod_exec, pod.metadata.name, namespace, container=container.name, command=['/bin/sh', '-c', f'df {volume_mount.mount_path} -h'], stderr=True, stdout=True)\n",
+                "                        except Exception as err:\n",
+                "                            print(err)\n",
+                "                        else:\n",
+                "                            print(output)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

File diff suppressed because it is too large
+ 293 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg078-is-cluster-healthy.ipynb


File diff suppressed because it is too large
+ 293 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg079-generate-controller-core-dump.ipynb


+ 135 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg086-run-top-for-all-containers.ipynb

@@ -0,0 +1,135 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG086 - Run `top` in all containers\n",
+                "====================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Run top in each container"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "cmd = \"top -b -n 1\"\n",
+                "\n",
+                "pod_list = api.list_namespaced_pod(namespace)\n",
+                "pod_names = [pod.metadata.name for pod in pod_list.items]\n",
+                "\n",
+                "for pod in pod_list.items:\n",
+                "    container_names = [container.name for container in pod.spec.containers]\n",
+                "\n",
+                "    for container in container_names:\n",
+                "        print (f\"CONTAINER: {container} / POD: {pod.metadata.name}\")\n",
+                "        try:\n",
+                "            print(stream(api.connect_get_namespaced_pod_exec, pod.metadata.name, namespace, command=['/bin/sh', '-c', cmd], container=container, stderr=True, stdout=True))\n",
+                "        except Exception:\n",
+                "            print (f\"Failed to get run 'top' for container: {container} in pod: {pod.metadata.name}\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 414 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg087-use-hadoop-fs.ipynb

@@ -0,0 +1,414 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG087 - Use hadoop fs CLI on nmnode pod\n",
+                "========================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Connect directly to the namenode and use the comprehensive `hadoop fs`\n",
+                "CLI\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg087-use-hadoop-fs.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Connect to the namenode pod and run hadoop fs CLI"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl exec nmnode-0-0 -n {namespace} -c hadoop -- hadoop fs -ls /')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 417 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/diagnose/tsg108-controller-failed-to-upgrade.ipynb

@@ -0,0 +1,417 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG108 - View the controller upgrade config map\n",
+                "===============================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "When running a Big Data Cluster upgrade using `azdata bdc upgrade`:\n",
+                "\n",
+                "`azdata bdc upgrade --name <namespace> --tag <tag>`\n",
+                "\n",
+                "It may fail with:\n",
+                "\n",
+                "> Upgrading cluster to version 15.0.4003.10029\\_2\n",
+                ">\n",
+                "> NOTE: Cluster upgrade can take a significant amount of time depending\n",
+                "> on configuration, network speed, and the number of nodes in the\n",
+                "> cluster.\n",
+                ">\n",
+                "> Upgrading Control Plane. Control plane upgrade failed. Failed to\n",
+                "> upgrade controller.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "Use these steps to troubelshoot the problem.\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg108-controller-failed-to-upgrade.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Kubernetes namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster use the kubectl command line\n",
+                "interface .\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = run(f'kubectl get namespace --selector=MSSQL_CLUSTER -o jsonpath={{.items[0].metadata.name}}', return_output=True)\n",
+                "except:\n",
+                "    from IPython.display import Markdown\n",
+                "    print(f\"ERROR: Unable to find a Kubernetes namespace with label 'MSSQL_CLUSTER'.  SQL Server Big Data Cluster Kubernetes namespaces contain the label 'MSSQL_CLUSTER'.\")\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "else:\n",
+                "    print(f'The SQL Server Big Data Cluster Kubernetes namespace is: {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### View the upgrade configmap"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl get configmap -n {namespace} controller-upgrade-configmap -o yaml')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Related\n",
+                "-------\n",
+                "\n",
+                "-   [TSG109 - Set upgrade\n",
+                "    timeouts](../repair/tsg109-upgrade-stalled.ipynb)"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false,
+            "expert": {
+                "rules": [
+                    [
+                        5,
+                        "../repair/tsg109-upgrade-stalled.ipynb",
+                        "code",
+                        "stream",
+                        "name",
+                        "stdout",
+                        "text",
+                        ".\\*upgrade has timed out",
+                        0
+                    ]
+                ]
+            }
+        }
+    }
+}

+ 33 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/readme.md

@@ -0,0 +1,33 @@
+# Installation notebooks
+
+- A set of notebooks used for installing and uninstalling command line tools and packages needed to manage SQL Server Big Data Clusters.
+
+[Home](../readme.md)
+
+## Notebooks in this Chapter
+- [SOP036 - Install kubectl command line interface](sop036-install-kubectl.ipynb)
+
+- [SOP037 - Uninstall kubectl command line interface](sop037-uninstall-kubectl.ipynb)
+
+- [SOP059 - Install Kubernetes Python module](sop059-install-kubernetes-module.ipynb)
+
+- [SOP060 - Uninstall kubernetes module](sop060-uninstall-kubernetes-module.ipynb)
+
+- [SOP062 - Install ipython-sql and pyodbc modules](sop062-install-ipython-sql-module.ipynb)
+
+- [SOP055 - Install azdata command line interface](sop055-install-azdata.ipynb)
+
+- [SOP054 - Uninstall azdata command line interface](sop054-uninstall-azdata.ipynb)
+
+- [SOP038 - Install azure command line interface](sop038-install-az.ipynb)
+
+- [SOP039 - Uninstall azure command line interface](sop039-uninstall-az.ipynb)
+
+- [SOP040 - Upgrade pip in ADS Python sandbox](sop040-upgrade-pip.ipynb)
+
+- [SOP069 - Install ODBC for SQL Server](sop069-install-odbc-driver-for-sql-server.ipynb)
+
+- [SOP008 - Install unixodbc for Mac](sop008-brew-install-odbc-for-sql-server.ipynb)
+
+- [SOP010 - Upgrade a big data cluster](sop010-upgrade-bdc.ipynb)

+ 332 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop008-brew-install-odbc-for-sql-server.ipynb

@@ -0,0 +1,332 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP008 - Install unixodbc for Mac\n",
+                "=================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "`azdata` may fail to install on Mac with the following error.\n",
+                "\n",
+                "> ERROR:\n",
+                "> dlopen(/Users/user/.local/lib/python3.6/site-packages/pyodbc.cpython-36m-darwin.so,\n",
+                "> 2): Library not loaded: /usr/local/opt/unixodbc/lib/libodbc.2.dylib\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"sop008-brew-install-odbc-for-sql-server.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Install `unixodbc`"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('brew install unixodbc')"
+            ]
+        },
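+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Verify the `unixodbc` install\n",
+                "\n",
+                "As an optional sanity check (a sketch assuming Homebrew placed the\n",
+                "`odbcinst` tool on the PATH), list the ODBC configuration locations:"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('odbcinst -j')"
+            ]
+        },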
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

File diff suppressed because it is too large
+ 318 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop010-upgrade-bdc.ipynb


+ 351 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop036-install-kubectl.ipynb

@@ -0,0 +1,351 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP036 - Install kubectl command line interface\n",
+                "===============================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"sop036-install-kubectl.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Install Kubernetes CLI\n",
+                "\n",
+                "To get the latest version number for `kubectl` for Windows, open this\n",
+                "file:\n",
+                "\n",
+                "-   https://storage.googleapis.com/kubernetes-release/release/stable.txt\n",
+                "\n",
+                "NOTE: For Windows, `kubectl.exe` is installed in the folder containing\n",
+                "the `python.exe` (`sys.executable`), which will be in the path for\n",
+                "notebooks run in ADS."
+            ]
+        },
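+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "As a sketch (assuming outbound HTTPS access from the notebook), the\n",
+                "version string in that file can also be read programmatically:"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import urllib.request\n",
+                "\n",
+                "# Print the latest stable kubectl version string (e.g. 'v1.17.0')\n",
+                "with urllib.request.urlopen('https://storage.googleapis.com/kubernetes-release/release/stable.txt') as response:\n",
+                "    print(response.read().decode('utf-8').strip())"
+            ]
+        },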
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import os\n",
+                "import sys\n",
+                "import platform\n",
+                "\n",
+                "from pathlib import Path\n",
+                "\n",
+                "if platform.system() == \"Darwin\":\n",
+                "    run('brew update')\n",
+                "    run('brew install kubernetes-cli')\n",
+                "elif platform.system() == \"Windows\":\n",
+                "    path = Path(sys.executable)\n",
+                "    cwd = os.getcwd()\n",
+                "    os.chdir(path.parent)\n",
+                "    run('curl -L https://storage.googleapis.com/kubernetes-release/release/v1.17.0/bin/windows/amd64/kubectl.exe -o kubectl.exe')\n",
+                "    os.chdir(cwd)\n",
+                "elif platform.system() == \"Linux\":\n",
+                "    run('sudo apt-get update')\n",
+                "    run('sudo apt-get install -y kubectl')\n",
+                "else:\n",
+                "    raise SystemExit(f\"Platform '{platform.system()}' is not recognized, must be 'Darwin', 'Windows' or 'Linux'\")"
+            ]
+        },
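+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Verify the `kubectl` install\n",
+                "\n",
+                "As an optional check (assuming the install above succeeded), print the\n",
+                "client version:"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('kubectl version --client')"
+            ]
+        },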
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}

+ 344 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop037-uninstall-kubectl.ipynb

@@ -0,0 +1,344 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP037 - Uninstall kubectl command line interface\n",
+                "=================================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"sop037-uninstall-kubectl.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Uninstall Kubernetes CLI\n",
+                "\n",
+                "NOTE: For Windows, `kubectl.exe` was installed in the folder containing\n",
+                "the `python.exe` (`sys.executable`), it will be removed from this\n",
+                "folder."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import os\n",
+                "import sys\n",
+                "import platform\n",
+                "\n",
+                "from pathlib import Path\n",
+                "\n",
+                "if platform.system() == \"Darwin\":\n",
+                "    run('brew uninstall kubernetes-cli')\n",
+                "elif platform.system() == \"Windows\":\n",
+                "    path = Path(sys.executable)\n",
+                "    cwd = os.getcwd()\n",
+                "    os.chdir(path.parent)\n",
+                "    run('cmd /k del kubectl.exe')\n",
+                "    os.chdir(cwd)\n",
+                "elif platform.system() == \"Linux\":\n",
+                "    run('sudo apt-get uninstall -y kubectl')\n",
+                "else:\n",
+                "    raise SystemExit(f\"Platform '{platform.system()}' is not recognized, must be 'Darwin', 'Windows' or 'Linux'\")"
+            ]
+        },
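+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Verify the removal\n",
+                "\n",
+                "A minimal check (assuming no other `kubectl` installation exists on this\n",
+                "machine) that the binary is no longer resolvable on the PATH:"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import shutil\n",
+                "\n",
+                "location = shutil.which('kubectl')\n",
+                "\n",
+                "if location is None:\n",
+                "    print('kubectl is no longer on the PATH.')\n",
+                "else:\n",
+                "    print(f'kubectl is still resolvable at: {location}')"
+            ]
+        },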
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}

+ 341 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop038-install-az.ipynb

@@ -0,0 +1,341 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP038 - Install azure command line interface\n",
+                "=============================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"sop038-install-az.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'python': []}\n",
+                "error_hints = {'python': [['Library not loaded: /usr/local/opt/unixodbc', 'SOP008 - Backup HDFS files to Azure Data Lake Store Gen2 with distcp', '../common/sop008-distcp-backup-to-adl-gen2.ipynb'], ['WARNING: You are using pip version', 'SOP040 - Upgrade pip in ADS Python sandbox', '../install/sop040-upgrade-pip.ipynb']]}\n",
+                "install_hint = {'python': []}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Install az CLI"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(\"python --version\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('python -m pip install -m pip install azure-cli')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}
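
The `run` helper above retries a command when one of the binary-specific `retry_hints` strings appears on stderr, recursing with an incremented `retry_count` until `MAX_RETRIES` is reached. A minimal sketch of that pattern, assuming a hypothetical "connection refused" hint for `kubectl` (the shipped notebooks populate `retry_hints` per binary; here `python` has none):

```python
# Minimal sketch of the retry-on-transient-fault pattern in `run` above.
# The kubectl "connection refused" hint is a hypothetical example.
import subprocess

MAX_RETRIES = 5
retry_hints = {"kubectl": ["connection refused"]}  # hypothetical hint

def run_with_retry(cmd, retry_count=0):
    exe = cmd.split()[0].lower()
    p = subprocess.run(cmd.split(), capture_output=True, text=True)
    for hint in retry_hints.get(exe, []):
        if hint in p.stderr and retry_count < MAX_RETRIES:
            print(f"RETRY: {retry_count} (due to: {hint})")
            return run_with_retry(cmd, retry_count + 1)
    if p.returncode != 0:
        raise SystemExit(f"'{cmd}' returned non-zero exit code: {p.returncode}")
    return p.stdout
```

Recursing rather than looping lets every retry reuse the full pipeline (hint matching, expert rules, timing), which is why the notebooks' `run` calls itself with the incremented `retry_count`.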

+ 332 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop039-uninstall-az.ipynb

@@ -0,0 +1,332 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP039 - Uninstall azure command line interface\n",
+                "===============================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"sop039-uninstall-az.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'python': []}\n",
+                "error_hints = {'python': [['Library not loaded: /usr/local/opt/unixodbc', 'SOP008 - Backup HDFS files to Azure Data Lake Store Gen2 with distcp', '../common/sop008-distcp-backup-to-adl-gen2.ipynb'], ['WARNING: You are using pip version', 'SOP040 - Upgrade pip in ADS Python sandbox', '../install/sop040-upgrade-pip.ipynb']]}\n",
+                "install_hint = {'python': []}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Uninstall az CLI"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('python -m pip uninstall azure-cli --yes')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}
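
Each notebook ships "expert rules" in its `azdata` metadata; `apply_expert_rules` above matches every output line against `rule[7]` (a regex) and, on a match, links `rule[1]` (a relative notebook path) as a hint. A minimal sketch with an illustrative rule pointing at TSG029 (the rule values below are made up for illustration; real rules are injected into the notebook metadata by tooling):

```python
# Minimal sketch of the rule matching in apply_expert_rules above.
# The rule contents are illustrative, not taken from shipped metadata.
import re

rules = [
    # [priority, notebook, cell_type, output_type, output_type_name,
    #  output_type_value, details_name, expression, <injected marker>]
    [0, "../diagnose/tsg029-find-dumps-in-the-cluster.ipynb", "code", "stream",
     "name", "stdout", "text", ".*core dumped.*", True],
]

def apply_expert_rules(line):
    for rule in rules:
        # Only 9-element (injected/output) rules carry a resolved notebook path
        if len(rule) == 9 and re.match(rule[7], line, re.DOTALL):
            print(f"HINT: Use {rule[1]} to resolve this issue.")

apply_expert_rules("Segmentation fault (core dumped)")  # prints the TSG029 hint
```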

+ 334 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop040-upgrade-pip.ipynb

@@ -0,0 +1,334 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP040 - Upgrade pip in ADS Python sandbox\n",
+                "==========================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"sop040-upgrade-pip.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'python': []}\n",
+                "error_hints = {'python': [['Library not loaded: /usr/local/opt/unixodbc', 'SOP008 - Backup HDFS files to Azure Data Lake Store Gen2 with distcp', '../common/sop008-distcp-backup-to-adl-gen2.ipynb'], ['WARNING: You are using pip version', 'SOP040 - Upgrade pip in ADS Python sandbox', '../install/sop040-upgrade-pip.ipynb']]}\n",
+                "install_hint = {'python': []}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Upgrade pip"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import sys\n",
+                "\n",
+                "run(f'python -m pip install --upgrade pip')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}
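
`load_rules` above relies on a `.ipynb` file being plain JSON: the notebook re-opens its own file by name and digs the rules out of `metadata.azdata.expert.rules`, which is why renaming the file silently disables expert hints. A minimal sketch of that lookup, assuming a hypothetical file name `example.ipynb`:

```python
# Minimal sketch of load_rules above: a .ipynb file is JSON, so a notebook
# can re-read its own file and pull metadata/azdata/expert/rules out of it.
# "example.ipynb" is a hypothetical file name for illustration.
import json

def load_rules(filename):
    try:
        with open(filename, encoding="utf8") as f:
            j = json.load(f)
    except OSError:
        return None  # notebook was renamed; we can't find ourselves
    rules = (j.get("metadata", {})
              .get("azdata", {})
              .get("expert", {})
              .get("rules"))
    if rules is not None:
        rules.sort()  # priority order: element [0], lowest value first
    return rules

print(load_rules("example.ipynb"))
```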

+ 352 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop054-uninstall-azdata.ipynb

@@ -0,0 +1,352 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP054 - Uninstall azdata command line interface\n",
+                "================================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"sop054-uninstall-azdata.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
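+                "            # Hypothetical example of an injected 9-element rule, based on the element comments above\n",
+                "            # (element [0] is the sort priority; the final element is omitted here):\n",
+                "            #   [200, '../repair/tsg029-nb-name.ipynb', 'code', 'stream', 'name', 'stdout', 'text', 'ERROR:.*', ...]\n",
+                "\n",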
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{3}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'python': []}\n",
+                "error_hints = {'python': [['Library not loaded: /usr/local/opt/unixodbc', 'SOP008 - Backup HDFS files to Azure Data Lake Store Gen2 with distcp', '../common/sop008-distcp-backup-to-adl-gen2.ipynb'], ['WARNING: You are using pip version', 'SOP040 - Upgrade pip in ADS Python sandbox', '../install/sop040-upgrade-pip.ipynb']]}\n",
+                "install_hint = {'python': []}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Uninstall azdata CLI"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import sys\n",
+                "\n",
+                "run('python -m pip uninstall -r https://aka.ms/azdata -y')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Pip list\n",
+                "\n",
+                "Verify that no azdata modules remain in the list."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('python -m pip list')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}

File diff suppressed because it is too large
+ 291 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop055-install-azdata.ipynb


+ 350 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop059-install-kubernetes-module.ipynb

@@ -0,0 +1,350 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP059 - Install Kubernetes Python module\n",
+                "=========================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
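+                "    # Illustrative example (not from the original notebook): shlex.split('kubectl -n \"my ns\" get pods')\n",
+                "    # yields ['kubectl', '-n', 'my ns', 'get', 'pods'], keeping the quoted, space-containing value intact.\n",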
+                "\n",
+                "    # Store this (e.g. kubectl, python) to support binary-context-aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
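+                "        # e.g. (illustrative): 'python -m pip list' becomes '<sys.executable> -m pip list'\n",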
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportability, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens; it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path-based location (shutil.which) of the executable that will be run (and display it to aid supportability). This\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd (otherwise Popen raises FileNotFoundError).\n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary is None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary is None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which cause Jupyter to hang forever; to\n",
+                "    # work around this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around an infinite hang when a notebook generates a non-zero return code: break out and do not wait\n",
+                "    #\n",
+                "    wait = True\n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\": \"(.*)\"')\n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp; don't\n",
+                "            # print an empty \"STDERR:\" line for it, as that is confusing.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: The infinite hang in the `azdata notebook run` failure case was avoided above by\n",
+                "    # inferring success from stdout output, so don't check p.returncode here if success is already known\n",
+                "    #\n",
+                "    if wait:\n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"sop059-install-kubernetes-module.ipynb\")\n",
+                "\n",
+                "    except Exception:\n",
+                "        pass # If the user has renamed the notebook, we can't load ourselves.  NOTE: Is there a way in Jupyter to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # Rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (e.g. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # e.g. stream or error\n",
+                "            output_type_name = rule[4] # e.g. ename or name\n",
+                "            output_type_value = rule[5] # e.g. SystemExit or stdout\n",
+                "            details_name = rule[6]  # e.g. evalue or text\n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # The rule expression may contain an escaped star ('\\*'); unescape it back to '*' before matching\n",
+                "\n",
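+                "            # Hypothetical example of an injected 9-element rule, based on the element comments above\n",
+                "            # (element [0] is the sort priority; the final element is omitted here):\n",
+                "            #   [200, '../repair/tsg029-nb-name.ipynb', 'code', 'stream', 'name', 'stdout', 'text', 'ERROR:.*', ...]\n",
+                "\n",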
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{3}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'python': []}\n",
+                "error_hints = {'python': [['Library not loaded: /usr/local/opt/unixodbc', 'SOP008 - Backup HDFS files to Azure Data Lake Store Gen2 with distcp', '../common/sop008-distcp-backup-to-adl-gen2.ipynb'], ['WARNING: You are using pip version', 'SOP040 - Upgrade pip in ADS Python sandbox', '../install/sop040-upgrade-pip.ipynb']]}\n",
+                "install_hint = {'python': []}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Pip install the kubernetes module"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import sys\n",
+                "\n",
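+                "# Note: run() passes the command through shlex.split to Popen without a shell, so\n",
+                "# 'kubernetes>=10.0.0' stays a single argument and '>' is not treated as shell redirection.\n",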
+                "run('python -m pip install kubernetes>=10.0.0')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Pip list installed modules"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('python -m pip list')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}

+ 350 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop060-uninstall-kubernetes-module.ipynb

@@ -0,0 +1,350 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP060 - Uninstall kubernetes module\n",
+                "====================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
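+                "    # Illustrative example (not from the original notebook): shlex.split('kubectl -n \"my ns\" get pods')\n",
+                "    # yields ['kubectl', '-n', 'my ns', 'get', 'pods'], keeping the quoted, space-containing value intact.\n",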
+                "\n",
+                "    # Store this (e.g. kubectl, python) to support binary-context-aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
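+                "        # e.g. (illustrative): 'python -m pip list' becomes '<sys.executable> -m pip list'\n",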
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportability, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens; it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path-based location (shutil.which) of the executable that will be run (and display it to aid supportability). This\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd (otherwise Popen raises FileNotFoundError).\n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary is None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary is None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which cause Jupyter to hang forever; to\n",
+                "    # work around this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around an infinite hang when a notebook generates a non-zero return code: break out and do not wait\n",
+                "    #\n",
+                "    wait = True\n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\": \"(.*)\"')\n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp; don't\n",
+                "            # print an empty \"STDERR:\" line for it, as that is confusing.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: The infinite hang in the `azdata notebook run` failure case was avoided above by\n",
+                "    # inferring success from stdout output, so don't check p.returncode here if success is already known\n",
+                "    #\n",
+                "    if wait:\n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"sop060-uninstall-kubernetes-module.ipynb\")\n",
+                "\n",
+                "    except Exception:\n",
+                "        pass # If the user has renamed the notebook, we can't load ourselves.  NOTE: Is there a way in Jupyter to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # Rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (e.g. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # e.g. stream or error\n",
+                "            output_type_name = rule[4] # e.g. ename or name\n",
+                "            output_type_value = rule[5] # e.g. SystemExit or stdout\n",
+                "            details_name = rule[6]  # e.g. evalue or text\n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # The rule expression may contain an escaped star ('\\*'); unescape it back to '*' before matching\n",
+                "\n",
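+                "            # Hypothetical example of an injected 9-element rule, based on the element comments above\n",
+                "            # (element [0] is the sort priority; the final element is omitted here):\n",
+                "            #   [200, '../repair/tsg029-nb-name.ipynb', 'code', 'stream', 'name', 'stdout', 'text', 'ERROR:.*', ...]\n",
+                "\n",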
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{3}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'python': []}\n",
+                "error_hints = {'python': [['Library not loaded: /usr/local/opt/unixodbc', 'SOP008 - Backup HDFS files to Azure Data Lake Store Gen2 with distcp', '../common/sop008-distcp-backup-to-adl-gen2.ipynb'], ['WARNING: You are using pip version', 'SOP040 - Upgrade pip in ADS Python sandbox', '../install/sop040-upgrade-pip.ipynb']]}\n",
+                "install_hint = {'python': []}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Pip uninstall the kubernetes module"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import sys\n",
+                "\n",
+                "run('python -m pip uninstall kubernetes -y')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Pip list installed modules"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('python -m pip list')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}

+ 364 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop062-install-ipython-sql-module.ipynb

@@ -0,0 +1,364 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP062 - Install ipython-sql and pyodbc modules\n",
+                "===============================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
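+                "    # Illustrative example (not from the original notebook): shlex.split('kubectl -n \"my ns\" get pods')\n",
+                "    # yields ['kubectl', '-n', 'my ns', 'get', 'pods'], keeping the quoted, space-containing value intact.\n",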
+                "\n",
+                "    # Store this (e.g. kubectl, python) to support binary-context-aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
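+                "        # e.g. (illustrative): 'python -m pip list' becomes '<sys.executable> -m pip list'\n",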
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportability, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens; it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path-based location (shutil.which) of the executable that will be run (and display it to aid supportability). This\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd (otherwise Popen raises FileNotFoundError).\n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary is None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary is None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which cause Jupyter to hang forever; to\n",
+                "    # work around this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around an infinite hang when a notebook generates a non-zero return code: break out and do not wait\n",
+                "    #\n",
+                "    wait = True\n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\": \"(.*)\"')\n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp; don't\n",
+                "            # print an empty \"STDERR:\" line for it, as that is confusing.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: The infinite hang in the `azdata notebook run` failure case was avoided above by\n",
+                "    # inferring success from stdout output, so don't check p.returncode here if success is already known\n",
+                "    #\n",
+                "    if wait:\n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"sop062-install-ipython-sql-module.ipynb\")\n",
+                "\n",
+                "    except Exception:\n",
+                "        pass # If the user has renamed the notebook, we can't load ourselves.  NOTE: Is there a way in Jupyter to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'python': []}\n",
+                "error_hints = {'python': [['Library not loaded: /usr/local/opt/unixodbc', 'SOP008 - Backup HDFS files to Azure Data Lake Store Gen2 with distcp', '../common/sop008-distcp-backup-to-adl-gen2.ipynb'], ['WARNING: You are using pip version', 'SOP040 - Upgrade pip in ADS Python sandbox', '../install/sop040-upgrade-pip.ipynb']]}\n",
+                "install_hint = {'python': []}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Pip install the ipython-sql module"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'python -m pip install ipython-sql')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Pip install the pyodbc module"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'python -m pip install pyodbc')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Pip list installed modules"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'python -m pip list')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}

+ 62 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/install/sop069-install-odbc-driver-for-sql-server.ipynb

@@ -0,0 +1,62 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "SOP069 - Install ODBC for SQL Server\n",
+                "====================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Some subcommands in `azdata` require the SQL Server ODBC driver. If the\n",
+                "driver is not installed, the following error is given:\n",
+                "\n",
+                "> ERROR: Error processing command: \u201cInterfaceError\u201d (\u2018IM002\u2019, \u2018\\[IM002\\]\n",
+                "> \\[Microsoft\\]\\[ODBC Driver Manager\\] Data source name not found and no\n",
+                "> default driver specified (0) (SQLDriverConnect)\u2019)\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Install ODBC Driver 17 for SQL Server"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import platform\n",
+                "import webbrowser\n",
+                "\n",
+                "if platform.system() == \"Windows\":\n",
+                "    webbrowser.open('https://www.microsoft.com/en-us/download/details.aspx?id=56567')\n",
+                "else:\n",
+                "    webbrowser.open('https://docs.microsoft.com/en-us/sql/connect/odbc/linux-mac/installing-the-microsoft-odbc-driver-for-sql-server')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 39 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/readme.md

@@ -0,0 +1,39 @@
+# Logs notebooks
+
+- A set of notebooks to gather and analyze logs from a SQL Server Big Data Cluster.  The analysis process emits HINTs linking to follow-on TSGs for known issues found in the logs.
+
+[Home](../readme.md)
+
+## Notebooks in this Chapter
+- [TSG046 - Knox gateway logs](tsg046-get-knox-logs.ipynb)
+
+- [TSG036 - Controller logs](tsg036-get-controller-logs.ipynb)
+
+- [TSG034 - Livy logs](tsg034-get-livy-logs.ipynb)
+
+- [TSG035 - Spark History logs](tsg035-get-sparkhistory-logs.ipynb)
+
+- [TSG030 - SQL Server errorlog files](tsg030-get-errorlog-from-all-pods.ipynb)
+
+- [TSG031 - SQL Server PolyBase logs](tsg031-get-polybase-logs-for-all-pods.ipynb)
+
+- [TSG095 - Hadoop namenode logs](tsg095-get-namenode-logs.ipynb)
+
+- [TSG090 - Yarn nodemanager logs](tsg090-get-nodemanager-logs.ipynb)
+
+- [TSG088 - Hadoop datanode logs](tsg088-get-datanode-logs.ipynb)
+
+- [TSG096 - Zookeeper logs](tsg096-get-zookeeper-logs.ipynb)
+
+- [TSG073 - InfluxDB logs](tsg073-get-influxdb-logs.ipynb)
+
+- [TSG076 - Elastic Search logs](tsg076-get-elastic-search-logs.ipynb)
+
+- [TSG077 - Kibana logs](tsg077-get-kibana-logs.ipynb)
+
+- [TSG092 - Supervisord log tail for all containers in BDC](tsg092-get-all-supervisord-log-tails.ipynb)
+
+- [TSG093 - Agent log tail for all containers in BDC](tsg093-get-all-agent-log-tails.ipynb)
+
+- [TSG094 - Grafana logs](tsg094-get-grafana-logs.ipynb)
+

+ 281 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg030-get-errorlog-from-all-pods.ipynb

@@ -0,0 +1,281 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG030 - SQL Server errorlog files\n",
+                "==================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container = \"mssql-server\"\n",
+                "log_files = [ \"/var/opt/mssql/log/errorlog\" ]\n",
+                "\n",
+                "expressions_to_analyze = [\n",
+                "    re.compile(\".{35}Error:\"),\n",
+                "    re.compile(\".{35}Login failed for user '##\"),\n",
+                "    re.compile(\".{35}SqlDumpExceptionHandler\")\n",
+                "]"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 279 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg031-get-polybase-logs-for-all-pods.ipynb

@@ -0,0 +1,279 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG031 - SQL Server PolyBase logs\n",
+                "=================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container = \"mssql-server\"\n",
+                "log_files = [ \"/var/opt/mssql/log/polybase/MSSQLSERVER_*_errors.log\" ]\n",
+                "\n",
+                "expressions_to_analyze = [\n",
+                "    re.compile(\"(.*)MppSqlException\")\n",
+                "]"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 283 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg034-get-livy-logs.ipynb

@@ -0,0 +1,283 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG034 - Livy logs\n",
+                "==================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container = 'hadoop-livy-sparkhistory'\n",
+                "log_files = [ '/var/log/supervisor/log/livy*' ]\n",
+                "\n",
+                "expressions_to_analyze = [\n",
+                "    re.compile(\".{17} WARN \"),\n",
+                "    re.compile(\".{17} ERROR \")\n",
+                "]"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 283 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg035-get-sparkhistory-logs.ipynb

@@ -0,0 +1,283 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG035 - Spark History logs\n",
+                "===========================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container='hadoop-livy-sparkhistory'\n",
+                "log_files = [ \"/var/log/supervisor/log/sparkhistory*\" ]\n",
+                "\n",
+                "expressions_to_analyze = [\n",
+                "    re.compile(\".{23} WARN \"),\n",
+                "    re.compile(\".{23} ERROR \")\n",
+                "]"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}
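
The analyzer cell that ends each of these notebooks drives everything from the notebook's own azdata.expert.log_analyzer_rules metadata, where each rule is a 4-element list: [match_string, tsg_number, tsg_title, tsg_link]. A minimal standalone sketch of that matching step, using hypothetical rule and log-entry values for illustration:

    from IPython.display import Markdown, display

    # Hypothetical inputs; in the notebooks, 'rules' comes from the .ipynb metadata
    # and 'entries_for_analysis' from the WARN/ERROR lines collected earlier.
    rules = [
        ["Name node is in safe mode", "TSG024", "TSG024 - Namenode is in safe mode",
         "../repair/tsg024-name-node-is-in-safe-mode.ipynb"],
    ]
    entries_for_analysis = ["2020-01-01 00:00:00.0000000 | ERROR | Name node is in safe mode"]

    hints = 0
    for entry in entries_for_analysis:
        for match_string, _, title, link in rules:
            if match_string in entry:  # same substring test as entry.find(rule[0]) != -1
                print(entry)
                display(Markdown(f"HINT: Use [{title}]({link}) to resolve this issue."))
                hints += 1
    print(f"{hints} troubleshooting hint(s) made inline.")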

+ 308 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg036-get-controller-logs.ipynb

@@ -0,0 +1,308 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG036 - Controller logs\n",
+                "========================\n",
+                "\n",
+                "Get the last \u2018n\u2019 hours of controller logs.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "since_hours = 2\n",
+                "since_seconds = since_hours * 3600 # seconds in hour\n",
+                "\n",
+                "coalesce_duplicates = True"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get controller logs"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "container = \"controller\"\n",
+                "\n",
+                "pod_list = api.list_namespaced_pod(namespace, label_selector=\"app=controller\")\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for pod in pod_list.items:\n",
+                "    print (f\"Logs for controller pod: {pod.metadata.name}\")\n",
+                "    try:\n",
+                "        logs = api.read_namespaced_pod_log(pod.metadata.name, namespace, container=container, since_seconds=since_seconds)\n",
+                "    except Exception as err:\n",
+                "        print(f\"ERROR: {err}\")\n",
+                "        pass\n",
+                "    else:\n",
+                "        if coalesce_duplicates:\n",
+                "            previous_line = \"\"\n",
+                "            duplicates = 1\n",
+                "            for line in logs.split('\\n'):\n",
+                "                if line[27:] != previous_line[27:]:\n",
+                "                    if duplicates != 1:\n",
+                "                        print(f\"\\t{previous_line} (x{duplicates})\")\n",
+                "                    print(f\"\\t{line}\")\n",
+                "                    duplicates = 1\n",
+                "                else:\n",
+                "                    duplicates = duplicates + 1\n",
+                "                    continue\n",
+                "\n",
+                "                if line[25:34] == \"| ERROR |\" or line[25:33] == \"| WARN |\":\n",
+                "                    entries_for_analysis.append(line)\n",
+                "\n",
+                "                previous_line = line\n",
+                "        else:\n",
+                "            print(logs)\n",
+                "\n",
+                "print (f\"There were {len(entries_for_analysis)} warnings and errors found.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Related\n",
+                "-------\n",
+                "\n",
+                "-   [TSG027 - Observe cluster\n",
+                "    deployment](../diagnose/tsg027-observe-bdc-create.ipynb)"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false,
+            "expert": {
+                "log_analyzer_rules": [
+                    [
+                        "doc is missing key: /data",
+                        "TSG038",
+                        "TSG038 - BDC create failures due to - doc is missing key",
+                        "../repair/tsg038-doc-is-missing-key-error.ipynb"
+                    ],
+                    [
+                        "Failed when starting controller service. System.TimeoutException:\nOperation timed out after 10 minutes",
+                        "TSG057",
+                        "TSG057 - Failed when starting controller service. System.TimeoutException",
+                        "../repair/tsg057-failed-when-starting-controller.ipynb"
+                    ]
+                ]
+            }
+        }
+    }
+}
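
TSG036's log cell coalesces consecutive duplicate lines by comparing everything after the fixed-width timestamp prefix (line[27:]); note that, as written above, a duplicate run at the very end of the log never gets its "(xN)" summary printed. A standalone sketch of the same idea, assuming a hypothetical 27-character prefix and flushing the final run:

    # Hypothetical log lines: a fixed-width timestamp prefix, then the message.
    lines = [
        "2020-01-01 00:00:00.000000 | WARN | disk pressure",
        "2020-01-01 00:00:01.000000 | WARN | disk pressure",
        "2020-01-01 00:00:02.000000 | WARN | disk pressure",
        "2020-01-01 00:00:03.000000 | INFO | recovered",
    ]

    PREFIX = 27  # assumed timestamp width, mirroring the notebook's line[27:]
    previous, duplicates = "", 1
    for line in lines:
        if line[PREFIX:] != previous[PREFIX:]:
            if duplicates != 1:
                print(f"\t{previous} (x{duplicates})")
            print(f"\t{line}")
            duplicates = 1
        else:
            duplicates += 1
        previous = line
    if duplicates != 1:  # flush the final run of duplicates
        print(f"\t{previous} (x{duplicates})")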

+ 320 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg046-get-knox-logs.ipynb

@@ -0,0 +1,320 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG046 - Knox gateway logs\n",
+                "==========================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Knox gives a 500 error to the client, and removes details (the stack)\n",
+                "pointing to the cause of the underlying issue. Therefore use this TSG to\n",
+                "get the Knox logs from the cluster.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container='knox'\n",
+                "log_files = [ \"/var/log/knox/gateway.log\" ]\n",
+                "\n",
+                "expressions_to_analyze = [\n",
+                "    re.compile(\".{23} WARN \"),\n",
+                "    re.compile(\".{23} ERROR \")\n",
+                "]"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false,
+            "expert": {
+                "rules": [
+                    [
+                        "SAM008",
+                        "code",
+                        "stream",
+                        "name",
+                        "stdout",
+                        "text",
+                        ".\\*ERROR: 500"
+                    ]
+                ],
+                "log_analyzer_rules": [
+                    [
+                        "Invalid object name \u2018roles\\_permissions\u2019",
+                        "TSG039",
+                        "TSG039 - Invalid object name 'role_permissions'",
+                        "../repair/tsg039-invalid-object-name-role-permissions.ipynb"
+                    ],
+                    [
+                        "Name node is in safe mode",
+                        "TSG024",
+                        "TSG024 - Namenode is in safe mode",
+                        "../repair/tsg024-name-node-is-in-safe-mode.ipynb"
+                    ],
+                    [
+                        "Connection exception dispatching request",
+                        "TSG034",
+                        "TSG034 - Livy logs",
+                        "../log-analyzers/tsg034-get-livy-logs.ipynb"
+                    ]
+                ]
+            }
+        }
+    }
+}
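
The "Get tail for log" cells retrieve file-based logs such as /var/log/knox/gateway.log by running tail inside the container over the Kubernetes exec API, because those files never reach the pod's stdout (which is all read_namespaced_pod_log can see). A minimal sketch of that call, assuming a reachable cluster and hypothetical namespace, pod, and container names:

    from kubernetes import client, config
    from kubernetes.stream import stream

    config.load_kube_config()  # or config.load_incluster_config() when running inside a pod
    api = client.CoreV1Api()

    ns, pod_name, container_name = "mssql-cluster", "gateway-0", "knox"  # hypothetical
    output = stream(api.connect_get_namespaced_pod_exec, pod_name, ns,
                    command=["/bin/sh", "-c", "tail -n 100 /var/log/knox/gateway.log"],
                    container=container_name, stderr=True, stdout=True)
    print(output)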

+ 277 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg073-get-influxdb-logs.ipynb

@@ -0,0 +1,277 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG073 - InfluxDB logs\n",
+                "======================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container = \"influxdb\"\n",
+                "log_files = [ \"/var/log/supervisor/log/influxdb*.log\" ]\n",
+                "\n",
+                "expressions_to_analyze = []"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}
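
Each notebook's first code cell carries the "parameters" tag, the same convention papermill uses, so values like tail_lines, pod, and container can be overridden when the notebook is executed programmatically instead of edited by hand. A sketch assuming papermill is installed and the .ipynb file is in the working directory:

    import papermill as pm

    # The parameters dict is injected after the 'parameters'-tagged cell,
    # overriding its defaults for this run only.
    pm.execute_notebook(
        "tsg073-get-influxdb-logs.ipynb",
        "tsg073-get-influxdb-logs.output.ipynb",
        parameters={"tail_lines": 500, "pod": None, "container": "influxdb"},
    )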

+ 280 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg076-get-elastic-search-logs.ipynb

@@ -0,0 +1,280 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG076 - Elastic Search logs\n",
+                "============================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container = \"elasticsearch\"\n",
+                "log_files = [ \"/var/log/supervisor/log/elasticsearch*.log\" ]\n",
+                "\n",
+                "expressions_to_analyze = [\n",
+                "    re.compile(\".{26}[WARN ]\"),\n",
+                "    re.compile(\".{26}[ERROR]\")\n",
+                "]"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}
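
The Elasticsearch prefilter regexes above were corrected to escape the square brackets: unescaped, "[WARN ]" is a character class matching any single one of W, A, R, N, or space, not the literal level marker. A quick demonstration of the difference, using a stand-in 26-character prefix in place of a real timestamp:

    import re

    as_class = re.compile(".{26}[WARN ]")       # 26 chars, then ANY ONE of W/A/R/N/space
    as_literal = re.compile(r".{26}\[WARN \]")  # 26 chars, then the literal text '[WARN ]'

    prefix = "x" * 26  # stand-in for a fixed-width timestamp prefix
    print(bool(as_literal.match(prefix + "[WARN ] disk watermark exceeded")))  # True
    print(bool(as_literal.match(prefix + "Nothing to see here")))              # False
    print(bool(as_class.match(prefix + "Nothing to see here")))                # True: 'N' is in the class
    print(bool(as_class.match(prefix + "[WARN ] disk watermark exceeded")))    # False: '[' is not in the class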

+ 277 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg077-get-kibana-logs.ipynb

@@ -0,0 +1,277 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG077 - Kibana logs\n",
+                "====================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container = \"kibana\"\n",
+                "log_files = [ \"/var/log/supervisor/log/kibana*.log\" ]\n",
+                "\n",
+                "expressions_to_analyze = [ ]"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

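tsg077 ships with an empty expressions_to_analyze list, so the Kibana tail is printed but no line is queued for rule analysis. To queue WARN/ERROR lines the way the Hadoop analyzers below do, the parameters cell can be extended as in this sketch; the 23-character timestamp prefix that ".{23}" skips is an assumption about the log format:

import re

# ".{23}" skips a fixed-width timestamp such as "2019-11-01 00:00:00,000"
# before matching the level keyword.
expressions_to_analyze = [
    re.compile(".{23} WARN "),
    re.compile(".{23} ERROR "),
]

line = "2019-11-01 00:00:00,000 ERROR license information could not be obtained"  # made-up sample

# re.match anchors at the start of the line, so the prefix width matters.
for expression in expressions_to_analyze:
    if expression.match(line):
        print("queued for analysis:", line)
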
+ 282 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg088-get-datanode-logs.ipynb

@@ -0,0 +1,282 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG088 - Hadoop datanode logs\n",
+                "=============================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container = \"hadoop\"\n",
+                "log_files = [ \"/var/log/supervisor/log/datanode*.log\" ]\n",
+                "\n",
+                "expressions_to_analyze = [\n",
+                "    re.compile(\".{23} WARN \"),\n",
+                "    re.compile(\".{23} ERROR \")\n",
+                "]"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Hadoop datanode logs from the hadoop container\n",
+                "\n",
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

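The "Get tail for log" cells in all of these notebooks rely on the same mechanism: an exec of tail inside every matching container through the Kubernetes API. A stripped-down version of that call, assuming a reachable kubeconfig and using placeholder pod, container, and namespace names:

from kubernetes import client, config
from kubernetes.stream import stream

config.load_kube_config()  # or config.load_incluster_config() when run inside a pod
api = client.CoreV1Api()

# Placeholder names -- substitute values from the target big data cluster.
pod_name, container_name, namespace = "storage-0-0", "hadoop", "mssql-cluster"

# Run 'tail' inside the container and capture combined stdout/stderr as a string.
output = stream(api.connect_get_namespaced_pod_exec,
                pod_name,
                namespace,
                command=['/bin/sh', '-c', 'tail -n 2000 /var/log/supervisor/log/datanode*.log'],
                container=container_name,
                stderr=True, stdout=True)
print(output)
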
+ 280 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg090-get-nodemanager-logs.ipynb

@@ -0,0 +1,280 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG090 - Yarn nodemanager logs\n",
+                "==============================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container = \"hadoop\"\n",
+                "log_files = [ \"/var/log/supervisor/log/nodemanager*.log\" ]\n",
+                "\n",
+                "expressions_to_analyze = [\n",
+                "    re.compile(\".{23} WARN \"),\n",
+                "    re.compile(\".{23} ERROR \")\n",
+                "]"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

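Each notebook resolves the big data cluster namespace by listing the namespaces that carry the MSSQL_CLUSTER label and taking item [0], which is why the NOTE above asks for the index to be adjusted when several clusters share one Kubernetes cluster. A sketch that lists the candidates before picking one:

from kubernetes import client, config

config.load_kube_config()
api = client.CoreV1Api()

# Big data cluster namespaces carry the MSSQL_CLUSTER label.
items = api.list_namespace(label_selector='MSSQL_CLUSTER').items

if not items:
    raise SystemExit('No big data cluster namespace found in the current context.')

for i, ns in enumerate(items):
    print(f'[{i}] {ns.metadata.name}')

namespace = items[0].metadata.name  # change the index if more than one BDC exists
print('Using namespace: ' + namespace)
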
+ 280 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg092-get-all-supervisord-log-tails.ipynb

@@ -0,0 +1,280 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG092 - Supervisord log tail for all containers in BDC\n",
+                "=======================================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container = None # All containers\n",
+                "log_files = [ \"/var/log/supervisor/supervisord.log\" ]\n",
+                "\n",
+                "expressions_to_analyze = [\n",
+                "    re.compile(\".{23} WARN \"),\n",
+                "    re.compile(\".{23} ERROR \")\n",
+                "]"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

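In every parameters cell above, pod and container act as optional filters where None means "all"; tsg092 sets both to None to reach each container in the cluster. The selection loop reduces to a single predicate, sketched here over a made-up pod inventory:

def selected(name, wanted):
    # None acts as a wildcard, mirroring the notebooks' pod/container checks.
    return wanted is None or name == wanted

pod = None            # all pods
container = 'kibana'  # only this container

inventory = {'master-0': ['hadoop', 'mssql-server'], 'logsui-0': ['kibana']}  # made up

for pod_name, containers in inventory.items():
    for container_name in containers:
        if selected(pod_name, pod) and selected(container_name, container):
            print(f"would tail logs in '{pod_name}/{container_name}'")
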
+ 254 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg093-get-all-agent-log-tails.ipynb

@@ -0,0 +1,254 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG093 - Agent log tail for all containers in BDC\n",
+                "=================================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "tail_lines = 100\n",
+                "line_offset = 27 # Skip the date/time at start of line\n",
+                "\n",
+                "cmd = f'tail -n {tail_lines} /var/log/agent/agent.log'\n",
+                "\n",
+                "coalesce_duplicates = True"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log in all pod containers\n",
+                "\n",
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "pod_list = api.list_namespaced_pod(namespace)\n",
+                "pod_names = [pod.metadata.name for pod in pod_list.items]\n",
+                "\n",
+                "for pod in pod_list.items:\n",
+                "    container_names = [container.name for container in pod.spec.containers]\n",
+                "    for container in container_names:\n",
+                "            print (f\"*** LOGS for CONTAINER: {container} in POD: {pod.metadata.name}\")\n",
+                "            try:\n",
+                "                logs=stream(api.connect_get_namespaced_pod_exec, pod.metadata.name, namespace, command=['/bin/sh', '-c', cmd], container=container, stderr=True, stdout=True)\n",
+                "\n",
+                "                if coalesce_duplicates:\n",
+                "                    previous_line = \"\"\n",
+                "                    duplicates = 1\n",
+                "                    for line in logs.split('\\n'):\n",
+                "                        if line[line_offset:] != previous_line[line_offset:]:\n",
+                "                            if duplicates != 1:\n",
+                "                                print(f\"\\t{previous_line} (x{duplicates})\")\n",
+                "                            print(f\"\\t{line}\")\n",
+                "\n",
+                "                            for rule in rules:\n",
+                "                                if line[line_offset:].find(rule[0]) != -1:\n",
+                "                                    display(Markdown(f'HINT: Use [{rule[2]}](rule[3]) to resolve this issue.'))\n",
+                "\n",
+                "                            duplicates = 1\n",
+                "                        else:\n",
+                "                            duplicates = duplicates + 1\n",
+                "                            continue\n",
+                "\n",
+                "                        previous_line = line\n",
+                "                else:\n",
+                "                    print(logs)\n",
+                "\n",
+                "            except Exception:\n",
+                "                print (f\"Failed to get LOGS for CONTAINER: {container} in POD: {pod.metadata.name}\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false,
+            "expert": {
+                "log_analyzer_rules": [
+                    [
+                        "Failed to get file names from controller with Error",
+                        "TSG040",
+                        "TSG040 - Failed to get file names from controller with Error",
+                        "../repair/tsg040-failed-get-file-names-controller.ipynb"
+                    ],
+                    [
+                        "Please increase sysctl fs.aio-max-nr",
+                        "TSG041",
+                        "TSG041 - Unable to create a new asynchronous I/O context (increase sysctl fs.aio-max-nr)",
+                        "../repair/tsg041-increase-fs-aio-max-nr.ipynb"
+                    ]
+                ]
+            }
+        }
+    }
+}
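
The hint mechanism in the notebook above reduces to a substring match: the first element of each `log_analyzer_rules` entry is searched for in every distinct log line, and a hit emits a Markdown link built from the rule's title and relative path. A minimal standalone sketch of that matching, using one of the rules from the metadata above and a hypothetical sample log line:

```python
# Minimal sketch of the log_analyzer_rules matching (standalone, outside a notebook).
# Rule shape mirrors the notebook metadata: [match_text, id, title, relative_link].
rules = [
    ["Please increase sysctl fs.aio-max-nr",
     "TSG041",
     "TSG041 - Unable to create a new asynchronous I/O context (increase sysctl fs.aio-max-nr)",
     "../repair/tsg041-increase-fs-aio-max-nr.ipynb"],
]

# Hypothetical sample log line, for illustration only.
log_line = "2020-01-01 00:00:00.00 spid9s  Please increase sysctl fs.aio-max-nr"

for rule in rules:
    if log_line.find(rule[0]) != -1:  # same substring test used in the notebook
        print(f"HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.")
```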

+ 277 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg094-get-grafana-logs.ipynb

@@ -0,0 +1,277 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG094 - Grafana logs\n",
+                "=====================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container = \"grafana\"\n",
+                "log_files = [ \"/var/log/supervisor/log/grafana*.log\" ]\n",
+                "\n",
+                "expressions_to_analyze = []"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}
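
The `items[0]` in the namespace cell deserves a closer look: `list_namespace(label_selector='MSSQL_CLUSTER')` returns one namespace per big data cluster, and the notebooks simply take the first. A short sketch of inspecting the candidates before choosing, assuming a reachable cluster and the `kubernetes` module installed:

```python
# Sketch: enumerate BDC namespaces before relying on items[0].
from kubernetes import client, config

config.load_kube_config()  # or load_incluster_config() when running in-cluster
api = client.CoreV1Api()

# One entry per big data cluster in this Kubernetes cluster.
candidates = [ns.metadata.name for ns in
              api.list_namespace(label_selector='MSSQL_CLUSTER').items]
print(candidates)

# With more than one BDC, select by name rather than position.
namespace = candidates[0]
```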

+ 280 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg095-get-namenode-logs.ipynb

@@ -0,0 +1,280 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG095 - Hadoop namenode logs\n",
+                "=============================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container = \"hadoop\"\n",
+                "log_files = [ \"/var/log/supervisor/log/namenode*.log\" ]\n",
+                "\n",
+                "expressions_to_analyze = [\n",
+                "    re.compile(\".{23} WARN \"),\n",
+                "    re.compile(\".{23} ERROR \")\n",
+                "]"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}
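
The `expressions_to_analyze` patterns above rely on `re.match`, which anchors at the start of the line: `.{23}` consumes a fixed-width 23-character timestamp (log4j style, e.g. `2020-01-01 00:00:00,000`) and the literal ` WARN ` or ` ERROR ` must follow immediately. A small sketch with a hypothetical namenode log line:

```python
# Sketch of the timestamp-anchored matching used by the log analyzers.
import re

expressions_to_analyze = [
    re.compile(".{23} WARN "),
    re.compile(".{23} ERROR ")
]

# Hypothetical log4j-style line: a 23-character timestamp, then the level.
sample = "2020-01-01 00:00:00,000 WARN namenode.NameNode: example message"

print(any(e.match(sample) for e in expressions_to_analyze))  # True
```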

+ 280 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-analyzers/tsg096-get-zookeeper-logs.ipynb

@@ -0,0 +1,280 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG096 - Zookeeper logs\n",
+                "=======================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "import re\n",
+                "\n",
+                "tail_lines = 2000\n",
+                "\n",
+                "pod = None # All\n",
+                "container = \"hadoop\"\n",
+                "log_files = [ \"/var/log/supervisor/log/zkfc*.log\" ]\n",
+                "\n",
+                "expressions_to_analyze = [\n",
+                "    re.compile(\".{23} WARN \"),\n",
+                "    re.compile(\".{23} ERROR \")\n",
+                "]"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get tail for log"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Display the last 'tail_lines' of files in 'log_files' list\n",
+                "\n",
+                "pods = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "entries_for_analysis = []\n",
+                "\n",
+                "for p in pods.items:\n",
+                "    if pod is None or p.metadata.name == pod:\n",
+                "        for c in p.spec.containers:\n",
+                "            if container is None or c.name == container:\n",
+                "                for log_file in log_files:\n",
+                "                    print (f\"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'\")\n",
+                "                    try:\n",
+                "                        output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)\n",
+                "                    except Exception:\n",
+                "                        print (f\"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}\")\n",
+                "                    else:\n",
+                "                        for line in output.split('\\n'):\n",
+                "                            for expression in expressions_to_analyze:\n",
+                "                                if expression.match(line):\n",
+                "                                    entries_for_analysis.append(line)\n",
+                "                            print(line)\n",
+                "print(\"\")\n",
+                "print(f\"{len(entries_for_analysis)} log entries found for further analysis.\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Analyze log entries and suggest relevant Troubleshooting Guides"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Analyze log entries and suggest further relevant troubleshooting guides\n",
+                "\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "import os\n",
+                "import json\n",
+                "import requests\n",
+                "import ipykernel\n",
+                "import datetime\n",
+                "\n",
+                "from urllib.parse import urljoin\n",
+                "from notebook import notebookapp\n",
+                "\n",
+                "def get_notebook_name():\n",
+                "    \"\"\"\n",
+                "    Return the full path of the jupyter notebook.   Some runtimes (e.g. ADS) \n",
+                "    have the kernel_id in the filename of the connection file.  If so, the \n",
+                "    notebook name at runtime can be determined using `list_running_servers`.\n",
+                "    Other runtimes (e.g. azdata) do not have the kernel_id in the filename of\n",
+                "    the connection file, therefore we are unable to establish the filename\n",
+                "    \"\"\"\n",
+                "    connection_file = os.path.basename(ipykernel.get_connection_file())\n",
+                "    \n",
+                "    # If the runtime has the kernel_id in the connection filename, use it to\n",
+                "    # get the real notebook name at runtime, otherwise, use the notebook \n",
+                "    # filename from build time.\n",
+                "    try: \n",
+                "        kernel_id = connection_file.split('-', 1)[1].split('.')[0]\n",
+                "    except:\n",
+                "        pass\n",
+                "    else:\n",
+                "        for servers in list(notebookapp.list_running_servers()):\n",
+                "            try:\n",
+                "                response = requests.get(urljoin(servers['url'], 'api/sessions'), params={'token': servers.get('token', '')}, timeout=.01)\n",
+                "            except:\n",
+                "                pass\n",
+                "            else:\n",
+                "                for nn in json.loads(response.text):\n",
+                "                    if nn['kernel']['id'] == kernel_id:\n",
+                "                        return nn['path']\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def get_notebook_rules():\n",
+                "    \"\"\"\n",
+                "    Load the notebook rules from the metadata of this notebook (in the .ipynb file)\n",
+                "    \"\"\"\n",
+                "    file_name = get_notebook_name()\n",
+                "\n",
+                "    if file_name == None:\n",
+                "        return None\n",
+                "    else:\n",
+                "        j = load_json(file_name)\n",
+                "\n",
+                "        if \"azdata\" not in j[\"metadata\"] or \\\n",
+                "            \"expert\" not in j[\"metadata\"][\"azdata\"] or \\\n",
+                "            \"log_analyzer_rules\" not in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "            return []\n",
+                "        else:\n",
+                "            return j[\"metadata\"][\"azdata\"][\"expert\"][\"log_analyzer_rules\"]\n",
+                "\n",
+                "rules = get_notebook_rules()\n",
+                "\n",
+                "if rules == None:\n",
+                "    print(\"\")\n",
+                "    print(f\"Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.\")\n",
+                "else:\n",
+                "    hints = 0\n",
+                "    if len(rules) > 0:\n",
+                "        for entry in entries_for_analysis:\n",
+                "            for rule in rules:\n",
+                "                if entry.find(rule[0]) != -1:\n",
+                "                    print (entry)\n",
+                "\n",
+                "                    display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))\n",
+                "                    hints = hints + 1\n",
+                "\n",
+                "    print(\"\")\n",
+                "    print(f\"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}
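
These analyzers fetch logs by executing `tail` inside the container (via `kubernetes.stream`) rather than through the pod-log API, since the supervisor-managed files under `/var/log/supervisor/log` are separate from the container's stdout. A minimal standalone form of that call; the pod and namespace names here are hypothetical:

```python
# Sketch: tail a file inside a container, equivalent to
# `kubectl exec <pod> -c <container> -- /bin/sh -c 'tail ...'`.
from kubernetes import client, config
from kubernetes.stream import stream

config.load_kube_config()
api = client.CoreV1Api()

output = stream(api.connect_get_namespaced_pod_exec,
                "zookeeper-0",     # hypothetical pod name
                "mssql-cluster",   # hypothetical BDC namespace
                command=['/bin/sh', '-c',
                         'tail -n 100 /var/log/supervisor/log/zkfc*.log'],
                container="hadoop",
                stderr=True, stdout=True)
print(output)
```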

+ 19 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/readme.md

@@ -0,0 +1,19 @@
+# Logs notebooks
+
+- A set of notebooks to gather logs from a SQL Server Big Data Cluster.
+
+[Home](../readme.md)
+
+## Notebooks in this Chapter
+- [TSG001 - Run azdata copy-logs](tsg001-copy-logs.ipynb)
+
+- [TSG091 - Get the azdata CLI logs](tsg091-get-azdata-logs.ipynb)
+
+- [TSG083 - Run kubectl cluster-info dump](tsg083-run-kubectl-cluster-info-dump.ipynb)
+
+- [TSG061 - Get tail of all container logs for pods in BDC namespace](tsg061-tail-bdc-container-logs.ipynb)
+
+- [TSG062 - Get tail of all previous container logs for pods in BDC namespace](tsg062-tail-bdc-previous-container-logs.ipynb)
+
+- [TSG084 - Internal Query Processor Error](tsg084-internal-query-process-error.ipynb)
+

File diff suppressed because it is too large
+ 291 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/tsg001-copy-logs.ipynb


+ 193 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/tsg061-tail-bdc-container-logs.ipynb

@@ -0,0 +1,193 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG061 - Get tail of all container logs for pods in BDC namespace\n",
+                "=================================================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "since_seconds = 60 * 60 * 1 # the last hour\n",
+                "coalesce_duplicates = True"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get logs for all containers in Big Data Cluster namespace"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "pod_list = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "pod_names = [pod.metadata.name for pod in pod_list.items]\n",
+                "\n",
+                "print('Scanning pods: ' + ', '.join(pod_names))\n",
+                "\n",
+                "for pod in pod_list.items:\n",
+                "    print(\"*** %s\\t%s\\t%s\" % (pod.metadata.name,\n",
+                "                        pod.status.phase,\n",
+                "                        pod.status.pod_ip))\n",
+                "\n",
+                "    container_names = [container.name for container in pod.spec.containers]\n",
+                "\n",
+                "    for container in container_names:\n",
+                "        print (f\"POD: {pod.metadata.name} / CONTAINER: {container}\")\n",
+                "        try:\n",
+                "            logs = api.read_namespaced_pod_log(pod.metadata.name, namespace, container=container, since_seconds=since_seconds)\n",
+                "\n",
+                "            if coalesce_duplicates:\n",
+                "                previous_line = \"\"\n",
+                "                duplicates = 1\n",
+                "                for line in logs.split('\\n'):\n",
+                "                    if line[27:] != previous_line[27:]:\n",
+                "                        if duplicates != 1:\n",
+                "                            print(f\"\\t{previous_line} (x{duplicates})\")\n",
+                "                        print(f\"\\t{line}\")\n",
+                "                        duplicates = 1\n",
+                "                    else:\n",
+                "                        duplicates = duplicates + 1\n",
+                "\n",
+                "                    previous_line = line\n",
+                "            else:\n",
+                "                print(logs)\n",
+                "        except Exception:\n",
+                "            print (f\"Failed to get LOGS for CONTAINER: {container} in POD: {pod.metadata.name}\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Related\n",
+                "-------\n",
+                "\n",
+                "-   [TSG062 - Get tail of all previous container logs for pods in BDC\n",
+                "    namespace](../log-files/tsg062-tail-bdc-previous-container-logs.ipynb)"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "side_effects": true,
+        "azdata": {
+            "diagnostic": {
+                "categories": [
+                    "kubernetes"
+                ]
+            }
+        }
+    }
+}
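
To see what the duplicate-coalescing loop above produces, here is a self-contained sketch on hypothetical log lines; the prefix is 27 characters wide, matching the slice used in the notebook, and the final `if` flushes a run of duplicates that ends the log:

```python
# Worked sketch of the "(xN)" duplicate coalescing (hypothetical input lines).
lines = [
    "2020-01-01T00:00:00.000000Z connection reset",
    "2020-01-01T00:00:01.000000Z connection reset",
    "2020-01-01T00:00:02.000000Z connection reset",
    "2020-01-01T00:00:03.000000Z listener started",
]

previous_line = ""
duplicates = 1
for line in lines:
    if line[27:] != previous_line[27:]:   # compare with timestamp stripped
        if duplicates != 1:
            print(f"\t{previous_line} (x{duplicates})")
        print(f"\t{line}")
        duplicates = 1
    else:
        duplicates = duplicates + 1
    previous_line = line
if duplicates != 1:
    print(f"\t{previous_line} (x{duplicates})")  # flush a trailing run
```

The three identical messages print once, followed by an `(x3)` summary carrying the timestamp of the last repeat.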

+ 188 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/tsg062-tail-bdc-previous-container-logs.ipynb

@@ -0,0 +1,188 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG062 - Get tail of all previous container logs for pods in BDC namespace\n",
+                "==========================================================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "tail_lines = 10000\n",
+                "coalesce_duplicates = True"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get logs for previous instance of all containers in Big Data Cluster namespace"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "pod_list = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "pod_names = [pod.metadata.name for pod in pod_list.items]\n",
+                "\n",
+                "print('Scanning pods: ' + ', '.join(pod_names))\n",
+                "\n",
+                "for pod in pod_list.items:\n",
+                "    print(\"*** %s\\t%s\\t%s\" % (pod.metadata.name,\n",
+                "                        pod.status.phase,\n",
+                "                        pod.status.pod_ip))\n",
+                "\n",
+                "    container_names = [container.name for container in pod.spec.containers]\n",
+                "\n",
+                "    for container in container_names:\n",
+                "        print (f\"POD: {pod.metadata.name} / CONTAINER: {container}\")\n",
+                "        try:\n",
+                "            logs = api.read_namespaced_pod_log(pod.metadata.name, namespace, container=container, tail_lines=tail_lines, previous=True)\n",
+                "\n",
+                "            if coalesce_duplicates:\n",
+                "                previous_line = \"\"\n",
+                "                duplicates = 1\n",
+                "                for line in logs.split('\\n'):\n",
+                "                    if line[27:] != previous_line[27:]:\n",
+                "                        if duplicates != 1:\n",
+                "                            print(f\"\\t{previous_line} (x{duplicates})\")\n",
+                "                        print(f\"\\t{line}\")\n",
+                "                        duplicates = 1\n",
+                "                    else:\n",
+                "                        duplicates = duplicates + 1\n",
+                "\n",
+                "                    previous_line = line\n",
+                "            else:\n",
+                "                print(logs)\n",
+                "        except Exception:\n",
+                "            print (f\"Failed to get LOGS for CONTAINER: {container} in POD: {pod.metadata.name}\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Related\n",
+                "-------\n",
+                "\n",
+                "-   [TSG061 - Get tail of all container logs for pods in BDC\n",
+                "    namespace](../log-files/tsg061-tail-bdc-container-logs.ipynb)"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 337 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/tsg083-run-kubectl-cluster-info-dump.ipynb

@@ -0,0 +1,337 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG083 - Run kubectl cluster-info dump\n",
+                "======================================\n",
+                "\n",
+                "NOTE: This kubectl command can produce a lot of output, and may take\n",
+                "some time (and produce a large notebook!). For Kubernetes clusters that\n",
+                "have been up for a long time, consider running this command outside of a\n",
+                "notebook.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg083-run-kubectl-cluster-info-dump.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Run cluster-info dump"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl cluster-info dump')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 66 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/tsg084-internal-query-process-error.ipynb

@@ -0,0 +1,66 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG084 - Internal Query Processor Error\n",
+                "=======================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "When running a Polybase query, the following error maybe returned:\n",
+                "\n",
+                "> Msg 8680, Level 16, State 30, Line 1 Internal Query Processor Error:\n",
+                "> The query processor encountered an unexpected error during the\n",
+                "> processing of a remote query phase. OLE DB provider \u201cMSOLEDBSQL\u201d for\n",
+                "> linked server \u201c(null)\u201d returned message \u201cUnspecified error\u201d. Msg 7421,\n",
+                "> Level 16, State 2, Line 1 Cannot fetch the rowset from OLE DB provider\n",
+                "> \u201cMSOLEDBSQL\u201d for linked server \u201c(null)\u201d. .\n",
+                "\n",
+                "To get more information, use the following DMV query. The `details`\n",
+                "column will contain more information. All the rows for a single\n",
+                "\u2018execution\\_id\u2019 (QIDnnnnnn) are related to a single query execution.\n",
+                "\n",
+                "### Steps"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "DECLARE @query NVARCHAR(max) = '<enter query here>'\n",
+                "\n",
+                "SELECT e.*\n",
+                "FROM sys.dm_exec_distributed_requests dr\n",
+                "CROSS APPLY sys.dm_exec_sql_text(sql_handle) st\n",
+                "JOIN sys.dm_exec_compute_node_errors e ON dr.execution_id = e.execution_id\n",
+                "WHERE st.text = @query\n",
+                "ORDER BY create_time DESC"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "sql",
+            "display_name": "SQL"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}

+ 58 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/log-files/tsg091-get-azdata-logs.ipynb

@@ -0,0 +1,58 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG091 - Get the azdata CLI logs\n",
+                "================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Get the azdata logs from the local machine\n",
+                "\n",
+                "Gets the contents of the most recent log. There may be old logs in\n",
+                "azdata.log.1, azdata.log.2 etc."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import os\n",
+                "from pathlib import Path\n",
+                "\n",
+                "home = str(Path.home())\n",
+                "\n",
+                "with open(os.path.join(home, \".azdata\", \"logs\", \"azdata.log\"), \"r\") as file:\n",
+                "    line = file.readline()\n",
+                "    while line:\n",
+                "        print(line.replace(\"\\n\", \"\"))\n",
+                "        line = file.readline()"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 29 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/readme.md

@@ -0,0 +1,29 @@
+# SQL Server Big Data Cluster monitoring notebooks
+
+- This chapter contains a set of notebooks useful for getting information and status about a SQL Server big data cluster using the `azdata` command line interface (CLI).
+
+[Home](../readme.md)
+
+## Notebooks in this Chapter
+- [TSG014 - Show BDC endpoints](tsg014-azdata-bdc-endpoint-list.ipynb)
+
+- [TSG012 - Show BDC Status](tsg012-azdata-bdc-status.ipynb)
+
+- [TSG069 - Show Big Data Cluster Gateway status](tsg069-azdata-bdc-gateway-status.ipynb)
+
+- [TSG049 - Show BDC Controller status](tsg049-azdata-bdc-control-status.ipynb)
+
+- [TSG033 - Show BDC SQL status](tsg033-azdata-bdc-sql-status.ipynb)
+
+- [TSG068 - Show BDC HDFS status](tsg068-azdata-bdc-hdfs-status.ipynb)
+
+- [TSG017 - Show BDC Configuration](tsg017-azdata-bdc-config-show.ipynb)
+
+- [TSG004 - Show BDC Apps](tsg004-show-app-list.ipynb)
+
+- [TSG003 - Show BDC Spark sessions](tsg003-show-spark-sessions.ipynb)
+
+- [TSG013 - Show file list in Storage Pool (HDFS)](tsg013-azdata-bdc-hdfs-ls.ipynb)
+
+- [TSG070 - Query SQL master pool](tsg070-use-azdata-sql-query.ipynb)
+

File diff suppressed because it is too large
+ 291 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg003-show-spark-sessions.ipynb


+ 332 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg004-show-app-list.ipynb

@@ -0,0 +1,332 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG004 - Show BDC Apps\n",
+                "======================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg004-show-app-list.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'azdata': ['Endpoint sql-server-master does not exist', 'Endpoint livy does not exist', 'Failed to get state for cluster', 'Endpoint webhdfs does not exist', 'Adaptive Server is unavailable or does not exist', 'Error: Address already in use']}\n",
+                "error_hints = {'azdata': [['azdata login', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['The token is expired', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Reason: Unauthorized', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Max retries exceeded with url: /api/v1/bdc/endpoints', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Look at the controller logs for more details', 'TSG027 - Observe cluster deployment', '../diagnose/tsg027-observe-bdc-create.ipynb'], ['provided port is already allocated', 'TSG062 - Get tail of all previous container logs for pods in BDC namespace', '../log-files/tsg062-tail-bdc-previous-container-logs.ipynb'], ['Create cluster failed since the existing namespace', 'SOP061 - Delete a big data cluster', '../install/sop061-delete-bdc.ipynb'], ['Failed to complete kube config setup', 'TSG067 - Failed to complete kube config setup', '../repair/tsg067-failed-to-complete-kube-config-setup.ipynb'], ['Error processing command: \"ApiError', 'TSG110 - Azdata returns ApiError', '../repair/tsg110-azdata-returns-apierror.ipynb'], ['Error processing command: \"ControllerError', 'TSG036 - Controller logs', '../log-analyzers/tsg036-get-controller-logs.ipynb'], ['ERROR: 500', 'TSG046 - Knox gateway logs', '../log-analyzers/tsg046-get-knox-logs.ipynb'], ['Data source name not found and no default driver specified', 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], [\"Can't open lib 'ODBC Driver 17 for SQL Server\", 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], ['Control plane upgrade failed. Failed to upgrade controller.', 'TSG108 - View the controller upgrade config map', '../diagnose/tsg108-controller-failed-to-upgrade.ipynb']]}\n",
+                "install_hint = {'azdata': ['SOP055 - Install azdata command line interface', '../install/sop055-install-azdata.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Run azdata to retrieve list of applications"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('azdata app list')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 332 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg012-azdata-bdc-status.ipynb

@@ -0,0 +1,332 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG012 - Show BDC Status\n",
+                "========================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg012-azdata-bdc-status.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'azdata': ['Endpoint sql-server-master does not exist', 'Endpoint livy does not exist', 'Failed to get state for cluster', 'Endpoint webhdfs does not exist', 'Adaptive Server is unavailable or does not exist', 'Error: Address already in use']}\n",
+                "error_hints = {'azdata': [['azdata login', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['The token is expired', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Reason: Unauthorized', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Max retries exceeded with url: /api/v1/bdc/endpoints', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Look at the controller logs for more details', 'TSG027 - Observe cluster deployment', '../diagnose/tsg027-observe-bdc-create.ipynb'], ['provided port is already allocated', 'TSG062 - Get tail of all previous container logs for pods in BDC namespace', '../log-files/tsg062-tail-bdc-previous-container-logs.ipynb'], ['Create cluster failed since the existing namespace', 'SOP061 - Delete a big data cluster', '../install/sop061-delete-bdc.ipynb'], ['Failed to complete kube config setup', 'TSG067 - Failed to complete kube config setup', '../repair/tsg067-failed-to-complete-kube-config-setup.ipynb'], ['Error processing command: \"ApiError', 'TSG110 - Azdata returns ApiError', '../repair/tsg110-azdata-returns-apierror.ipynb'], ['Error processing command: \"ControllerError', 'TSG036 - Controller logs', '../log-analyzers/tsg036-get-controller-logs.ipynb'], ['ERROR: 500', 'TSG046 - Knox gateway logs', '../log-analyzers/tsg046-get-knox-logs.ipynb'], ['Data source name not found and no default driver specified', 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], [\"Can't open lib 'ODBC Driver 17 for SQL Server\", 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], ['Control plane upgrade failed. Failed to upgrade controller.', 'TSG108 - View the controller upgrade config map', '../diagnose/tsg108-controller-failed-to-upgrade.ipynb']]}\n",
+                "install_hint = {'azdata': ['SOP055 - Install azdata command line interface', '../install/sop055-install-azdata.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Use azdata to show big data cluster status"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('azdata bdc status show --all')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

File diff suppressed because it is too large
+ 311 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg013-azdata-bdc-hdfs-ls.ipynb


+ 332 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg014-azdata-bdc-endpoint-list.ipynb

@@ -0,0 +1,332 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG014 - Show BDC endpoints\n",
+                "===========================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg014-azdata-bdc-endpoint-list.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'azdata': ['Endpoint sql-server-master does not exist', 'Endpoint livy does not exist', 'Failed to get state for cluster', 'Endpoint webhdfs does not exist', 'Adaptive Server is unavailable or does not exist', 'Error: Address already in use']}\n",
+                "error_hints = {'azdata': [['azdata login', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['The token is expired', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Reason: Unauthorized', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Max retries exceeded with url: /api/v1/bdc/endpoints', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Look at the controller logs for more details', 'TSG027 - Observe cluster deployment', '../diagnose/tsg027-observe-bdc-create.ipynb'], ['provided port is already allocated', 'TSG062 - Get tail of all previous container logs for pods in BDC namespace', '../log-files/tsg062-tail-bdc-previous-container-logs.ipynb'], ['Create cluster failed since the existing namespace', 'SOP061 - Delete a big data cluster', '../install/sop061-delete-bdc.ipynb'], ['Failed to complete kube config setup', 'TSG067 - Failed to complete kube config setup', '../repair/tsg067-failed-to-complete-kube-config-setup.ipynb'], ['Error processing command: \"ApiError', 'TSG110 - Azdata returns ApiError', '../repair/tsg110-azdata-returns-apierror.ipynb'], ['Error processing command: \"ControllerError', 'TSG036 - Controller logs', '../log-analyzers/tsg036-get-controller-logs.ipynb'], ['ERROR: 500', 'TSG046 - Knox gateway logs', '../log-analyzers/tsg046-get-knox-logs.ipynb'], ['Data source name not found and no default driver specified', 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], [\"Can't open lib 'ODBC Driver 17 for SQL Server\", 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], ['Control plane upgrade failed. Failed to upgrade controller.', 'TSG108 - View the controller upgrade config map', '../diagnose/tsg108-controller-failed-to-upgrade.ipynb']]}\n",
+                "install_hint = {'azdata': ['SOP055 - Install azdata command line interface', '../install/sop055-install-azdata.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Use azdata to list files"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('azdata bdc endpoint list')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 332 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg017-azdata-bdc-config-show.ipynb

@@ -0,0 +1,332 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG017 - Show BDC Configuration\n",
+                "===============================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg017-azdata-bdc-config-show.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'azdata': ['Endpoint sql-server-master does not exist', 'Endpoint livy does not exist', 'Failed to get state for cluster', 'Endpoint webhdfs does not exist', 'Adaptive Server is unavailable or does not exist', 'Error: Address already in use']}\n",
+                "error_hints = {'azdata': [['azdata login', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['The token is expired', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Reason: Unauthorized', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Max retries exceeded with url: /api/v1/bdc/endpoints', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Look at the controller logs for more details', 'TSG027 - Observe cluster deployment', '../diagnose/tsg027-observe-bdc-create.ipynb'], ['provided port is already allocated', 'TSG062 - Get tail of all previous container logs for pods in BDC namespace', '../log-files/tsg062-tail-bdc-previous-container-logs.ipynb'], ['Create cluster failed since the existing namespace', 'SOP061 - Delete a big data cluster', '../install/sop061-delete-bdc.ipynb'], ['Failed to complete kube config setup', 'TSG067 - Failed to complete kube config setup', '../repair/tsg067-failed-to-complete-kube-config-setup.ipynb'], ['Error processing command: \"ApiError', 'TSG110 - Azdata returns ApiError', '../repair/tsg110-azdata-returns-apierror.ipynb'], ['Error processing command: \"ControllerError', 'TSG036 - Controller logs', '../log-analyzers/tsg036-get-controller-logs.ipynb'], ['ERROR: 500', 'TSG046 - Knox gateway logs', '../log-analyzers/tsg046-get-knox-logs.ipynb'], ['Data source name not found and no default driver specified', 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], [\"Can't open lib 'ODBC Driver 17 for SQL Server\", 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], ['Control plane upgrade failed. Failed to upgrade controller.', 'TSG108 - View the controller upgrade config map', '../diagnose/tsg108-controller-failed-to-upgrade.ipynb']]}\n",
+                "install_hint = {'azdata': ['SOP055 - Install azdata command line interface', '../install/sop055-install-azdata.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Use azdata to show the big data cluster configuration"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('azdata bdc config show')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 332 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg033-azdata-bdc-sql-status.ipynb

@@ -0,0 +1,332 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG033 - Show BDC SQL status\n",
+                "============================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path-based location (shutil.which) of the executable that will be run (and display it to aid supportability); this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd (otherwise Popen raises FileNotFoundError).\n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary is None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary is None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which can cause Jupyter to hang forever; to\n",
+                "    # work around this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around an infinite hang when a notebook generates a non-zero return code: break out and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile(r'  \"(.*)\": \"(.*)\"')\n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp; don't\n",
+                "            # print an empty \"STDERR:\" line, as it is confusing.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid the infinite hang above in the `azdata notebook run` failure case by inferring success (from stdout output), so\n",
+                "    # don't wait here if success is already known\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg033-azdata-bdc-sql-status.ipynb\")\n",
+                "\n",
+                "    except Exception:\n",
+                "        pass # If the user has renamed the notebook, we can't load ourselves.  NOTE: Is there a way in Jupyter to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # Rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (e.g. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something upstream escaped the *, putting a \\ in front of it; undo that here\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
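+                "# retry_hints: binary name -> stderr substrings that indicate a transient\n",
+                "# fault worth retrying (run() retries up to MAX_RETRIES times)\n",
+                "# error_hints: binary name -> [substring, notebook title, notebook path]\n",
+                "# entries; a matching stderr line displays a hyperlinked HINT\n",
+                "# install_hint: binary name -> [notebook title, notebook path], shown when\n",
+                "# the executable cannot be found on the PATH\n",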
+                "retry_hints = {'azdata': ['Endpoint sql-server-master does not exist', 'Endpoint livy does not exist', 'Failed to get state for cluster', 'Endpoint webhdfs does not exist', 'Adaptive Server is unavailable or does not exist', 'Error: Address already in use']}\n",
+                "error_hints = {'azdata': [['azdata login', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['The token is expired', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Reason: Unauthorized', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Max retries exceeded with url: /api/v1/bdc/endpoints', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Look at the controller logs for more details', 'TSG027 - Observe cluster deployment', '../diagnose/tsg027-observe-bdc-create.ipynb'], ['provided port is already allocated', 'TSG062 - Get tail of all previous container logs for pods in BDC namespace', '../log-files/tsg062-tail-bdc-previous-container-logs.ipynb'], ['Create cluster failed since the existing namespace', 'SOP061 - Delete a big data cluster', '../install/sop061-delete-bdc.ipynb'], ['Failed to complete kube config setup', 'TSG067 - Failed to complete kube config setup', '../repair/tsg067-failed-to-complete-kube-config-setup.ipynb'], ['Error processing command: \"ApiError', 'TSG110 - Azdata returns ApiError', '../repair/tsg110-azdata-returns-apierror.ipynb'], ['Error processing command: \"ControllerError', 'TSG036 - Controller logs', '../log-analyzers/tsg036-get-controller-logs.ipynb'], ['ERROR: 500', 'TSG046 - Knox gateway logs', '../log-analyzers/tsg046-get-knox-logs.ipynb'], ['Data source name not found and no default driver specified', 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], [\"Can't open lib 'ODBC Driver 17 for SQL Server\", 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], ['Control plane upgrade failed. Failed to upgrade controller.', 'TSG108 - View the controller upgrade config map', '../diagnose/tsg108-controller-failed-to-upgrade.ipynb']]}\n",
+                "install_hint = {'azdata': ['SOP055 - Install azdata command line interface', '../install/sop055-install-azdata.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Use azdata to show the Big Data Cluster SQL status"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
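+                "# Show the health state of the SQL Server resources in the Big Data Cluster.\n",
+                "# If this fails with 'Reason: Unauthorized' or 'The token is expired', the\n",
+                "# hints above link to SOP028 - azdata login (../common/sop028-azdata-login.ipynb)\n",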
+                "run('azdata bdc sql status show')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 332 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg049-azdata-bdc-control-status.ipynb

@@ -0,0 +1,332 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG049 - Show BDC Controller status\n",
+                "===================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
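+                "    # NOTE: on a transient fault (see retry_hints below), run() re-invokes\n",
+                "    # itself with an incremented retry_count, up to MAX_RETRIES attempts\n",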
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (e.g. kubectl, python) to support binary-context-aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportability, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens; it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path-based location (shutil.which) of the executable that will be run (and display it to aid supportability); this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd (otherwise Popen raises FileNotFoundError).\n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary is None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary is None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which can cause Jupyter to hang forever; to\n",
+                "    # work around this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around an infinite hang when a notebook generates a non-zero return code: break out and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile(r'  \"(.*)\": \"(.*)\"')\n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp; don't\n",
+                "            # print an empty \"STDERR:\" line, as it is confusing.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid the infinite hang above in the `azdata notebook run` failure case by inferring success (from stdout output), so\n",
+                "    # don't wait here if success is already known\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg049-azdata-bdc-control-status.ipynb\")\n",
+                "\n",
+                "    except Exception:\n",
+                "        pass # If the user has renamed the notebook, we can't load ourselves.  NOTE: Is there a way in Jupyter to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # Rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (e.g. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something upstream escaped the *, putting a \\ in front of it; undo that here\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
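+                "# retry_hints: binary name -> stderr substrings that indicate a transient\n",
+                "# fault worth retrying (run() retries up to MAX_RETRIES times)\n",
+                "# error_hints: binary name -> [substring, notebook title, notebook path]\n",
+                "# entries; a matching stderr line displays a hyperlinked HINT\n",
+                "# install_hint: binary name -> [notebook title, notebook path], shown when\n",
+                "# the executable cannot be found on the PATH\n",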
+                "retry_hints = {'azdata': ['Endpoint sql-server-master does not exist', 'Endpoint livy does not exist', 'Failed to get state for cluster', 'Endpoint webhdfs does not exist', 'Adaptive Server is unavailable or does not exist', 'Error: Address already in use']}\n",
+                "error_hints = {'azdata': [['azdata login', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['The token is expired', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Reason: Unauthorized', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Max retries exceeded with url: /api/v1/bdc/endpoints', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Look at the controller logs for more details', 'TSG027 - Observe cluster deployment', '../diagnose/tsg027-observe-bdc-create.ipynb'], ['provided port is already allocated', 'TSG062 - Get tail of all previous container logs for pods in BDC namespace', '../log-files/tsg062-tail-bdc-previous-container-logs.ipynb'], ['Create cluster failed since the existing namespace', 'SOP061 - Delete a big data cluster', '../install/sop061-delete-bdc.ipynb'], ['Failed to complete kube config setup', 'TSG067 - Failed to complete kube config setup', '../repair/tsg067-failed-to-complete-kube-config-setup.ipynb'], ['Error processing command: \"ApiError', 'TSG110 - Azdata returns ApiError', '../repair/tsg110-azdata-returns-apierror.ipynb'], ['Error processing command: \"ControllerError', 'TSG036 - Controller logs', '../log-analyzers/tsg036-get-controller-logs.ipynb'], ['ERROR: 500', 'TSG046 - Knox gateway logs', '../log-analyzers/tsg046-get-knox-logs.ipynb'], ['Data source name not found and no default driver specified', 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], [\"Can't open lib 'ODBC Driver 17 for SQL Server\", 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], ['Control plane upgrade failed. Failed to upgrade controller.', 'TSG108 - View the controller upgrade config map', '../diagnose/tsg108-controller-failed-to-upgrade.ipynb']]}\n",
+                "install_hint = {'azdata': ['SOP055 - Install azdata command line interface', '../install/sop055-install-azdata.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Use azdata to show the Big Data Cluster controller status"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
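+                "# Show the health state of the control plane services in the Big Data Cluster.\n",
+                "# If this fails with 'Reason: Unauthorized' or 'The token is expired', the\n",
+                "# hints above link to SOP028 - azdata login (../common/sop028-azdata-login.ipynb)\n",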
+                "run('azdata bdc control status show')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 332 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg068-azdata-bdc-hdfs-status.ipynb

@@ -0,0 +1,332 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG068 - Show BDC HDFS status\n",
+                "=============================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
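+                "    # NOTE: on a transient fault (see retry_hints below), run() re-invokes\n",
+                "    # itself with an incremented retry_count, up to MAX_RETRIES attempts\n",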
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (e.g. kubectl, python) to support binary-context-aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportability, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens; it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path-based location (shutil.which) of the executable that will be run (and display it to aid supportability); this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd (otherwise Popen raises FileNotFoundError).\n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary is None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary is None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which can cause Jupyter to hang forever; to\n",
+                "    # work around this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around an infinite hang when a notebook generates a non-zero return code: break out and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile(r'  \"(.*)\": \"(.*)\"')\n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp; don't\n",
+                "            # print an empty \"STDERR:\" line, as it is confusing.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid the infinite hang above in the `azdata notebook run` failure case by inferring success (from stdout output), so\n",
+                "    # don't wait here if success is already known\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg068-azdata-bdc-hdfs-status.ipynb\")\n",
+                "\n",
+                "    except Exception:\n",
+                "        pass # If the user has renamed the notebook, we can't load ourselves.  NOTE: Is there a way in Jupyter to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # Rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (e.g. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something upstream escaped the *, putting a \\ in front of it; undo that here\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
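+                "# retry_hints: binary name -> stderr substrings that indicate a transient\n",
+                "# fault worth retrying (run() retries up to MAX_RETRIES times)\n",
+                "# error_hints: binary name -> [substring, notebook title, notebook path]\n",
+                "# entries; a matching stderr line displays a hyperlinked HINT\n",
+                "# install_hint: binary name -> [notebook title, notebook path], shown when\n",
+                "# the executable cannot be found on the PATH\n",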
+                "retry_hints = {'azdata': ['Endpoint sql-server-master does not exist', 'Endpoint livy does not exist', 'Failed to get state for cluster', 'Endpoint webhdfs does not exist', 'Adaptive Server is unavailable or does not exist', 'Error: Address already in use']}\n",
+                "error_hints = {'azdata': [['azdata login', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['The token is expired', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Reason: Unauthorized', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Max retries exceeded with url: /api/v1/bdc/endpoints', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Look at the controller logs for more details', 'TSG027 - Observe cluster deployment', '../diagnose/tsg027-observe-bdc-create.ipynb'], ['provided port is already allocated', 'TSG062 - Get tail of all previous container logs for pods in BDC namespace', '../log-files/tsg062-tail-bdc-previous-container-logs.ipynb'], ['Create cluster failed since the existing namespace', 'SOP061 - Delete a big data cluster', '../install/sop061-delete-bdc.ipynb'], ['Failed to complete kube config setup', 'TSG067 - Failed to complete kube config setup', '../repair/tsg067-failed-to-complete-kube-config-setup.ipynb'], ['Error processing command: \"ApiError', 'TSG110 - Azdata returns ApiError', '../repair/tsg110-azdata-returns-apierror.ipynb'], ['Error processing command: \"ControllerError', 'TSG036 - Controller logs', '../log-analyzers/tsg036-get-controller-logs.ipynb'], ['ERROR: 500', 'TSG046 - Knox gateway logs', '../log-analyzers/tsg046-get-knox-logs.ipynb'], ['Data source name not found and no default driver specified', 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], [\"Can't open lib 'ODBC Driver 17 for SQL Server\", 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], ['Control plane upgrade failed. Failed to upgrade controller.', 'TSG108 - View the controller upgrade config map', '../diagnose/tsg108-controller-failed-to-upgrade.ipynb']]}\n",
+                "install_hint = {'azdata': ['SOP055 - Install azdata command line interface', '../install/sop055-install-azdata.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Use azdata to show the Big Data Cluster HDFS status"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
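+                "# Show the health state of the HDFS service in the Big Data Cluster.\n",
+                "# If this fails with 'Endpoint webhdfs does not exist', run() retries the\n",
+                "# command (it is listed in retry_hints above as a transient fault)\n",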
+                "run('azdata bdc hdfs status show')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 332 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg069-azdata-bdc-gateway-status.ipynb

@@ -0,0 +1,332 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG069 - Show Big Data Cluster Gateway status\n",
+                "=============================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, hyperlinked suggestions, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg069-azdata-bdc-gateway-status.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'azdata': ['Endpoint sql-server-master does not exist', 'Endpoint livy does not exist', 'Failed to get state for cluster', 'Endpoint webhdfs does not exist', 'Adaptive Server is unavailable or does not exist', 'Error: Address already in use']}\n",
+                "error_hints = {'azdata': [['azdata login', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['The token is expired', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Reason: Unauthorized', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Max retries exceeded with url: /api/v1/bdc/endpoints', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Look at the controller logs for more details', 'TSG027 - Observe cluster deployment', '../diagnose/tsg027-observe-bdc-create.ipynb'], ['provided port is already allocated', 'TSG062 - Get tail of all previous container logs for pods in BDC namespace', '../log-files/tsg062-tail-bdc-previous-container-logs.ipynb'], ['Create cluster failed since the existing namespace', 'SOP061 - Delete a big data cluster', '../install/sop061-delete-bdc.ipynb'], ['Failed to complete kube config setup', 'TSG067 - Failed to complete kube config setup', '../repair/tsg067-failed-to-complete-kube-config-setup.ipynb'], ['Error processing command: \"ApiError', 'TSG110 - Azdata returns ApiError', '../repair/tsg110-azdata-returns-apierror.ipynb'], ['Error processing command: \"ControllerError', 'TSG036 - Controller logs', '../log-analyzers/tsg036-get-controller-logs.ipynb'], ['ERROR: 500', 'TSG046 - Knox gateway logs', '../log-analyzers/tsg046-get-knox-logs.ipynb'], ['Data source name not found and no default driver specified', 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], [\"Can't open lib 'ODBC Driver 17 for SQL Server\", 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], ['Control plane upgrade failed. Failed to upgrade controller.', 'TSG108 - View the controller upgrade config map', '../diagnose/tsg108-controller-failed-to-upgrade.ipynb']]}\n",
+                "install_hint = {'azdata': ['SOP055 - Install azdata command line interface', '../install/sop055-install-azdata.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Use azdata to show the Big Data Cluster gateway status"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('azdata bdc gateway status show')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

File diff suppressed because it is too large
+ 311 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-bdc/tsg070-use-azdata-sql-query.ipynb


+ 51 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/readme.md

@@ -0,0 +1,51 @@
+# SQL Server Big Data Cluster Kubernetes monitoring notebooks
+
+- This chapter contains a set of notebooks useful for getting information and status about the Kubernetes cluster hosting a SQL Server Big Data Cluster (BDC).
+
+[Home](../readme.md)
+
+## Notebooks in this Chapter
+- [TSG021 - Get cluster info (Kubernetes)](tsg021-get-k8s-cluster-info.ipynb)
+
+- [TSG008 - Get version information (Kubernetes)](tsg008-get-k8s-version-info.ipynb)
+
+- [TSG081 - Get namespaces (Kubernetes)](tsg081-get-kubernetes-namespaces.ipynb)
+
+- [TSG009 - Get nodes (Kubernetes)](tsg009-get-nodes.ipynb)
+
+- [TSG006 - Get system pod status](tsg006-view-system-pod-status.ipynb)
+
+- [TSG007 - Get BDC pod status](tsg007-view-bdc-pod-status.ipynb)
+
+- [TSG015 - View BDC services (Kubernetes)](tsg015-view-k8s-services-for-bdc.ipynb)
+
+- [TSG097 - Get BDC stateful sets (Kubernetes)](tsg097-get-statefulsets.ipynb)
+
+- [TSG098 - Get BDC replicasets (Kubernetes)](tsg098-get-replicasets.ipynb)
+
+- [TSG099 - Get BDC daemonsets (Kubernetes)](tsg099-get-daemonsets.ipynb)
+
+- [TSG023 - Get all BDC objects (Kubernetes)](tsg023-run-kubectl-get-all.ipynb)
+
+- [TSG063 - Get storage classes (Kubernetes)](tsg063-get-storage-classes.ipynb)
+
+- [TSG072 - Get Persistent Volumes (Kubernetes)](tsg072-get-persistent-volumes.ipynb)
+
+- [TSG064 - Get BDC Persistent Volume Claims](tsg064-get-persistent-volume-claims.ipynb)
+
+- [TSG065 - Get BDC secrets (Kubernetes)](tsg065-get-secrets-for-bdc-namespace.ipynb)
+
+- [TSG066 - Get BDC events (Kubernetes)](tsg066-get-kubernetes-events.ipynb)
+
+- [TSG020 - Describe nodes (Kubernetes)](tsg020-describe-all-nodes.ipynb)
+
+- [TSG016 - Describe BDC pods](tsg016-describe-all-pods-in-bdc-namespace.ipynb)
+
+- [TSG089 - Describe BDC non-running pods](tsg089-describe-non-running-pods-in-bdc.ipynb)
+
+- [TSG010 - Get configuration contexts](tsg010-get-kubernetes-contexts.ipynb)
+
+- [TSG022 - Get external IP address for kubeadm host](tsg022-get-external-ip-of-kubeadm-host.ipynb)
+
+- [TSG042 - Get `node name` and external mounts for `Data` and `Logs` `PVCs`](tsg042-get-hosting-node-and-data-log-mount.ipynb)
+

+ 351 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg006-view-system-pod-status.ipynb

@@ -0,0 +1,351 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG006 - Get system pod status\n",
+                "==============================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "View the status of all system pods.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg006-view-system-pod-status.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Show the kube-system pods"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('kubectl get pods -n kube-system -o wide')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Related\n",
+                "-------\n",
+                "\n",
+                "-   [TSG007 - Get BDC pod\n",
+                "    status](../monitor-k8s/tsg007-view-bdc-pod-status.ipynb)\n",
+                "\n",
+                "-   [TSG009 - Get nodes\n",
+                "    (Kubernetes)](../monitor-k8s/tsg009-get-nodes.ipynb)"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 390 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg007-view-bdc-pod-status.ipynb

@@ -0,0 +1,390 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG007 - Get BDC pod status\n",
+                "===========================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "View the big data cluster pods status.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg007-view-bdc-pod-status.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Kubernetes namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster use the kubectl command line\n",
+                "interface .\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = run(f'kubectl get namespace --selector=MSSQL_CLUSTER -o jsonpath={{.items[0].metadata.name}}', return_output=True)\n",
+                "except:\n",
+                "    from IPython.display import Markdown\n",
+                "    print(f\"ERROR: Unable to find a Kubernetes namespace with label 'MSSQL_CLUSTER'.  SQL Server Big Data Cluster Kubernetes namespaces contain the label 'MSSQL_CLUSTER'.\")\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "else:\n",
+                "    print(f'The SQL Server Big Data Cluster Kubernetes namespace is: {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Show the big data cluster pods"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl get pods -n {namespace} -o wide')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Related\n",
+                "-------\n",
+                "\n",
+                "-   [TSG006 - Get system pod\n",
+                "    status](../monitor-k8s/tsg006-view-system-pod-status.ipynb)\n",
+                "\n",
+                "-   [TSG009 - Get nodes\n",
+                "    (Kubernetes)](../monitor-k8s/tsg009-get-nodes.ipynb)"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 337 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg008-get-k8s-version-info.ipynb

@@ -0,0 +1,337 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG008 - Get version information (Kubernetes)\n",
+                "=============================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Get the Kubernetes cluster-info\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg008-get-k8s-version-info.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Kubernetes version info"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('kubectl version -o yaml')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 351 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg009-get-nodes.ipynb

@@ -0,0 +1,351 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG009 - Get nodes (Kubernetes)\n",
+                "===============================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Get the kubernetes nodes details\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg009-get-nodes.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Show the Kubernetes nodes"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('kubectl get nodes')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Related\n",
+                "-------\n",
+                "\n",
+                "-   [TSG006 - Get system pod\n",
+                "    status](../monitor-k8s/tsg006-view-system-pod-status.ipynb)\n",
+                "\n",
+                "-   [TSG007 - Get BDC pod\n",
+                "    status](../monitor-k8s/tsg007-view-bdc-pod-status.ipynb)"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 348 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg010-get-kubernetes-contexts.ipynb

@@ -0,0 +1,348 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG010 - Get configuration contexts\n",
+                "===================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Get the kubernetes contexts\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg010-get-kubernetes-contexts.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Show the Kubernetes config contexts"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('kubectl config get-contexts')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Related\n",
+                "-------\n",
+                "\n",
+                "-   [SOP011 - Set kubernetes configuration\n",
+                "    context](../common/sop011-set-kubernetes-context.ipynb)"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 371 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg015-view-k8s-services-for-bdc.ipynb

@@ -0,0 +1,371 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG015 - View BDC services (Kubernetes)\n",
+                "=======================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg015-view-k8s-services-for-bdc.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Kubernetes namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster use the kubectl command line\n",
+                "interface .\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = run(f'kubectl get namespace --selector=MSSQL_CLUSTER -o jsonpath={{.items[0].metadata.name}}', return_output=True)\n",
+                "except:\n",
+                "    from IPython.display import Markdown\n",
+                "    print(f\"ERROR: Unable to find a Kubernetes namespace with label 'MSSQL_CLUSTER'.  SQL Server Big Data Cluster Kubernetes namespaces contain the label 'MSSQL_CLUSTER'.\")\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "else:\n",
+                "    print(f'The SQL Server Big Data Cluster Kubernetes namespace is: {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Use kubectl to view the services"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl get svc -n {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 418 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg016-describe-all-pods-in-bdc-namespace.ipynb

@@ -0,0 +1,418 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG016- Describe BDC pods\n",
+                "=========================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg016-describe-all-pods-in-bdc-namespace.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Describe all pods"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "pod_list = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "pod_names = [pod.metadata.name for pod in pod_list.items]\n",
+                "\n",
+                "print('Describing pods: ' + ', '.join(pod_names))\n",
+                "\n",
+                "for pod in pod_list.items:\n",
+                "    try:\n",
+                "        run(f'kubectl describe pod/{pod.metadata.name} -n {namespace}')\n",
+                "    except Exception:\n",
+                "        print (f\"Failed to describe pod: {pod.metadata.name}\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 371 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg020-describe-all-nodes.ipynb

@@ -0,0 +1,371 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG020- Describe nodes (Kubernetes)\n",
+                "===================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg020-describe-all-nodes.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Kubernetes namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster use the kubectl command line\n",
+                "interface .\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = run(f'kubectl get namespace --selector=MSSQL_CLUSTER -o jsonpath={{.items[0].metadata.name}}', return_output=True)\n",
+                "except:\n",
+                "    from IPython.display import Markdown\n",
+                "    print(f\"ERROR: Unable to find a Kubernetes namespace with label 'MSSQL_CLUSTER'.  SQL Server Big Data Cluster Kubernetes namespaces contain the label 'MSSQL_CLUSTER'.\")\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "else:\n",
+                "    print(f'The SQL Server Big Data Cluster Kubernetes namespace is: {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Describe all nodes"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl describe nodes')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}
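
Each notebook repeats the same common-functions cell; the heart of its transient-fault
handling is: run the command, scan stderr for known transient-fault strings, and
re-invoke up to MAX_RETRIES. A condensed, standalone sketch of that pattern (the hint
string is abbreviated from the kubectl entry above, and it assumes kubectl is in the
PATH):

    # Condensed sketch of the retry-on-transient-fault pattern in `run`.
    import shlex
    import subprocess

    MAX_RETRIES = 5
    retry_hints = {"kubectl": ["A connection attempt failed"]}  # abbreviated

    def run_with_retry(cmd, retry_count=0):
        cmd_actual = shlex.split(cmd)
        proc = subprocess.run(cmd_actual, capture_output=True, text=True)
        for hint in retry_hints.get(cmd_actual[0].lower(), []):
            if hint in proc.stderr and retry_count < MAX_RETRIES:
                print(f"RETRY: {retry_count} (due to: {hint})")
                return run_with_retry(cmd, retry_count + 1)
        if proc.returncode != 0:
            raise SystemExit(f"'{cmd}' returned non-zero exit code: {proc.returncode}")
        return proc.stdout

    print(run_with_retry("kubectl cluster-info"), end="")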

+ 337 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg021-get-k8s-cluster-info.ipynb

@@ -0,0 +1,337 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG021 - Get cluster info (Kubernetes)\n",
+                "======================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Get the Kubernetes cluster-info\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg021-get-k8s-cluster-info.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Kubernetes cluster information"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('kubectl cluster-info')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}
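
The load_rules/apply_expert_rules pair reads rules out of the notebook's own metadata
and regex-matches each output line, emitting a HINT link to a repair notebook on a
hit. A minimal sketch with one illustrative 9-element rule (field layout per the
comments in the cell above; the sample rule itself is invented for illustration):

    # Sketch of the expert-rules matching flow; the sample rule is illustrative.
    import re

    # layout: [priority, notebook, cell_type, output_type, output_type_name,
    #          output_type_value, details_name, expression, <expansion marker>]
    rules = [
        [0, "../repair/tsg056-kubectl-no-connection-could-be-made.ipynb",
         "code", "stream", "name", "stdout", "text",
         ".*actively refused it.*", "injected"],
    ]
    rules.sort()  # priority order: lowest [0] element first

    def apply_expert_rules(line):
        for rule in rules:
            # only the expanded 9-element (output) rules are evaluated
            if len(rule) == 9 and re.match(rule[7], line, re.DOTALL):
                print(f"HINT: Use [{rule[1]}]({rule[1]}) to resolve this issue.")

    apply_expert_rules("No connection could be made because the target "
                       "machine actively refused it")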

+ 337 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg022-get-external-ip-of-kubeadm-host.ipynb

@@ -0,0 +1,337 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG022 - Get external IP address for kubeadm host\n",
+                "=================================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Get the external IP address of the host of kubeadm\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg022-get-external-ip-of-kubeadm-host.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get IP address"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('kubectl get node --selector=\"node-role.kubernetes.io/master\" -o=jsonpath=\"{.items[0].status.addresses[0].address}\"')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}
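
TSG022 above reads only .items[0].status.addresses[0].address; on clusters with
several master nodes or several address types, a jsonpath range lists them all. A
sketch (assumes kubectl is in the PATH and a kubeadm-style cluster; the output format
is illustrative):

    # Sketch: list every address of every master node, generalizing the
    # single-value jsonpath used in TSG022.
    import subprocess

    jsonpath = ("{range .items[*]}{.metadata.name}{': '}"
                "{.status.addresses[*].address}{'\\n'}{end}")
    cmd = ["kubectl", "get", "node",
           "--selector=node-role.kubernetes.io/master",
           "-o", f"jsonpath={jsonpath}"]
    print(subprocess.run(cmd, capture_output=True, text=True, check=True).stdout)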

+ 393 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg023-run-kubectl-get-all.ipynb

@@ -0,0 +1,393 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG023 - Get all BDC objects (Kubernetes)\n",
+                "=========================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Get a summary of all Kubernetes resources for the system namespace and\n",
+                "the Big Data Cluster namespace\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg023-run-kubectl-get-all.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Run kubectl get all for the system namespace"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(\"kubectl get all\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Kubernetes namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster use the kubectl command line\n",
+                "interface .\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = run(f'kubectl get namespace --selector=MSSQL_CLUSTER -o jsonpath={{.items[0].metadata.name}}', return_output=True)\n",
+                "except:\n",
+                "    from IPython.display import Markdown\n",
+                "    print(f\"ERROR: Unable to find a Kubernetes namespace with label 'MSSQL_CLUSTER'.  SQL Server Big Data Cluster Kubernetes namespaces contain the label 'MSSQL_CLUSTER'.\")\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "else:\n",
+                "    print(f'The SQL Server Big Data Cluster Kubernetes namespace is: {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Run kubectl get all for the Big Data Cluster namespace"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f\"kubectl get all -n {namespace}\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 492 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg042-get-hosting-node-and-data-log-mount.ipynb

@@ -0,0 +1,485 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG042 - Get `node name` and external mounts for `Data` and `Logs` `PVCs`\n",
+                "=========================================================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Get `node name` hosting `pod` along with the `Data` and `Logs` external\n",
+                "mounts.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "pod_name = \"master-0\""
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg042-get-hosting-node-and-data-log-mount.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get `PersistentVolumeClaim` reference for `Data` and `Logs`"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "data_claim = run(f'kubectl get pod {pod_name} --namespace {namespace} --output jsonpath=\"{{range .spec.volumes[?(@.name==\\'data\\')]}}  {{.persistentVolumeClaim.claimName}} {{end}}\"', return_output=True)\n",
+                "logs_claim = run(f'kubectl get pod {pod_name} --namespace {namespace} --output jsonpath=\"{{range .spec.volumes[?(@.name==\\'logs\\')]}}  {{.persistentVolumeClaim.claimName}} {{end}}\"', return_output=True)\n",
+                "\n",
+                "print (f\"Data claim: {data_claim}.  Logs claim: {logs_claim}\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get Kubernetes agent `node` hosting the Pod"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl get pod {pod_name} --namespace {namespace} --output jsonpath=\"{{.spec.nodeName}}\"')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get external mount for `Data`"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl get pv --namespace {namespace} --output jsonpath=\"{{range .items[?(@.spec.claimRef.name==\\'{data_claim.strip()}\\')]}}{{.spec.local.path}} {{.spec.azureDisk.diskURI}}\"')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get external mount for `Logs`"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl get pv --namespace {namespace} --output jsonpath=\"{{range .items[?(@.spec.claimRef.name==\\'{logs_claim.strip()}\\')]}}{{.spec.local.path}} {{.spec.azureDisk.diskURI}}\"')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}

+ 337 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg063-get-storage-classes.ipynb

@@ -0,0 +1,337 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG063 - Get storage classes (Kubernetes)\n",
+                "=========================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Get the Kubernetes storage classes available in the cluster\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg063-get-storage-classes.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Show the Kubernetes storage classes"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('kubectl get sc')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 376 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg064-get-persistent-volume-claims.ipynb

@@ -0,0 +1,376 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG064 - Get BDC Persistent Volume Claims\n",
+                "=========================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Show the persistent volume claims (PVCs) for the Big Data Cluster\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg064-get-persistent-volume-claims.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Kubernetes namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster use the kubectl command line\n",
+                "interface .\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = run(f'kubectl get namespace --selector=MSSQL_CLUSTER -o jsonpath={{.items[0].metadata.name}}', return_output=True)\n",
+                "except:\n",
+                "    from IPython.display import Markdown\n",
+                "    print(f\"ERROR: Unable to find a Kubernetes namespace with label 'MSSQL_CLUSTER'.  SQL Server Big Data Cluster Kubernetes namespaces contain the label 'MSSQL_CLUSTER'.\")\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "else:\n",
+                "    print(f'The SQL Server Big Data Cluster Kubernetes namespace is: {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Run kubectl to display the PVCs"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f\"kubectl get pvc -n {namespace}\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 376 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg065-get-secrets-for-bdc-namespace.ipynb

@@ -0,0 +1,376 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG065 - Get BDC secrets (Kubernetes)\n",
+                "=====================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "View the big data cluster secrets\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg065-get-secrets-for-bdc-namespace.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Kubernetes namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster use the kubectl command line\n",
+                "interface .\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = run(f'kubectl get namespace --selector=MSSQL_CLUSTER -o jsonpath={{.items[0].metadata.name}}', return_output=True)\n",
+                "except:\n",
+                "    from IPython.display import Markdown\n",
+                "    print(f\"ERROR: Unable to find a Kubernetes namespace with label 'MSSQL_CLUSTER'.  SQL Server Big Data Cluster Kubernetes namespaces contain the label 'MSSQL_CLUSTER'.\")\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "else:\n",
+                "    print(f'The SQL Server Big Data Cluster Kubernetes namespace is: {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Show the big data cluster Kubernetes Secret Store entries"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl get secrets -n {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 408 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg066-get-kubernetes-events.ipynb

@@ -0,0 +1,408 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG066 - Get BDC event (Kubernetes)\n",
+                "===================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "View the big data cluster secrets\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg066-get-kubernetes-events.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Kubernetes namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster use the kubectl command line\n",
+                "interface .\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = run(f'kubectl get namespace --selector=MSSQL_CLUSTER -o jsonpath={{.items[0].metadata.name}}', return_output=True)\n",
+                "except:\n",
+                "    from IPython.display import Markdown\n",
+                "    print(f\"ERROR: Unable to find a Kubernetes namespace with label 'MSSQL_CLUSTER'.  SQL Server Big Data Cluster Kubernetes namespaces contain the label 'MSSQL_CLUSTER'.\")\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "else:\n",
+                "    print(f'The SQL Server Big Data Cluster Kubernetes namespace is: {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Show the Kubernetes events for the Big Data Cluster namespace"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl get events -n {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Show the Kubernetes events for the system namespace"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl get events -n kube-system')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Show the Kubernetes events in the default namespace"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f'kubectl get events')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 338 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg072-get-persistent-volumes.ipynb

@@ -0,0 +1,338 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG072 - Get Persistent Volumes (Kubernetes)\n",
+                "============================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Show the persistent volume (PVs) for the Kubernetes cluster. Persistent\n",
+                "Volumes are non-namespaces objects.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg072-get-persistent-volumes.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Run kubectl to display the PVs"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f\"kubectl get pv\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 356 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb

@@ -0,0 +1,356 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG081 - Get namespaces (Kubernetes)\n",
+                "====================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Get the kubernetes namespaces\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportabilty, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary == None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary == None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
+                "    # workaround this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
+                "            # print this empty \"ERR:\" as it confuses.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround !=0 :\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg081-get-kubernetes-namespaces.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the book, we can't load ourself.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Show the Kubernetes namespaces"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('kubectl get namespace')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Show the Kubernetes namespaces with labels\n",
+                "\n",
+                "Kubernetes namespaces containing a SQL Server Big Data Cluster have the\n",
+                "label \u2018MSSQL\\_CLUSTER\u2019"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run('kubectl get namespaces -o custom-columns=NAME:.metadata.name,STATUS:.status.phase,LABELS:.metadata.labels')"
+            ]
+        },
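+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "For reference, the label can also be used as a selector, so that only\n",
+                "the namespaces that contain a big data cluster are listed. This is a\n",
+                "minimal sketch using the `run` helper defined above:"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "# List only the namespaces that carry the MSSQL_CLUSTER label\n",
+                "run('kubectl get namespaces --selector=MSSQL_CLUSTER')"
+            ]
+        },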
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 436 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg089-describe-non-running-pods-in-bdc.ipynb

@@ -0,0 +1,436 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG089 - Describe BDC non-running pods\n",
+                "======================================\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
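+                "# Pods whose status.phase differs from this value are described below\n",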
+                "phase = 'Running'"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportability, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path-based location (shutil.which) of the executable that will be run (and display it to aid supportability); this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound)\n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary is None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary is None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which cause Jupyter to hang forever; to\n",
+                "    # work around this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around an infinite hang when a notebook generates a non-zero return code: break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\": \"(.*)\"')\n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp; don't\n",
+                "            # print an empty \"STDERR:\" line, as it is confusing.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg089-describe-non-running-pods-in-bdc.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the notebook, we can't load ourselves.  NOTE: Is there a way in Jupyter to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Un-escape any '*' that a rule writer escaped with a leading backslash\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' is satisfied, run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{3}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kubernetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
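+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "As a quick sanity check before describing anything, the status of\n",
+                "every pod in the namespace can be listed first; a minimal sketch\n",
+                "using the `run` helper defined above:"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "# Show the status of every pod in the BDC namespace\n",
+                "run(f'kubectl get pods -n {namespace}')"
+            ]
+        },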
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Describe all non-running pods"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
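+                "# List the pods in the BDC namespace, and describe those not in the expected phase\n",
+                "\n",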
+                "pod_list = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "pod_names = [pod.metadata.name for pod in pod_list.items]\n",
+                "\n",
+                "for pod in pod_list.items:\n",
+                "    if pod.status.phase != phase:\n",
+                "        run(f'kubectl describe pod/{pod.metadata.name} -n {namespace}')\n",
+                "    else:\n",
+                "        print(f\"SKIPPING: {pod.metadata.name}, its status is equal to {phase} ({pod.status.phase})\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 374 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg097-get-statefulsets.ipynb

@@ -0,0 +1,374 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG097 - Get BDC stateful sets (Kubernetes)\n",
+                "===========================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportability, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path-based location (shutil.which) of the executable that will be run (and display it to aid supportability); this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound)\n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary is None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary is None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which cause Jupyter to hang forever; to\n",
+                "    # work around this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around an infinite hang when a notebook generates a non-zero return code: break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\": \"(.*)\"')\n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp; don't\n",
+                "            # print an empty \"STDERR:\" line, as it is confusing.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
+                "    # don't wait here, if success known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg097-get-statefulsets.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the notebook, we can't load ourselves.  NOTE: Is there a way in Jupyter to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Un-escape any '*' that a rule writer escaped with a leading backslash\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' is satisfied, run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{3}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Kubernetes namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster using the kubectl command line\n",
+                "interface.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = run(f'kubectl get namespace --selector=MSSQL_CLUSTER -o jsonpath={{.items[0].metadata.name}}', return_output=True)\n",
+                "except:\n",
+                "    from IPython.display import Markdown\n",
+                "    print(f\"ERROR: Unable to find a Kubernetes namespace with label 'MSSQL_CLUSTER'.  SQL Server Big Data Cluster Kubernetes namespaces contain the label 'MSSQL_CLUSTER'.\")\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "else:\n",
+                "    print(f'The SQL Server Big Data Cluster Kubernetes namespace is: {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Run kubectl to display the stateful sets"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
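+                "# -o wide also shows the containers and images used by each stateful set\n",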
+                "run(f\"kubectl get statefulset -n {namespace} -o wide\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 374 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg098-get-replicasets.ipynb

@@ -0,0 +1,374 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG098 - Get BDC replicasets (Kubernetes)\n",
+                "=========================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportability, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path-based location (shutil.which) of the executable that will be run (and display it to aid supportability); this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound)\n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary is None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary is None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which cause Jupyter to hang forever; to\n",
+                "    # work around this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around an infinite hang when a notebook generates a non-zero return code: break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\": \"(.*)\"')\n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp; don't\n",
+                "            # print an empty \"STDERR:\" line, as it is confusing.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid an infinite hang above in the `azdata notebook run` failure case by inferring success (from stdout output), so\n",
+                "    # don't wait here if success is known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
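+                "# Example usage (illustrative only; any executable on the PATH works):\n",
+                "#\n",
+                "#   run('kubectl version --client')                                  # stream output to the notebook\n",
+                "#   ctx = run('kubectl config current-context', return_output=True)  # capture output in a variable\n",
+                "\n",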
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg098-get-replicasets.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the notebook, we can't load ourselves.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
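+                "        #\n",
+                "        # Shape inferred from the indexing below (rule[8] is not read here):\n",
+                "        #   rule[0]=priority, rule[1]=notebook, rule[2]=cell_type, rule[3]=output_type,\n",
+                "        #   rule[4]=output_type_name, rule[5]=output_type_value, rule[6]=details_name, rule[7]=regex\n",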
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
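+                "# Structure (as consumed by run() above):\n",
+                "#   retry_hints:  exe -> list of stderr substrings that trigger a retry\n",
+                "#   error_hints:  exe -> list of [substring, hint title, hint notebook path]\n",
+                "#   install_hint: exe -> [hint title, install notebook path]\n",
+                "#\n",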
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Kubernetes namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster using the kubectl command\n",
+                "line interface.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
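+                "# The '--selector=MSSQL_CLUSTER' flag filters to namespaces labelled 'MSSQL_CLUSTER',\n",
+                "# and the jsonpath expression takes the first ([0]) matching namespace name\n",
+                "\n",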
+                "try:\n",
+                "    namespace = run(f'kubectl get namespace --selector=MSSQL_CLUSTER -o jsonpath={{.items[0].metadata.name}}', return_output=True)\n",
+                "except:\n",
+                "    from IPython.display import Markdown\n",
+                "    print(f\"ERROR: Unable to find a Kubernetes namespace with label 'MSSQL_CLUSTER'.  SQL Server Big Data Cluster Kubernetes namespaces contain the label 'MSSQL_CLUSTER'.\")\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "else:\n",
+                "    print(f'The SQL Server Big Data Cluster Kubernetes namespace is: {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Run kubectl to display the replica sets"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f\"kubectl get replicaset -n {namespace} -o wide\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 377 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/monitor-k8s/tsg099-get-daemonsets.ipynb

@@ -0,0 +1,377 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG099 - Get BDC daemonsets (Kubernetes)\n",
+                "========================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "Show the daemon sets for the SQL Server Big Data Cluster Kubernetes\n",
+                "namespace.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportability, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary is None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary is None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which cause Jupyter to hang forever. To\n",
+                "    # work around this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around an infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp; don't\n",
+                "            # print this empty \"ERR:\" line as it is confusing.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid an infinite hang above in the `azdata notebook run` failure case by inferring success (from stdout output), so\n",
+                "    # don't wait here if success is known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
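+                "# Illustrative example: commands that emit scrolling progress bars (e.g. curl)\n",
+                "# can hang Jupyter, so suppress output capture for them:\n",
+                "#\n",
+                "#   run('curl https://example.com', no_output=True)\n",
+                "\n",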
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg099-get-daemonsets.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the notebook, we can't load ourselves.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the Kubernetes namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster using the kubectl command\n",
+                "line interface.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = run(f'kubectl get namespace --selector=MSSQL_CLUSTER -o jsonpath={{.items[0].metadata.name}}', return_output=True)\n",
+                "except:\n",
+                "    from IPython.display import Markdown\n",
+                "    print(f\"ERROR: Unable to find a Kubernetes namespace with label 'MSSQL_CLUSTER'.  SQL Server Big Data Cluster Kubernetes namespaces contain the label 'MSSQL_CLUSTER'.\")\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "else:\n",
+                "    print(f'The SQL Server Big Data Cluster Kubernetes namespace is: {namespace}')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Run kubectl to display the daemon sets"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "run(f\"kubectl get daemonset -n {namespace} -o wide\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 20 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/readme.md

@@ -0,0 +1,20 @@
+# Operations and Support Jupyter Book - SQL Server 2019 Big Data Clusters
+
+This Jupyter Book of executable notebooks (.ipynb) is a companion for SQL Server 2019 to assist in operating and supporting Big Data Clusters.
+
+Each notebook is designed to check for its own dependencies.  A 'run all cells' will either complete successfully or raise an exception with a hyperlinked 'HINT' to another notebook that resolves the missing dependency.  Follow the 'HINT' hyperlink to that notebook, press 'run all cells', and on success return to the original notebook and 'run all cells' again.
+
+If 'run all cells' fails even after all dependencies are installed, each notebook will analyze the results and, where possible, produce a hyperlinked 'HINT' to another notebook to further aid in resolving the issue.
+
+## Chapters
+
+1. [Troubleshooters](troubleshooters/readme.md) - notebooks hyperlinked from the `Big Data Cluster Dashboard` in `Azure Data Studio`.
+2. [Log Analyzers](log-analyzers/readme.md) - notebooks linked from the troubleshooters that get and analyze logs for known issues.
+3. [Diagnose](diagnose/readme.md) - notebooks for diagnosing situations with a Big Data Cluster.
+4. [Repair](repair/readme.md) - notebooks to perform repair actions for known issues in a Big Data Cluster.
+5. [Monitor Big Data Cluster](monitor-bdc/readme.md) - notebooks for monitoring the Big Data Cluster using the `azdata` command line tool.
+6. [Monitor Kubernetes](monitor-k8s/readme.md) - notebooks for monitoring the Kubernetes cluster hosting a Big Data Cluster.
+7. [Logs](log-files/readme.md) - notebooks for displaying log files from a Big Data Cluster.
+8. [Sample](sample/readme.md) - notebooks demonstrating Big Data Cluster features and functionality.
+9. [Install](install/readme.md) - notebooks to install prerequisites for other notebooks.
+10. [Common](common/readme.md) - notebooks commonly linked from other notebooks, such as azdata login / logout.

+ 33 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/repair/readme.md

@@ -0,0 +1,33 @@
+# Repair
+
+- The notebooks in this chapter are for repairing known situations and states of a SQL Server Big Data Cluster.
+
+[Home](../readme.md)
+
+## Notebooks in this Chapter
+- [TSG024 - Namenode is in safe mode](tsg024-name-node-is-in-safe-mode.ipynb)
+
+- [TSG041 - Unable to create a new asynchronous I/O context (increase sysctl fs.aio-max-nr)](tsg041-increase-fs-aio-max-nr.ipynb)
+
+- [TSG048 - Deployment stuck at "Waiting for controller pod to be up"](tsg048-create-stuck-waiting-for-controller.ipynb)
+
+- [TSG038 - BDC create failures due to - doc is missing key](tsg038-doc-is-missing-key-error.ipynb)
+
+- [TSG047 - ConfigException - Expected only one object with name](tsg047-expected-only-one-object-with-name.ipynb)
+
+- [TSG050 - Cluster create hangs with "timeout expired waiting for volumes to attach or mount for pod"](tsg050-timeout-expired-waiting-for-volumes.ipynb)
+
+- [TSG057 - Failed when starting controller service. System.TimeoutException](tsg057-failed-when-starting-controller.ipynb)
+
+- [TSG067 - Failed to complete kube config setup](tsg067-failed-to-complete-kube-config-setup.ipynb)
+
+- [TSG075 - FailedCreatePodSandBox due to NetworkPlugin cni failed to set up pod](tsg075-networkplugin-cni-failed-to-setup-pod.ipynb)
+
+- [TSG110 - Azdata returns ApiError](tsg110-azdata-returns-apierror.ipynb)
+
+- [TSG028 - Restart node manager on all storage pool nodes](tsg028-restart-nodemanager-in-storage-pool.ipynb)
+
+- [TSG045 - The maximum number of data disks allowed to be attached to a VM of this size (AKS)](tsg045-max-number-data-disks-allowed.ipynb)
+
+- [TSG109 - Set upgrade timeouts](tsg109-upgrade-stalled.ipynb)
+

+ 541 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/repair/tsg024-name-node-is-in-safe-mode.ipynb

@@ -0,0 +1,541 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG024 - Namenode is in safe mode\n",
+                "=================================\n",
+                "\n",
+                "HDFS can get itself into Safe mode. For example, if too many Pods are\n",
+                "recycled too quickly in the Storage Pool, then Safe mode may be\n",
+                "enabled automatically.\n",
+                "\n",
+                "When starting a Spark session, the user may see the following (for\n",
+                "example, when trying to start a PySpark or PySpark3 session in a\n",
+                "notebook from Azure Data Studio):\n",
+                "\n",
+                "> The code failed because of a fatal error: Error sending http request\n",
+                "> and maximum retry encountered..\n",
+                ">\n",
+                "> Some things to try: a) Make sure Spark has enough available resources\n",
+                "> for Jupyter to create a Spark context. b) Contact your Jupyter\n",
+                "> administrator to make sure the Spark magics library is configured\n",
+                "> correctly. c) Restart the kernel.\n",
+                "\n",
+                "Use this notebook to run a report to understand more about HDFS, and\n",
+                "optionally move the cluster out of Safe mode if it is safe to do so.\n",
+                "\n",
+                "Steps\n",
+                "-----\n",
+                "\n",
+                "### Common functions\n",
+                "\n",
+                "Define helper functions used in this notebook."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
+                "import sys\n",
+                "import os\n",
+                "import re\n",
+                "import json\n",
+                "import platform\n",
+                "import shlex\n",
+                "import shutil\n",
+                "import datetime\n",
+                "\n",
+                "from subprocess import Popen, PIPE\n",
+                "from IPython.display import Markdown\n",
+                "\n",
+                "retry_hints = {}\n",
+                "error_hints = {}\n",
+                "install_hint = {}\n",
+                "\n",
+                "first_run = True\n",
+                "rules = None\n",
+                "\n",
+                "def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
+                "    \"\"\"\n",
+                "    Run shell command, stream stdout, print stderr and optionally return output\n",
+                "    \"\"\"\n",
+                "    MAX_RETRIES = 5\n",
+                "    output = \"\"\n",
+                "    retry = False\n",
+                "\n",
+                "    global first_run\n",
+                "    global rules\n",
+                "\n",
+                "    if first_run:\n",
+                "        first_run = False\n",
+                "        rules = load_rules()\n",
+                "\n",
+                "    # shlex.split is required on bash and for Windows paths with spaces\n",
+                "    #\n",
+                "    cmd_actual = shlex.split(cmd)\n",
+                "\n",
+                "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
+                "    #\n",
+                "    user_provided_exe_name = cmd_actual[0].lower()\n",
+                "\n",
+                "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
+                "    #\n",
+                "    if cmd.startswith(\"python \"):\n",
+                "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
+                "\n",
+                "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
+                "        # with:\n",
+                "        #\n",
+                "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
+                "        #\n",
+                "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
+                "        #\n",
+                "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
+                "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
+                "\n",
+                "    # To aid supportability, determine which binary file will actually be executed on the machine\n",
+                "    #\n",
+                "    which_binary = None\n",
+                "\n",
+                "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
+                "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
+                "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
+                "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
+                "    # look for the 2nd installation of CURL in the path)\n",
+                "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
+                "        path = os.getenv('PATH')\n",
+                "        for p in path.split(os.path.pathsep):\n",
+                "            p = os.path.join(p, \"curl.exe\")\n",
+                "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
+                "                if p.lower().find(\"system32\") == -1:\n",
+                "                    cmd_actual[0] = p\n",
+                "                    which_binary = p\n",
+                "                    break\n",
+                "\n",
+                "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
+                "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
+                "    #\n",
+                "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
+                "    #\n",
+                "    if which_binary is None:\n",
+                "        which_binary = shutil.which(cmd_actual[0])\n",
+                "\n",
+                "    if which_binary is None:\n",
+                "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
+                "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
+                "    else:   \n",
+                "        cmd_actual[0] = which_binary\n",
+                "\n",
+                "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
+                "\n",
+                "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
+                "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
+                "    print(f\"       cwd: {os.getcwd()}\")\n",
+                "\n",
+                "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
+                "    # scrolling progress bars, which cause Jupyter to hang forever. To\n",
+                "    # work around this, use no_output=True\n",
+                "    #\n",
+                "\n",
+                "    # Work around an infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
+                "    #\n",
+                "    wait = True \n",
+                "\n",
+                "    try:\n",
+                "        if no_output:\n",
+                "            p = Popen(cmd_actual)\n",
+                "        else:\n",
+                "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
+                "            with p.stdout:\n",
+                "                for line in iter(p.stdout.readline, b''):\n",
+                "                    line = line.decode()\n",
+                "                    if return_output:\n",
+                "                        output = output + line\n",
+                "                    else:\n",
+                "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
+                "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
+                "                            match = regex.match(line)\n",
+                "                            if match:\n",
+                "                                if match.group(1).find(\"HTML\") != -1:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
+                "                                else:\n",
+                "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
+                "\n",
+                "                                    wait = False\n",
+                "                                    break # otherwise infinite hang, have not worked out why yet.\n",
+                "                        else:\n",
+                "                            print(line, end='')\n",
+                "                            if rules is not None:\n",
+                "                                apply_expert_rules(line)\n",
+                "\n",
+                "        if wait:\n",
+                "            p.wait()\n",
+                "    except FileNotFoundError as e:\n",
+                "        if install_hint is not None:\n",
+                "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
+                "\n",
+                "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
+                "\n",
+                "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
+                "\n",
+                "    if not no_output:\n",
+                "        for line in iter(p.stderr.readline, b''):\n",
+                "            line_decoded = line.decode()\n",
+                "\n",
+                "            # azdata emits a single empty line to stderr when doing an hdfs cp; don't\n",
+                "            # print this empty \"ERR:\" line as it is confusing.\n",
+                "            #\n",
+                "            if line_decoded == \"\":\n",
+                "                continue\n",
+                "            \n",
+                "            print(f\"STDERR: {line_decoded}\", end='')\n",
+                "\n",
+                "            if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
+                "                exit_code_workaround = 1\n",
+                "\n",
+                "            if user_provided_exe_name in error_hints:\n",
+                "                for error_hint in error_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(error_hint[0]) != -1:\n",
+                "                        display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
+                "\n",
+                "            if rules is not None:\n",
+                "                apply_expert_rules(line_decoded)\n",
+                "\n",
+                "            if user_provided_exe_name in retry_hints:\n",
+                "                for retry_hint in retry_hints[user_provided_exe_name]:\n",
+                "                    if line_decoded.find(retry_hint) != -1:\n",
+                "                        if retry_count < MAX_RETRIES:\n",
+                "                            print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
+                "                            retry_count = retry_count + 1\n",
+                "                            output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
+                "\n",
+                "                            if return_output:\n",
+                "                                return output\n",
+                "                            else:\n",
+                "                                return\n",
+                "\n",
+                "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
+                "\n",
+                "    # WORKAROUND: We avoid an infinite hang above in the `azdata notebook run` failure case by inferring success (from stdout output), so\n",
+                "    # don't wait here if success is known above\n",
+                "    #\n",
+                "    if wait: \n",
+                "        if p.returncode != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
+                "    else:\n",
+                "        if exit_code_workaround != 0:\n",
+                "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
+                "\n",
+                "\n",
+                "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
+                "\n",
+                "    if return_output:\n",
+                "        return output\n",
+                "\n",
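+                "# Illustrative example: stderr matching a retry_hints entry (defined below) is\n",
+                "# retried automatically, up to MAX_RETRIES attempts:\n",
+                "#\n",
+                "#   run('kubectl get nodes')\n",
+                "\n",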
+                "def load_json(filename):\n",
+                "    with open(filename, encoding=\"utf8\") as json_file:\n",
+                "        return json.load(json_file)\n",
+                "\n",
+                "def load_rules():\n",
+                "\n",
+                "    try:\n",
+                "\n",
+                "        # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
+                "        #\n",
+                "        j = load_json(\"tsg024-name-node-is-in-safe-mode.ipynb\")\n",
+                "\n",
+                "    except:\n",
+                "        pass # If the user has renamed the notebook, we can't load ourselves.  NOTE: Is there a way in Jupyter, to know your own filename?\n",
+                "\n",
+                "    else:\n",
+                "        if \"metadata\" in j and \\\n",
+                "            \"azdata\" in j[\"metadata\"] and \\\n",
+                "            \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
+                "            \"rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
+                "\n",
+                "            rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"rules\"]\n",
+                "\n",
+                "            rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.\n",
+                "\n",
+                "            # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
+                "\n",
+                "            return rules\n",
+                "\n",
+                "def apply_expert_rules(line):\n",
+                "\n",
+                "    global rules\n",
+                "\n",
+                "    for rule in rules:\n",
+                "\n",
+                "        # rules that have 9 elements are the injected (output) rules (the ones we want).  Rules\n",
+                "        # with only 8 elements are the source (input) rules, which are not expanded (i.e. TSG029,\n",
+                "        # not ../repair/tsg029-nb-name.ipynb)\n",
+                "        if len(rule) == 9:\n",
+                "            notebook = rule[1]\n",
+                "            cell_type = rule[2]\n",
+                "            output_type = rule[3] # i.e. stream or error\n",
+                "            output_type_name = rule[4] # i.e. ename or name \n",
+                "            output_type_value = rule[5] # i.e. SystemExit or stdout\n",
+                "            details_name = rule[6]  # i.e. evalue or text \n",
+                "            expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
+                "\n",
+                "            # print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
+                "\n",
+                "            if re.match(expression, line, re.DOTALL):\n",
+                "\n",
+                "                # print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
+                "\n",
+                "                match_found = True\n",
+                "\n",
+                "                display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
+                "\n",
+                "\n",
+                "\n",
+                "print('Common functions defined successfully.')\n",
+                "\n",
+                "# Hints for binary (transient fault) retry, (known) error and install guide\n",
+                "#\n",
+                "retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
+                "error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['no such host', 'TSG011 - Restart sparkhistory server', '../repair/tsg011-restart-sparkhistory-server.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
+                "install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
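+                "    # Use the in-cluster service account configuration when running inside a pod,\n",
+                "    # otherwise fall back to the local kubeconfig (e.g. ~/.kube/config)\n",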
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kubernetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the name of the namenode pod"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "namenode_pod = run(f'kubectl get pod --selector=role=namenode -n {namespace} -o jsonpath={{.items[0].metadata.name}}', return_output=True)\n",
+                "\n",
+                "print ('Namenode pod name: ' + namenode_pod)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the `hdfs dfsadmin` report"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "name=namenode_pod\n",
+                "container='hadoop'\n",
+                "\n",
+                "command='hdfs dfsadmin -report'\n",
+                "\n",
+                "string=stream(api.connect_get_namespaced_pod_exec, name, namespace, command=['/bin/sh', '-c', command], container=container, stderr=True, stdout=True)\n",
+                "\n",
+                "print(string)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Set the text that identifies this issue"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "precondition_text=\"Safe mode is ON\""
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### PRECONDITION CHECK"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "if precondition_text not in string:\n",
+                "    raise Exception(\"PRECONDITION NON-MATCH: 'tsg024-name-node-is-in-safe-mode' is not a match for an active problem\")\n",
+                "\n",
+                "print(\"PRECONDITION MATCH: 'tsg024-name-node-is-in-safe-mode' is a match for an active problem in this cluster\")"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Resolution\n",
+                "----------\n",
+                "\n",
+                "NOTE: Only if it is determined there are no missing, corrupt or under\n",
+                "replicated blocks that should not be ignored is it safe to take the name\n",
+                "node out of safe mode. Use `hdfs dfsadmin -report` and `hdfs fsck` to\n",
+                "understand more about missing, corrupt or under replicated blocks.\n",
+                "\n",
+                "### Move the namenode out of safe mode"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "command='hdfs dfsadmin -safemode leave'\n",
+                "\n",
+                "string=stream(api.connect_get_namespaced_pod_exec, name, namespace, command=['/bin/sh', '-c', command], container=container, stderr=True, stdout=True)\n",
+                "\n",
+                "print(string)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Validate - Verify the namenode is no longer in safe mode\n",
+                "\n",
+                "Validate that the text \u2018Safe mode is ON\u2019 is no longer in the\n",
+                "`hdfs dfsadmin -report` output"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "command='hdfs dfsadmin -report'\n",
+                "\n",
+                "string=stream(api.connect_get_namespaced_pod_exec, name, namespace, command=['/bin/sh', '-c', command], container=container, stderr=True, stdout=True)\n",
+                "\n",
+                "if precondition_text in string:\n",
+                "    raise SystemExit ('FAILED - hdfs dfsadmin -report output still contains: ' + precondition_text)\n",
+                "\n",
+                "print ('SUCCESS - hdfs dfsadmin -report output no longer contains: ' + precondition_text)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": false
+        }
+    }
+}

+ 154 - 0
Troubleshooting-Notebooks/Big-Data-Clusters/CU1/Public/content/repair/tsg028-restart-nodemanager-in-storage-pool.ipynb

@@ -0,0 +1,154 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "TSG028 - Restart node manager on all storage pool nodes\n",
+                "=======================================================\n",
+                "\n",
+                "Description\n",
+                "-----------\n",
+                "\n",
+                "### Parameters"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "parameters"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "container='hadoop'\n",
+                "command=f'supervisorctl restart nodemanager'"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Instantiate Kubernetes client"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Instantiate the Python Kubernetes client into 'api' variable\n",
+                "\n",
+                "import os\n",
+                "\n",
+                "try:\n",
+                "    from kubernetes import client, config\n",
+                "    from kubernetes.stream import stream\n",
+                "\n",
+                "    if \"KUBERNETES_SERVICE_PORT\" in os.environ and \"KUBERNETES_SERVICE_HOST\" in os.environ:\n",
+                "        config.load_incluster_config()\n",
+                "    else:\n",
+                "        config.load_kube_config()\n",
+                "\n",
+                "    api = client.CoreV1Api()\n",
+                "\n",
+                "    print('Kubernetes client instantiated')\n",
+                "except ImportError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))\n",
+                "    raise"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Get the namespace for the big data cluster\n",
+                "\n",
+                "Get the namespace of the big data cluster from the Kuberenetes API.\n",
+                "\n",
+                "NOTE: If there is more than one big data cluster in the target\n",
+                "Kubernetes cluster, then set \\[0\\] to the correct value for the big data\n",
+                "cluster."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "tags": [
+                    "hide_input"
+                ]
+            },
+            "outputs": [],
+            "source": [
+                "# Place Kubernetes namespace name for BDC into 'namespace' variable\n",
+                "\n",
+                "try:\n",
+                "    namespace = api.list_namespace(label_selector='MSSQL_CLUSTER').items[0].metadata.name\n",
+                "except IndexError:\n",
+                "    from IPython.display import Markdown\n",
+                "    display(Markdown(f'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.'))\n",
+                "    display(Markdown(f'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.'))\n",
+                "    raise\n",
+                "\n",
+                "print('The kubernetes namespace for your big data cluster is: ' + namespace)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Run command in containers"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "pod_list = api.list_namespaced_pod(namespace)\n",
+                "\n",
+                "for pod in pod_list.items:\n",
+                "    container_names = [container.name for container in pod.spec.containers]\n",
+                "    for container_name in container_names:\n",
+                "        if container_name == container:\n",
+                "            print (f\"Pod: {pod.metadata.name} / Container: {container}:\")\n",
+                "            try:\n",
+                "                output=stream(api.connect_get_namespaced_pod_exec, pod.metadata.name, namespace, command=['/bin/sh', '-c', command], container=container, stderr=True, stdout=True)\n",
+                "                print (output)\n",
+                "            except Exception:\n",
+                "                print (f\"Failed to run {command} in container: {container} for pod: {pod.metadata.name}\")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "print('Notebook execution complete.')"
+            ]
+        }
+    ],
+    "nbformat": 4,
+    "nbformat_minor": 5,
+    "metadata": {
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "azdata": {
+            "side_effects": true
+        }
+    }
+}

Some files were not shown because too many files changed in this diff