Kubernetes Debugging Skill Overview Systematic toolkit for debugging Kubernetes clusters, workloads, networking, and storage with a deterministic, safety-first workflow. Trigger Phrases Use this skill when requests resemble: - "My pod is in ; help me find the root cause." - "Service DNS works in one pod but not another." - "Deployment rollout is stuck." - "Pods are and not scheduling." - "Cluster health looks degraded after a change." - "PVC is pending and pods cannot mount storage." Prerequisites Run from the skill directory ( ) so relative script paths work as written. Required - installed…

\\r'/}\"\n token=\"${token//

Kubernetes Debugging Skill Overview Systematic toolkit for debugging Kubernetes clusters, workloads, networking, and storage with a deterministic, safety-first workflow. Trigger Phrases Use this skill when requests resemble: - "My pod is in ; help me find the root cause." - "Service DNS works in one pod but not another." - "Deployment rollout is stuck." - "Pods are and not scheduling." - "Cluster health looks degraded after a change." - "PVC is pending and pods cannot mount storage." Prerequisites Run from the skill directory ( ) so relative script paths work as written. Required - installed…

\\n'/}\"\n printf \"%s\" \"$token\"\n}\n\napi_probe_secure() {\n local token\n\n if ! pod_exec test -r \"$SERVICEACCOUNT_CA\" >/dev/null 2>&1 || \\\n ! pod_exec test -r \"$SERVICEACCOUNT_TOKEN_FILE\" >/dev/null 2>&1; then\n echo \"service account CA/token files are missing in the pod. Use --insecure only for explicit troubleshooting override.\" >&2\n return 1\n fi\n\n token=\"$(read_serviceaccount_token)\"\n if [ -z \"$token\" ]; then\n echo \"service account token is empty; cannot authenticate secure API probe.\" >&2\n return 1\n fi\n\n if pod_exec curl --fail --silent --show-error --cacert \"$SERVICEACCOUNT_CA\" --max-time 5 \\\n -H \"Authorization: Bearer $token\" \"$KUBERNETES_API_URL\" >/dev/null 2>&1; then\n return 0\n fi\n\n if pod_exec wget -q --timeout=5 --ca-certificate=\"$SERVICEACCOUNT_CA\" \\\n --header=\"Authorization: Bearer $token\" -O /dev/null \"$KUBERNETES_API_URL\" >/dev/null 2>&1; then\n return 0\n fi\n\n echo \"curl/wget secure API probe failed in the container (missing tools, auth failure, or blocked egress).\" >&2\n return 1\n}\n\napi_probe_insecure() {\n local token\n token=\"$(read_serviceaccount_token)\"\n warn \"Insecure TLS mode enabled (--insecure). 
Certificate validation is bypassed for API probe.\"\n\n if [ -n \"$token\" ]; then\n if pod_exec curl --fail --silent --show-error -k --max-time 5 \\\n -H \"Authorization: Bearer $token\" \"$KUBERNETES_API_URL\" >/dev/null 2>&1; then\n return 0\n fi\n if pod_exec wget -q --timeout=5 --no-check-certificate \\\n --header=\"Authorization: Bearer $token\" -O /dev/null \"$KUBERNETES_API_URL\" >/dev/null 2>&1; then\n return 0\n fi\n else\n if pod_exec curl --fail --silent --show-error -k --max-time 5 \\\n \"$KUBERNETES_API_URL\" >/dev/null 2>&1; then\n return 0\n fi\n if pod_exec wget -q --timeout=5 --no-check-certificate \\\n -O /dev/null \"$KUBERNETES_API_URL\" >/dev/null 2>&1; then\n return 0\n fi\n fi\n\n echo \"curl/wget insecure API probe failed in the container (missing tools, auth failure, or blocked egress).\" >&2\n return 1\n}\n\napi_probe() {\n if [ \"$INSECURE_TLS\" -eq 1 ]; then\n api_probe_insecure\n return $?\n fi\n api_probe_secure\n}\n\nfinalize_exit() {\n if [ \"$BLOCKED_COUNT\" -gt 0 ]; then\n return 2\n fi\n if [ \"$CHECK_FAIL_COUNT\" -gt 0 ]; then\n return 1\n fi\n if [ \"$STRICT_MODE\" -eq 1 ] && [ \"$WARN_COUNT\" -gt 0 ]; then\n return 1\n fi\n return 0\n}\n\nif ! have_cmd kubectl; then\n blocked_exit \"kubectl is not installed or not in PATH.\"\nfi\n\nif ! kubectl_cmd config current-context >/dev/null 2>&1; then\n blocked_exit \"No active Kubernetes context. Run 'kubectl config current-context' to troubleshoot.\"\nfi\n\nif ! kubectl_cmd get namespace \"$NAMESPACE\" >/dev/null 2>&1; then\n blocked_exit \"Namespace '$NAMESPACE' was not found or is not accessible.\"\nfi\n\nif ! 
kubectl_cmd get pod \"$POD_NAME\" -n \"$NAMESPACE\" >/dev/null 2>&1; then\n blocked_exit \"Pod '$POD_NAME' in namespace '$NAMESPACE' was not found or is not accessible.\"\nfi\n\necho \"========================================\"\necho \"Network Debugging for Pod: $POD_NAME\"\necho \"Namespace: $NAMESPACE\"\necho \"Timestamp: $(timestamp_utc)\"\necho \"========================================\"\n\nsection \"PREFLIGHT\"\nrun_or_warn \"Current context check\" kubectl_cmd config current-context\nif ! can_i get pods -n \"$NAMESPACE\"; then\n warn \"RBAC may block pod metadata reads in namespace '$NAMESPACE'.\"\nfi\nif ! can_i create pods/exec -n \"$NAMESPACE\"; then\n warn \"RBAC may block 'kubectl exec'; in-pod checks may fail.\"\nfi\n\nsection \"POD NETWORK INFORMATION\"\nPOD_IP=\"$(kubectl_cmd get pod \"$POD_NAME\" -n \"$NAMESPACE\" -o jsonpath='{.status.podIP}' 2>/dev/null || true)\"\nHOST_IP=\"$(kubectl_cmd get pod \"$POD_NAME\" -n \"$NAMESPACE\" -o jsonpath='{.status.hostIP}' 2>/dev/null || true)\"\necho \"Pod IP: ${POD_IP:-Unavailable}\"\necho \"Host IP: ${HOST_IP:-Unavailable}\"\nrun_or_warn \"Pod wide status query\" kubectl_cmd get pod \"$POD_NAME\" -n \"$NAMESPACE\" -o wide\n\nsection \"DNS CONFIGURATION\"\nrun_or_warn \"Pod DNS config read\" pod_exec cat /etc/resolv.conf\n\nsection \"DNS RESOLUTION TEST\"\necho \"Testing kubernetes.default.svc.cluster.local:\"\nif pod_exec nslookup kubernetes.default.svc.cluster.local 2>/dev/null; then\n :\nelif pod_exec getent hosts kubernetes.default.svc.cluster.local 2>/dev/null; then\n :\nelse\n record_check_failure \"DNS lookup test failed (utilities unavailable or DNS lookup failed).\"\nfi\n\nsection \"NETWORK CONNECTIVITY TESTS\"\necho \"Testing connection to kubernetes.default.svc:\"\nrun_or_warn \"Kubernetes API connectivity test from pod\" api_probe\n\nsection \"SERVICES IN NAMESPACE\"\nrun_or_warn \"Service list query\" kubectl_cmd get svc -n \"$NAMESPACE\"\n\nsection \"ENDPOINTS\"\nrun_or_warn \"Endpoint list 
query\" kubectl_cmd get endpoints -n \"$NAMESPACE\"\n\nsection \"NETWORK POLICIES\"\nrun_or_warn \"Network policy list query\" kubectl_cmd get networkpolicies -n \"$NAMESPACE\"\n\nsection \"POD NETWORK DETAILS\"\nrun_pipe_or_warn \"Pod describe network details query\" \"kubectl --request-timeout=\\\"$REQUEST_TIMEOUT\\\" describe pod \\\"$POD_NAME\\\" -n \\\"$NAMESPACE\\\" | grep -A 20 '^IP:'\"\n\nsection \"POD LABELS (FOR NETWORKPOLICY MATCHING)\"\nrun_or_warn \"Pod label query\" kubectl_cmd get pod \"$POD_NAME\" -n \"$NAMESPACE\" --show-labels\n\nsection \"IPTABLES RULES (IF ACCESSIBLE)\"\nif ! pod_exec iptables -L -n 2>/dev/null; then\n info \"iptables output not available (requires privileged container/tools).\"\nfi\n\nsection \"NETWORK INTERFACES\"\nif pod_exec ip addr 2>/dev/null; then\n :\nelif pod_exec ifconfig 2>/dev/null; then\n :\nelse\n info \"Network interface tools are not available in this container.\"\nfi\n\nsection \"ROUTING TABLE\"\nif pod_exec ip route 2>/dev/null; then\n :\nelif pod_exec route 2>/dev/null; then\n :\nelse\n info \"Routing table tools are not available in this container.\"\nfi\n\nsection \"COREDNS LOGS (LAST 20 LINES)\"\nif kubectl_cmd logs -n kube-system -l k8s-app=kube-dns --tail=20 2>/dev/null; then\n :\nelif kubectl_cmd logs -n kube-system -l k8s-app=coredns --tail=20 2>/dev/null; then\n :\nelse\n warn \"CoreDNS logs are not accessible.\"\nfi\n\necho -e \"\\n========================================\"\necho \"Network debugging completed at $(timestamp_utc)\"\necho \"Warnings: $WARN_COUNT | Check failures: $CHECK_FAIL_COUNT | Blocked checks: $BLOCKED_COUNT\"\necho \"========================================\"\n\nfinalize_exit\nexit $?\n","content_type":"application/x-sh; charset=utf-8","language":"bash","size":9298,"content_sha256":"cede308e036b03bca55f7fcd84ecb20625c1f9a349d852c0b03873f51534b2fa"},{"filename":"scripts/pod_diagnostics.py","content":"#!/usr/bin/env python3\n\"\"\"\nKubernetes Pod Diagnostics Script\nGathers 
comprehensive diagnostic information about a specific pod\nwith explicit preflight checks and graceful fallbacks.\n\"\"\"\n\nimport argparse\nimport os\nimport shutil\nimport subprocess\nimport sys\nfrom datetime import datetime, timezone\nfrom typing import Sequence, Tuple\n\nREQUEST_TIMEOUT = os.environ.get(\"K8S_REQUEST_TIMEOUT\", \"15s\")\n\n\ndef run_kubectl(args: Sequence[str], timeout: int = 30) -> Tuple[str, str, int]:\n \"\"\"Execute kubectl command and return (stdout, stderr, exit_code).\"\"\"\n cmd = [\"kubectl\", f\"--request-timeout={REQUEST_TIMEOUT}\", *args]\n try:\n result = subprocess.run(\n cmd,\n capture_output=True,\n text=True,\n timeout=timeout,\n check=False,\n )\n return result.stdout, result.stderr, result.returncode\n except subprocess.TimeoutExpired:\n return \"\", f\"Command timed out: {' '.join(cmd)}\", 1\n\n\ndef print_output(stdout: str, stderr: str) -> None:\n \"\"\"Print command output, preferring stdout then stderr.\"\"\"\n if stdout.strip():\n print(stdout.rstrip())\n elif stderr.strip():\n print(stderr.rstrip())\n\n\ndef print_section(title: str) -> None:\n print(f\"\\n## {title} ##\")\n\n\ndef ensure_prerequisites(namespace: str, pod_name: str) -> bool:\n \"\"\"Validate local tool availability and cluster access prerequisites.\"\"\"\n if shutil.which(\"kubectl\") is None:\n print(\"ERROR: kubectl is not installed or not in PATH.\", file=sys.stderr)\n return False\n\n stdout, stderr, code = run_kubectl([\"config\", \"current-context\"])\n if code != 0:\n print(\"ERROR: Unable to determine active Kubernetes context.\", file=sys.stderr)\n print_output(stdout, stderr)\n return False\n\n stdout, stderr, code = run_kubectl([\"get\", \"pod\", pod_name, \"-n\", namespace, \"-o\", \"name\"])\n if code != 0:\n print(\n f\"ERROR: Pod '{pod_name}' in namespace '{namespace}' is not accessible.\",\n file=sys.stderr,\n )\n print_output(stdout, stderr)\n return False\n\n stdout, _, _ = run_kubectl([\"auth\", \"can-i\", \"create\", 
\"pods/exec\", \"-n\", namespace])\n if stdout.strip() != \"yes\":\n print(\n \"WARN: RBAC may block pod exec; in-container diagnostics can be limited.\",\n file=sys.stderr,\n )\n\n return True\n\n\ndef get_pod_info(pod_name: str, namespace: str = \"default\") -> None:\n \"\"\"Gather comprehensive pod diagnostic information.\"\"\"\n\n print(f\"\\n{'=' * 80}\")\n print(f\"Pod Diagnostics for: {pod_name} (namespace: {namespace})\")\n print(f\"Timestamp: {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}\")\n print(f\"{'=' * 80}\\n\")\n\n # Pod Status\n print_section(\"POD STATUS\")\n stdout, stderr, _ = run_kubectl([\"get\", \"pod\", pod_name, \"-n\", namespace, \"-o\", \"wide\"])\n print_output(stdout, stderr)\n\n # Pod Description\n print_section(\"POD DESCRIPTION\")\n stdout, stderr, _ = run_kubectl([\"describe\", \"pod\", pod_name, \"-n\", namespace])\n print_output(stdout, stderr)\n\n # Pod YAML\n print_section(\"POD YAML\")\n stdout, stderr, _ = run_kubectl([\"get\", \"pod\", pod_name, \"-n\", namespace, \"-o\", \"yaml\"])\n print_output(stdout, stderr)\n\n # Events related to the pod\n print_section(\"RECENT EVENTS\")\n stdout, stderr, _ = run_kubectl(\n [\n \"get\",\n \"events\",\n \"-n\",\n namespace,\n \"--field-selector\",\n f\"involvedObject.name={pod_name}\",\n \"--sort-by=.lastTimestamp\",\n ]\n )\n print_output(stdout, stderr)\n\n # Container logs (all containers)\n print_section(\"CONTAINER LOGS\")\n stdout, stderr, code = run_kubectl(\n [\"get\", \"pod\", pod_name, \"-n\", namespace, \"-o\", \"jsonpath={.spec.containers[*].name}\"]\n )\n if code != 0:\n print_output(stdout, stderr)\n print(\"INFO: Skipping container logs because container names could not be queried.\")\n containers = []\n else:\n containers = stdout.strip().split()\n\n if not containers:\n print(\"INFO: No containers detected for this pod.\")\n\n for container in containers:\n print(f\"\\n### Container: {container} ###\")\n stdout, stderr, _ = run_kubectl(\n [\"logs\", 
pod_name, \"-n\", namespace, \"-c\", container, \"--tail=100\"],\n timeout=45,\n )\n print_output(stdout, stderr)\n\n print(f\"\\n### Previous logs for: {container} ###\")\n stdout, stderr, code = run_kubectl(\n [\"logs\", pod_name, \"-n\", namespace, \"-c\", container, \"--previous\", \"--tail=50\"],\n timeout=45,\n )\n previous_log_message = f\"{stdout}\\n{stderr}\".lower()\n if code == 0:\n print_output(stdout, stderr)\n elif (\n \"previous terminated container\" in previous_log_message\n or \"is not terminated\" in previous_log_message\n ):\n print(\"INFO: No previous terminated container logs available.\")\n else:\n print_output(stdout, stderr)\n\n # Init container logs — only emitted when the pod has init containers.\n # Init container failures are a primary cause of Init:CrashLoopBackOff and\n # Init:0/N pending states; their logs must be visible in diagnostic output.\n stdout, stderr, code = run_kubectl(\n [\"get\", \"pod\", pod_name, \"-n\", namespace, \"-o\", \"jsonpath={.spec.initContainers[*].name}\"]\n )\n if code != 0:\n print_section(\"INIT CONTAINER LOGS\")\n print_output(stdout, stderr)\n print(\"INFO: Skipping init container logs because init container names could not be queried.\")\n else:\n init_containers = stdout.strip().split()\n if init_containers:\n print_section(\"INIT CONTAINER LOGS\")\n for container in init_containers:\n print(f\"\\n### Init Container: {container} ###\")\n stdout, stderr, _ = run_kubectl(\n [\"logs\", pod_name, \"-n\", namespace, \"-c\", container, \"--tail=100\"],\n timeout=45,\n )\n print_output(stdout, stderr)\n\n print(f\"\\n### Previous init container logs for: {container} ###\")\n stdout, stderr, code = run_kubectl(\n [\"logs\", pod_name, \"-n\", namespace, \"-c\", container, \"--previous\", \"--tail=50\"],\n timeout=45,\n )\n previous_log_message = f\"{stdout}\\n{stderr}\".lower()\n if code == 0:\n print_output(stdout, stderr)\n elif (\n \"previous terminated container\" in previous_log_message\n or \"is not 
terminated\" in previous_log_message\n ):\n print(\"INFO: No previous terminated init container logs available.\")\n else:\n print_output(stdout, stderr)\n\n # Resource usage\n print_section(\"RESOURCE USAGE\")\n stdout, stderr, code = run_kubectl(\n [\"top\", \"pod\", pod_name, \"-n\", namespace, \"--containers\"],\n timeout=20,\n )\n if code == 0:\n print_output(stdout, stderr)\n elif \"metrics\" in stderr.lower():\n print(\"INFO: Metrics API is unavailable. Skipping 'kubectl top' output.\")\n print_output(\"\", stderr)\n else:\n print_output(stdout, stderr)\n\n # Node information\n print_section(\"NODE INFORMATION\")\n stdout, stderr, code = run_kubectl(\n [\"get\", \"pod\", pod_name, \"-n\", namespace, \"-o\", \"jsonpath={.spec.nodeName}\"]\n )\n if code != 0:\n print_output(stdout, stderr)\n return\n\n node_tokens = stdout.strip().split()\n node_name = node_tokens[0] if node_tokens else \"\"\n if node_name:\n print(f\"Pod is running on node: {node_name}\")\n stdout, stderr, _ = run_kubectl([\"describe\", \"node\", node_name], timeout=45)\n print_output(stdout, stderr)\n else:\n print(\"INFO: Node name is not available yet (pod may still be unscheduled).\")\n\n\ndef main() -> int:\n parser = argparse.ArgumentParser(description=\"Gather Kubernetes pod diagnostics\")\n parser.add_argument(\"pod_name\", help=\"Name of the pod to diagnose\")\n parser.add_argument(\"-n\", \"--namespace\", default=\"default\", help=\"Namespace (default: default)\")\n parser.add_argument(\"-o\", \"--output\", help=\"Output file path (optional)\")\n\n args = parser.parse_args()\n\n if not ensure_prerequisites(args.namespace, args.pod_name):\n return 1\n\n original_stdout = sys.stdout\n output_handle = None\n if args.output:\n output_handle = open(args.output, \"w\", encoding=\"utf-8\")\n sys.stdout = output_handle\n\n try:\n get_pod_info(args.pod_name, args.namespace)\n finally:\n if output_handle is not None:\n sys.stdout = original_stdout\n output_handle.close()\n 
print(f\"\\nDiagnostics written to: {args.output}\", file=sys.stderr)\n\n return 0\n\n\nif __name__ == \"__main__\":\n sys.exit(main())\n","content_type":"text/x-python; charset=utf-8","language":"python","size":8967,"content_sha256":"e31bc06934e033c29465ca18a292700a80438607abbeeac5a3573f61e847b163"},{"filename":"tests/test_pod_diagnostics.py","content":"#!/usr/bin/env python3\n\"\"\"Regression tests for pod_diagnostics.py.\"\"\"\n\nfrom __future__ import annotations\n\nimport contextlib\nimport importlib.util\nimport io\nimport pathlib\nimport sys\nimport unittest\nfrom typing import List, Sequence, Tuple\nfrom unittest.mock import patch\n\n\nsys.dont_write_bytecode = True\n\nSCRIPT_PATH = pathlib.Path(__file__).resolve().parents[1] / \"scripts\" / \"pod_diagnostics.py\"\nSPEC = importlib.util.spec_from_file_location(\"pod_diagnostics\", SCRIPT_PATH)\nif SPEC is None or SPEC.loader is None:\n raise RuntimeError(f\"Unable to load pod_diagnostics module from {SCRIPT_PATH}\")\npod_diagnostics = importlib.util.module_from_spec(SPEC)\nSPEC.loader.exec_module(pod_diagnostics)\n\nKubectlResponse = Tuple[str, str, int]\nExpectedCall = Tuple[Sequence[str], KubectlResponse]\n\n\nclass PodDiagnosticsInitContainerTests(unittest.TestCase):\n def _run_with_expected_calls(self, expected_calls: List[ExpectedCall]) -> str:\n pending = list(expected_calls)\n\n def fake_run(args: Sequence[str], timeout: int = 30) -> KubectlResponse:\n del timeout # Assert command sequence/args; timeout variations are not relevant here.\n self.assertTrue(pending, f\"Unexpected kubectl call: {list(args)}\")\n expected_args, response = pending.pop(0)\n self.assertEqual(list(args), list(expected_args))\n return response\n\n stdout_buffer = io.StringIO()\n with patch.object(pod_diagnostics, \"run_kubectl\", side_effect=fake_run):\n with contextlib.redirect_stdout(stdout_buffer):\n pod_diagnostics.get_pod_info(\"demo-pod\", \"demo-ns\")\n\n self.assertFalse(pending, f\"Expected kubectl calls were not 
consumed: {pending}\")\n return stdout_buffer.getvalue()\n\n def test_init_container_previous_logs_message_when_not_terminated(self) -> None:\n output = self._run_with_expected_calls(\n [\n ([\"get\", \"pod\", \"demo-pod\", \"-n\", \"demo-ns\", \"-o\", \"wide\"], (\"pod wide\", \"\", 0)),\n ([\"describe\", \"pod\", \"demo-pod\", \"-n\", \"demo-ns\"], (\"pod describe\", \"\", 0)),\n ([\"get\", \"pod\", \"demo-pod\", \"-n\", \"demo-ns\", \"-o\", \"yaml\"], (\"pod yaml\", \"\", 0)),\n (\n [\n \"get\",\n \"events\",\n \"-n\",\n \"demo-ns\",\n \"--field-selector\",\n \"involvedObject.name=demo-pod\",\n \"--sort-by=.lastTimestamp\",\n ],\n (\"event list\", \"\", 0),\n ),\n (\n [\"get\", \"pod\", \"demo-pod\", \"-n\", \"demo-ns\", \"-o\", \"jsonpath={.spec.containers[*].name}\"],\n (\"app\", \"\", 0),\n ),\n (\n [\"logs\", \"demo-pod\", \"-n\", \"demo-ns\", \"-c\", \"app\", \"--tail=100\"],\n (\"app logs\", \"\", 0),\n ),\n (\n [\n \"logs\",\n \"demo-pod\",\n \"-n\",\n \"demo-ns\",\n \"-c\",\n \"app\",\n \"--previous\",\n \"--tail=50\",\n ],\n (\"\", \"previous terminated container not found\", 1),\n ),\n (\n [\n \"get\",\n \"pod\",\n \"demo-pod\",\n \"-n\",\n \"demo-ns\",\n \"-o\",\n \"jsonpath={.spec.initContainers[*].name}\",\n ],\n (\"init-setup\", \"\", 0),\n ),\n (\n [\"logs\", \"demo-pod\", \"-n\", \"demo-ns\", \"-c\", \"init-setup\", \"--tail=100\"],\n (\"init logs\", \"\", 0),\n ),\n (\n [\n \"logs\",\n \"demo-pod\",\n \"-n\",\n \"demo-ns\",\n \"-c\",\n \"init-setup\",\n \"--previous\",\n \"--tail=50\",\n ],\n (\"\", \"container is not terminated\", 1),\n ),\n (\n [\"top\", \"pod\", \"demo-pod\", \"-n\", \"demo-ns\", \"--containers\"],\n (\"resource usage\", \"\", 0),\n ),\n (\n [\"get\", \"pod\", \"demo-pod\", \"-n\", \"demo-ns\", \"-o\", \"jsonpath={.spec.nodeName}\"],\n (\"node-a\", \"\", 0),\n ),\n ([\"describe\", \"node\", \"node-a\"], (\"node describe\", \"\", 0)),\n ]\n )\n\n self.assertIn(\"## INIT CONTAINER LOGS ##\", output)\n self.assertIn(\"### Init 
Container: init-setup ###\", output)\n self.assertIn(\"INFO: No previous terminated init container logs available.\", output)\n\n def test_init_container_query_failure_prints_skip_message(self) -> None:\n output = self._run_with_expected_calls(\n [\n ([\"get\", \"pod\", \"demo-pod\", \"-n\", \"demo-ns\", \"-o\", \"wide\"], (\"pod wide\", \"\", 0)),\n ([\"describe\", \"pod\", \"demo-pod\", \"-n\", \"demo-ns\"], (\"pod describe\", \"\", 0)),\n ([\"get\", \"pod\", \"demo-pod\", \"-n\", \"demo-ns\", \"-o\", \"yaml\"], (\"pod yaml\", \"\", 0)),\n (\n [\n \"get\",\n \"events\",\n \"-n\",\n \"demo-ns\",\n \"--field-selector\",\n \"involvedObject.name=demo-pod\",\n \"--sort-by=.lastTimestamp\",\n ],\n (\"event list\", \"\", 0),\n ),\n (\n [\"get\", \"pod\", \"demo-pod\", \"-n\", \"demo-ns\", \"-o\", \"jsonpath={.spec.containers[*].name}\"],\n (\"app\", \"\", 0),\n ),\n (\n [\"logs\", \"demo-pod\", \"-n\", \"demo-ns\", \"-c\", \"app\", \"--tail=100\"],\n (\"app logs\", \"\", 0),\n ),\n (\n [\n \"logs\",\n \"demo-pod\",\n \"-n\",\n \"demo-ns\",\n \"-c\",\n \"app\",\n \"--previous\",\n \"--tail=50\",\n ],\n (\"\", \"previous terminated container not found\", 1),\n ),\n (\n [\n \"get\",\n \"pod\",\n \"demo-pod\",\n \"-n\",\n \"demo-ns\",\n \"-o\",\n \"jsonpath={.spec.initContainers[*].name}\",\n ],\n (\"\", \"forbidden\", 1),\n ),\n (\n [\"top\", \"pod\", \"demo-pod\", \"-n\", \"demo-ns\", \"--containers\"],\n (\"resource usage\", \"\", 0),\n ),\n (\n [\"get\", \"pod\", \"demo-pod\", \"-n\", \"demo-ns\", \"-o\", \"jsonpath={.spec.nodeName}\"],\n (\"node-a\", \"\", 0),\n ),\n ([\"describe\", \"node\", \"node-a\"], (\"node describe\", \"\", 0)),\n ]\n )\n\n self.assertIn(\"## INIT CONTAINER LOGS ##\", output)\n self.assertIn(\"forbidden\", output)\n self.assertIn(\n \"INFO: Skipping init container logs because init container names could not be queried.\",\n output,\n )\n self.assertNotIn(\"### Init Container:\", output)\n\n\nif __name__ == \"__main__\":\n 
unittest.main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":7800,"content_sha256":"788c781382a0ac5a6964e40216a2dc45e33960c3fbb56933b1778b26845b093e"},{"filename":"tests/test_regressions.sh","content":"#!/usr/bin/env bash\n#\n# Regression tests for k8s-debug shell scripts.\n# Validates:\n# - network_debug.sh secure/insecure API probing and exit codes\n# - cluster_health.sh blocked/check-failure exit codes\n#\n\nset -euo pipefail\n\nSCRIPT_DIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nreadonly SCRIPT_DIR\nSKILL_DIR=\"$(cd \"$SCRIPT_DIR/..\" && pwd)\"\nreadonly SKILL_DIR\n\nNETWORK_SCRIPT=\"$SKILL_DIR/scripts/network_debug.sh\"\nCLUSTER_SCRIPT=\"$SKILL_DIR/scripts/cluster_health.sh\"\nreadonly NETWORK_SCRIPT\nreadonly CLUSTER_SCRIPT\n\nTMP_DIR=\"$(mktemp -d)\"\nKUBECTL_LOG=\"$TMP_DIR/kubectl.log\"\nreadonly TMP_DIR\nreadonly KUBECTL_LOG\n\ncleanup() {\n rm -rf \"$TMP_DIR\"\n}\ntrap cleanup EXIT\n\nPASS=0\nFAIL=0\nOUTPUT=\"\"\nEXIT_CODE=0\n\npass() {\n echo \" PASS: $1\"\n PASS=$((PASS + 1))\n}\n\nfail() {\n echo \" FAIL: $1\"\n FAIL=$((FAIL + 1))\n}\n\nreset_stub_env() {\n unset K8S_STUB_CONTEXT_FAIL || true\n unset K8S_STUB_CAN_I_EXEC || true\n unset K8S_STUB_SA_FILES || true\n unset K8S_STUB_EXPECT_SECURE || true\n unset K8S_STUB_EXPECT_INSECURE || true\n unset K8S_STUB_DNS_FAIL || true\n unset K8S_STUB_FAIL_NODE_LIST || true\n}\n\ncreate_kubectl_stub() {\n mkdir -p \"$TMP_DIR/bin\"\n\n cat > \"$TMP_DIR/bin/kubectl\" \u003c\u003c'EOF'\n#!/usr/bin/env bash\nset -u\n\nLOG_FILE=\"${KUBECTL_STUB_LOG:-/dev/null}\"\nprintf '%s\\n' \"$*\" >> \"$LOG_FILE\"\n\nargs=(\"$@\")\nif [[ \"${args[0]:-}\" == --request-timeout=* ]]; then\n args=(\"${args[@]:1}\")\nfi\n\nif [[ \"${#args[@]}\" -eq 0 ]]; then\n exit 0\nfi\n\njoined=\" ${args[*]} \"\ncmd=\"${args[0]}\"\nsub=\"${args[1]:-}\"\n\nif [[ \"$cmd\" == \"config\" && \"$sub\" == \"current-context\" ]]; then\n if [[ \"${K8S_STUB_CONTEXT_FAIL:-0}\" == \"1\" ]]; then\n echo \"no context\" 
>&2\n exit 1\n fi\n echo \"stub-context\"\n exit 0\nfi\n\nif [[ \"$cmd\" == \"auth\" && \"$sub\" == \"can-i\" ]]; then\n if [[ \"$joined\" == *\" create pods/exec \"* ]]; then\n echo \"${K8S_STUB_CAN_I_EXEC:-yes}\"\n else\n echo \"yes\"\n fi\n exit 0\nfi\n\nif [[ \"$cmd\" == \"cluster-info\" ]]; then\n echo \"Kubernetes control plane is running\"\n exit 0\nfi\n\nif [[ \"$cmd\" == \"version\" ]]; then\n echo \"Client Version: v1.30.0\"\n exit 0\nfi\n\nif [[ \"$cmd\" == \"top\" ]]; then\n echo \"stub metrics\"\n exit 0\nfi\n\nif [[ \"$cmd\" == \"logs\" ]]; then\n echo \"stub logs\"\n exit 0\nfi\n\nif [[ \"$cmd\" == \"describe\" && \"$sub\" == \"pod\" ]]; then\n echo \"IP: 10.0.0.10\"\n echo \"Controlled By: ReplicaSet/demo\"\n exit 0\nfi\n\nif [[ \"$cmd\" == \"get\" ]]; then\n resource=\"${args[1]:-}\"\n\n if [[ \"$resource\" == \"--raw=/readyz?verbose\" || \"$resource\" == \"--raw=/healthz?verbose\" ]]; then\n echo \"ok\"\n exit 0\n fi\n\n if [[ \"$resource\" == \"componentstatuses\" ]]; then\n echo \"scheduler Healthy\"\n exit 0\n fi\n\n if [[ \"$resource\" == \"namespace\" ]]; then\n if [[ \"${K8S_STUB_CONTEXT_FAIL:-0}\" == \"1\" ]]; then\n exit 1\n fi\n echo \"namespace/${args[2]:-default}\"\n exit 0\n fi\n\n if [[ \"$resource\" == \"pod\" ]]; then\n if [[ \"$joined\" == *\"jsonpath={.status.podIP}\"* ]]; then\n echo \"10.0.0.10\"\n exit 0\n fi\n if [[ \"$joined\" == *\"jsonpath={.status.hostIP}\"* ]]; then\n echo \"192.168.1.10\"\n exit 0\n fi\n if [[ \"$joined\" == *\"--show-labels\"* ]]; then\n echo \"demo-pod app=demo\"\n exit 0\n fi\n if [[ \"$joined\" == *\"-o wide\"* ]]; then\n echo \"demo-pod 1/1 Running 0\"\n exit 0\n fi\n echo \"pod/${args[2]:-demo-pod}\"\n exit 0\n fi\n\n if [[ \"$resource\" == \"nodes\" ]]; then\n if [[ \"$joined\" == *\" -o wide \"* && \"${K8S_STUB_FAIL_NODE_LIST:-0}\" == \"1\" ]]; then\n echo \"node list failed\" >&2\n exit 1\n fi\n if [[ \"$joined\" == *\"jsonpath=\"* ]]; then\n echo -e \"node-a\\tTrue\"\n exit 0\n fi\n echo 
\"node-a Ready\"\n exit 0\n fi\n\n if [[ \"$resource\" == \"events\" ]]; then\n echo \"Normal Started pod/demo-pod\"\n exit 0\n fi\n\n echo \"stub get $resource\"\n exit 0\nfi\n\nif [[ \"$cmd\" == \"exec\" ]]; then\n idx=-1\n for i in \"${!args[@]}\"; do\n if [[ \"${args[$i]}\" == \"--\" ]]; then\n idx=$i\n break\n fi\n done\n\n if (( idx \u003c 0 )); then\n echo \"malformed exec command\" >&2\n exit 1\n fi\n\n exec_args=(\"${args[@]:idx+1}\")\n first=\"${exec_args[0]:-}\"\n\n if [[ \"$first\" == \"test\" && \"${exec_args[1]:-}\" == \"-r\" ]]; then\n if [[ \"${K8S_STUB_SA_FILES:-present}\" == \"present\" ]]; then\n exit 0\n fi\n exit 1\n fi\n\n if [[ \"$first\" == \"cat\" && \"${exec_args[1]:-}\" == \"/var/run/secrets/kubernetes.io/serviceaccount/token\" ]]; then\n if [[ \"${K8S_STUB_SA_FILES:-present}\" == \"present\" ]]; then\n echo \"stub-token\"\n exit 0\n fi\n exit 1\n fi\n\n if [[ \"$first\" == \"cat\" && \"${exec_args[1]:-}\" == \"/etc/resolv.conf\" ]]; then\n echo \"nameserver 10.96.0.10\"\n exit 0\n fi\n\n if [[ \"$first\" == \"nslookup\" ]]; then\n if [[ \"${K8S_STUB_DNS_FAIL:-0}\" == \"1\" ]]; then\n exit 1\n fi\n echo \"Name: kubernetes.default.svc.cluster.local\"\n exit 0\n fi\n\n if [[ \"$first\" == \"getent\" ]]; then\n if [[ \"${K8S_STUB_DNS_FAIL:-0}\" == \"1\" ]]; then\n exit 1\n fi\n echo \"10.96.0.1 kubernetes.default.svc.cluster.local\"\n exit 0\n fi\n\n if [[ \"$first\" == \"curl\" ]]; then\n has_cacert=0\n has_insecure=0\n for arg in \"${exec_args[@]}\"; do\n [[ \"$arg\" == \"--cacert\" ]] && has_cacert=1\n [[ \"$arg\" == \"-k\" ]] && has_insecure=1\n done\n\n if [[ \"${K8S_STUB_EXPECT_SECURE:-0}\" == \"1\" ]]; then\n [[ \"$has_cacert\" -eq 1 && \"$has_insecure\" -eq 0 ]] || exit 1\n fi\n if [[ \"${K8S_STUB_EXPECT_INSECURE:-0}\" == \"1\" ]]; then\n [[ \"$has_insecure\" -eq 1 ]] || exit 1\n fi\n exit 0\n fi\n\n if [[ \"$first\" == \"wget\" ]]; then\n has_ca=0\n has_no_check=0\n for arg in \"${exec_args[@]}\"; do\n [[ \"$arg\" == 
--ca-certificate=* ]] && has_ca=1\n [[ \"$arg\" == \"--no-check-certificate\" ]] && has_no_check=1\n done\n\n if [[ \"${K8S_STUB_EXPECT_SECURE:-0}\" == \"1\" ]]; then\n [[ \"$has_ca\" -eq 1 ]] || exit 1\n fi\n if [[ \"${K8S_STUB_EXPECT_INSECURE:-0}\" == \"1\" ]]; then\n [[ \"$has_no_check\" -eq 1 ]] || exit 1\n fi\n exit 0\n fi\n\n exit 0\nfi\n\necho \"stub kubectl default response\"\nexit 0\nEOF\n\n chmod +x \"$TMP_DIR/bin/kubectl\"\n}\n\nrun_script() {\n local script=\"$1\"\n shift\n\n OUTPUT=\"\"\n EXIT_CODE=0\n : > \"$KUBECTL_LOG\"\n OUTPUT=$(\n PATH=\"$TMP_DIR/bin:/usr/bin:/bin:$PATH\" \\\n KUBECTL_STUB_LOG=\"$KUBECTL_LOG\" \\\n bash \"$script\" \"$@\" 2>&1\n ) || EXIT_CODE=$?\n}\n\nassert_exit() {\n local label=\"$1\"\n local expected=\"$2\"\n if [[ \"$EXIT_CODE\" -eq \"$expected\" ]]; then\n pass \"$label\"\n else\n fail \"$label (expected exit $expected, got $EXIT_CODE)\"\n echo \"$OUTPUT\" | sed 's/^/ /'\n fi\n}\n\nassert_output_contains() {\n local label=\"$1\"\n local pattern=\"$2\"\n if echo \"$OUTPUT\" | grep -qE -- \"$pattern\"; then\n pass \"$label\"\n else\n fail \"$label (pattern not found: $pattern)\"\n echo \"$OUTPUT\" | sed 's/^/ /'\n fi\n}\n\nassert_log_contains() {\n local label=\"$1\"\n local pattern=\"$2\"\n if grep -qE -- \"$pattern\" \"$KUBECTL_LOG\"; then\n pass \"$label\"\n else\n fail \"$label (pattern not found in kubectl log: $pattern)\"\n sed 's/^/ /' \"$KUBECTL_LOG\"\n fi\n}\n\nassert_log_not_contains() {\n local label=\"$1\"\n local pattern=\"$2\"\n if grep -qE -- \"$pattern\" \"$KUBECTL_LOG\"; then\n fail \"$label (unexpected pattern found in kubectl log: $pattern)\"\n sed 's/^/ /' \"$KUBECTL_LOG\"\n else\n pass \"$label\"\n fi\n}\n\nassert_no_bytecode_artifacts() {\n local findings\n findings=\"$(find \"$SKILL_DIR\" -type f \\( -name '*.pyc' -o -path '*/__pycache__/*' \\) -print)\"\n if [[ -z \"$findings\" ]]; then\n pass \"no Python bytecode artifacts exist under k8s-debug\"\n else\n fail \"no Python bytecode artifacts exist 
under k8s-debug\"\n echo \"$findings\" | sed 's/^/ /'\n fi\n}\n\necho \"Running k8s-debug shell regressions...\"\ncreate_kubectl_stub\n\necho \"\"\necho \"[P1] bytecode artifact hygiene\"\nassert_no_bytecode_artifacts\n\necho \"\"\necho \"[P0] network_debug secure-by-default API probe\"\nreset_stub_env\nexport K8S_STUB_EXPECT_SECURE=1\nrun_script \"$NETWORK_SCRIPT\" demo-pod\nassert_exit \"secure default run returns success\" 0\nassert_log_contains \"secure probe passes --cacert\" \"--cacert /var/run/secrets/kubernetes.io/serviceaccount/ca.crt\"\nassert_log_not_contains \"secure probe does not use -k\" \" exec .* -- curl .* -k \"\n\necho \"\"\necho \"[P0] network_debug insecure mode remains explicit\"\nreset_stub_env\nexport K8S_STUB_EXPECT_INSECURE=1\nexport K8S_STUB_SA_FILES=missing\nrun_script \"$NETWORK_SCRIPT\" --insecure demo-pod\nassert_exit \"insecure override run returns success\" 0\nassert_output_contains \"prints insecure override warning\" \"Insecure TLS mode enabled\"\nassert_log_contains \"insecure probe uses -k\" \" exec .* -- curl .* -k \"\n\necho \"\"\necho \"[P0/P1] secure mode fails when SA CA/token are missing\"\nreset_stub_env\nexport K8S_STUB_SA_FILES=missing\nrun_script \"$NETWORK_SCRIPT\" demo-pod\nassert_exit \"missing SA materials produce partial-failure exit code\" 1\nassert_output_contains \"missing SA files are reported\" \"service account CA/token files are missing in the pod\"\n\necho \"\"\necho \"[P1] --strict upgrades warnings in network_debug\"\nreset_stub_env\nexport K8S_STUB_CAN_I_EXEC=no\nrun_script \"$NETWORK_SCRIPT\" --strict demo-pod\nassert_exit \"strict mode returns failure on warnings\" 1\nassert_output_contains \"RBAC warning is surfaced\" \"RBAC may block 'kubectl exec'; in-pod checks may fail\"\n\necho \"\"\necho \"[P1] cluster_health blocked precondition returns exit 2\"\nreset_stub_env\nexport K8S_STUB_CONTEXT_FAIL=1\nrun_script \"$CLUSTER_SCRIPT\"\nassert_exit \"missing context is blocked\" 2\nassert_output_contains 
\"blocked error message is clear\" \"No active Kubernetes context\"\n\necho \"\"\necho \"[P1] cluster_health check failure returns exit 1\"\nreset_stub_env\nexport K8S_STUB_FAIL_NODE_LIST=1\nrun_script \"$CLUSTER_SCRIPT\"\nassert_exit \"node list failure maps to exit 1\" 1\nassert_output_contains \"node list failure is reported\" \"Node list failed; continuing\"\n\necho \"\"\necho \"Test summary: PASS=$PASS FAIL=$FAIL\"\nif [[ \"$FAIL\" -ne 0 ]]; then\n exit 1\nfi\n\necho \"All k8s-debug shell regressions passed.\"\n","content_type":"application/x-sh; charset=utf-8","language":"bash","size":10554,"content_sha256":"322004769597a2f1760d129cc3182b57049d0f1cec41dc0eb254143c1b8ad470"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"Kubernetes Debugging Skill","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Overview","type":"text"}]},{"type":"paragraph","content":[{"text":"Systematic toolkit for debugging Kubernetes clusters, workloads, networking, and storage with a deterministic, safety-first workflow.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Trigger Phrases","type":"text"}]},{"type":"paragraph","content":[{"text":"Use this skill when requests resemble:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\"My pod is in ","type":"text"},{"text":"CrashLoopBackOff","type":"text","marks":[{"type":"code_inline"}]},{"text":"; help me find the root cause.\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\"Service DNS works in one pod but not another.\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\"Deployment rollout is stuck.\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\"Pods are 
","type":"text"},{"text":"Pending","type":"text","marks":[{"type":"code_inline"}]},{"text":" and not scheduling.\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\"Cluster health looks degraded after a change.\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\"PVC is pending and pods cannot mount storage.\"","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Prerequisites","type":"text"}]},{"type":"paragraph","content":[{"text":"Run from the skill directory (","type":"text"},{"text":"devops-skills-plugin/skills/k8s-debug","type":"text","marks":[{"type":"code_inline"}]},{"text":") so relative script paths work as written.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Required","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"kubectl","type":"text","marks":[{"type":"code_inline"}]},{"text":" installed and configured.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"An active cluster context.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Read access to namespaces, pods, events, services, and nodes.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Quick preflight:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"kubectl config current-context\nkubectl auth can-i get pods -A\nkubectl auth can-i get events -A\nkubectl get ns","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Optional but Recommended","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"jq","type":"text","marks":[{"type":"code_inline"}]},{"text":" for more precise filtering in 
","type":"text"},{"text":"./scripts/cluster_health.sh","type":"text","marks":[{"type":"code_inline"}]},{"text":".","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Metrics API (","type":"text"},{"text":"metrics-server","type":"text","marks":[{"type":"code_inline"}]},{"text":") for ","type":"text"},{"text":"kubectl top","type":"text","marks":[{"type":"code_inline"}]},{"text":".","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"In-container debug tools (","type":"text"},{"text":"nslookup","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"getent","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"curl","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"wget","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"ip","type":"text","marks":[{"type":"code_inline"}]},{"text":") for deep network tests.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Fallback behavior:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"If optional tools are missing, scripts continue and print warnings with reduced output.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"If ","type":"text"},{"text":"kubectl top","type":"text","marks":[{"type":"code_inline"}]},{"text":" is unavailable, continue with ","type":"text"},{"text":"kubectl describe","type":"text","marks":[{"type":"code_inline"}]},{"text":" and events.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"When to Use This Skill","type":"text"}]},{"type":"paragraph","content":[{"text":"Use this skill for:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Pod failures (CrashLoopBackOff, ImagePullBackOff, 
Pending, OOMKilled)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Service connectivity or DNS resolution issues","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Network policy or ingress problems","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Volume and storage mount failures","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Deployment rollout issues","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Cluster health or performance degradation","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Resource exhaustion (CPU/memory)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Configuration problems (ConfigMaps, Secrets, RBAC)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Safety Rules for Disruptive Commands","type":"text"}]},{"type":"paragraph","content":[{"text":"Default mode is read-only diagnosis first. Only execute disruptive commands after confirming blast radius and rollback.","type":"text"}]},{"type":"paragraph","content":[{"text":"Commands requiring explicit confirmation:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"kubectl delete pod ... 
--force --grace-period=0","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"kubectl drain ...","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"kubectl rollout restart ...","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"kubectl rollout undo ...","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"kubectl debug ... --copy-to=...","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"Before disruptive actions:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Snapshot current state for rollback and incident notes\nkubectl get deploy,rs,pod,svc -n \u003cnamespace> -o wide\nkubectl get pod \u003cpod-name> -n \u003cnamespace> -o yaml > before-\u003cpod-name>.yaml\nkubectl get events -n \u003cnamespace> --sort-by='.lastTimestamp' > before-events.txt","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Reference Navigation Map","type":"text"}]},{"type":"paragraph","content":[{"text":"Load only the section needed for the observed symptom.","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Symptom / Need","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Open","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Start 
section","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"You need an end-to-end diagnosis path","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./references/troubleshooting_workflow.md","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"General Debugging Workflow","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Pod state is ","type":"text"},{"text":"Pending","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"CrashLoopBackOff","type":"text","marks":[{"type":"code_inline"}]},{"text":", or ","type":"text"},{"text":"ImagePullBackOff","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./references/troubleshooting_workflow.md","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Pod Lifecycle Troubleshooting","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Service reachability or DNS 
failure","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./references/troubleshooting_workflow.md","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Network Troubleshooting Workflow","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Node pressure or performance regression","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./references/troubleshooting_workflow.md","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Resource and Performance Workflow","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"PVC / PV / storage class issues","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./references/troubleshooting_workflow.md","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Storage Troubleshooting Workflow","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Quick symptom-to-fix 
lookup","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./references/common_issues.md","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"matching issue heading","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Post-mortem fix options for known issues","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./references/common_issues.md","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Solutions","type":"text","marks":[{"type":"code_inline"}]},{"text":" sections","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Scripts Overview","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Script","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Purpose","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Required args","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Optional 
args","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Output","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Fallback behavior","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/cluster_health.sh","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Cluster-wide health snapshot (nodes, workloads, events, common failure states)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"None","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--strict","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"K8S_REQUEST_TIMEOUT","type":"text","marks":[{"type":"code_inline"}]},{"text":" env var","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Sectioned report to stdout","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Continues on check failures, tracks them in summary and exit 
code","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/network_debug.sh","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Pod-centric network and DNS diagnostics","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\u003cpod-name>","type":"text","marks":[{"type":"code_inline"}]},{"text":" (","type":"text"},{"text":"\u003cnamespace>","type":"text","marks":[{"type":"code_inline"}]},{"text":" defaults to ","type":"text"},{"text":"default","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--strict","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--insecure","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"K8S_REQUEST_TIMEOUT","type":"text","marks":[{"type":"code_inline"}]},{"text":" env var","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Sectioned report to stdout","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Uses secure API probe by default; insecure TLS requires explicit 
","type":"text"},{"text":"--insecure","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/pod_diagnostics.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Deep pod diagnostics (status, describe, YAML, events, per-container logs, node context)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\u003cpod-name>","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"-n/--namespace","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"-o/--output","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Sectioned report to stdout or file","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Fails fast on missing access; skips optional metrics/log blocks with clear messages","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Script Exit Codes","type":"text"}]},{"type":"paragraph","content":[{"text":"./scripts/cluster_health.sh","type":"text","marks":[{"type":"code_inline"}]},{"text":" and ","type":"text"},{"text":"./scripts/network_debug.sh","type":"text","marks":[{"type":"code_inline"}]},{"text":" share the same 
contract:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"0","type":"text","marks":[{"type":"code_inline"}]},{"text":": checks completed with no check failures (warnings allowed unless ","type":"text"},{"text":"--strict","type":"text","marks":[{"type":"code_inline"}]},{"text":" is set).","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"1","type":"text","marks":[{"type":"code_inline"}]},{"text":": one or more checks failed, or warnings occurred in ","type":"text"},{"text":"--strict","type":"text","marks":[{"type":"code_inline"}]},{"text":" mode.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"2","type":"text","marks":[{"type":"code_inline"}]},{"text":": blocked preconditions (for example: missing ","type":"text"},{"text":"kubectl","type":"text","marks":[{"type":"code_inline"}]},{"text":", no active context, inaccessible namespace/pod).","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Deterministic Debugging Workflow","type":"text"}]},{"type":"paragraph","content":[{"text":"Follow this systematic approach for any Kubernetes issue:","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"1. Preflight and Scope","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"kubectl config current-context\nkubectl get ns\nkubectl auth can-i get pods -n \u003cnamespace>","type":"text"}]},{"type":"paragraph","content":[{"text":"If preflight fails, stop and fix access/context first.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"2. 
Identify the Problem Layer","type":"text"}]},{"type":"paragraph","content":[{"text":"Categorize the issue:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Application Layer","type":"text","marks":[{"type":"strong"}]},{"text":": Application crashes, errors, bugs","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Pod Layer","type":"text","marks":[{"type":"strong"}]},{"text":": Pod not starting, restarting, or pending","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Service Layer","type":"text","marks":[{"type":"strong"}]},{"text":": Network connectivity, DNS issues","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Node Layer","type":"text","marks":[{"type":"strong"}]},{"text":": Node not ready, resource exhaustion","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Cluster Layer","type":"text","marks":[{"type":"strong"}]},{"text":": Control plane issues, API problems","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Storage Layer","type":"text","marks":[{"type":"strong"}]},{"text":": Volume mount failures, PVC issues","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Configuration Layer","type":"text","marks":[{"type":"strong"}]},{"text":": ConfigMap, Secret, RBAC issues","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"3. 
Gather Diagnostics with the Right Script","type":"text"}]},{"type":"paragraph","content":[{"text":"Use the appropriate diagnostic script based on scope:","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"Pod-Level Diagnostics","type":"text"}]},{"type":"paragraph","content":[{"text":"Use ","type":"text"},{"text":"./scripts/pod_diagnostics.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" for comprehensive pod analysis:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python3 ./scripts/pod_diagnostics.py \u003cpod-name> -n \u003cnamespace>","type":"text"}]},{"type":"paragraph","content":[{"text":"This script gathers:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Pod status and description","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Pod events","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Container logs (current and previous)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Resource usage","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Node information","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"YAML configuration","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Output can be saved for analysis:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python3 ./scripts/pod_diagnostics.py \u003cpod-name> -n \u003cnamespace> -o diagnostics.txt","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"Cluster-Level Health Check","type":"text"}]},{"type":"paragraph","content":[{"text":"Use ","type":"text"},{"text":"./scripts/cluster_health.sh","type":"text","marks":[{"type":"code_inline"}]},{"text":" for overall 
cluster diagnostics:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"./scripts/cluster_health.sh > cluster-health-$(date +%Y%m%d-%H%M%S).txt","type":"text"}]},{"type":"paragraph","content":[{"text":"This script checks:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Cluster info and version","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Node status and resources","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Pods across all namespaces","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Failed/pending pods","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Recent events","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Deployments, services, statefulsets, daemonsets","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"PVCs and PVs","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Component health","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Common error states (CrashLoopBackOff, ImagePullBackOff)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"Network Diagnostics","type":"text"}]},{"type":"paragraph","content":[{"text":"Use ","type":"text"},{"text":"./scripts/network_debug.sh","type":"text","marks":[{"type":"code_inline"}]},{"text":" for connectivity issues:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"./scripts/network_debug.sh \u003cnamespace> \u003cpod-name>\n# or force warning sensitivity / insecure TLS only when explicitly needed:\n./scripts/network_debug.sh --strict \u003cnamespace> 
\u003cpod-name>\n./scripts/network_debug.sh --insecure \u003cnamespace> \u003cpod-name>","type":"text"}]},{"type":"paragraph","content":[{"text":"This script analyzes:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Pod network configuration","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"DNS setup and resolution","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Service endpoints","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Network policies","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Connectivity tests","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"CoreDNS logs","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"4. Follow Issue-Specific Reference Workflow","type":"text"}]},{"type":"paragraph","content":[{"text":"Based on the identified issue, consult ","type":"text"},{"text":"./references/troubleshooting_workflow.md","type":"text","marks":[{"type":"code_inline"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Pod Pending","type":"text","marks":[{"type":"strong"}]},{"text":": Resource/scheduling workflow","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"CrashLoopBackOff","type":"text","marks":[{"type":"strong"}]},{"text":": Application crash workflow","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"ImagePullBackOff","type":"text","marks":[{"type":"strong"}]},{"text":": Image pull workflow","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Service issues","type":"text","marks":[{"type":"strong"}]},{"text":": Network connectivity 
workflow","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"DNS failures","type":"text","marks":[{"type":"strong"}]},{"text":": DNS troubleshooting workflow","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Resource exhaustion","type":"text","marks":[{"type":"strong"}]},{"text":": Performance investigation workflow","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Storage issues","type":"text","marks":[{"type":"strong"}]},{"text":": PVC binding workflow","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Deployment stuck","type":"text","marks":[{"type":"strong"}]},{"text":": Rollout workflow","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"5. Apply Targeted Fixes","type":"text"}]},{"type":"paragraph","content":[{"text":"Refer to ","type":"text"},{"text":"./references/common_issues.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" for symptom-specific fixes.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"6. 
Verify and Close","type":"text"}]},{"type":"paragraph","content":[{"text":"Run final verification:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"kubectl get pods -n \u003cnamespace> -o wide\nkubectl get events -n \u003cnamespace> --sort-by='.lastTimestamp' | tail -20\nkubectl rollout status deployment/\u003cname> -n \u003cnamespace>","type":"text"}]},{"type":"paragraph","content":[{"text":"Issue is done when user-visible behavior is healthy and no new critical warning events appear.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Example Flows","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Example 1: CrashLoopBackOff in ","type":"text"},{"text":"payments","type":"text","marks":[{"type":"code_inline"}]},{"text":" Namespace","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python3 ./scripts/pod_diagnostics.py payments-api-7c97f95dfb-q9l7k -n payments -o payments-diagnostics.txt\nkubectl logs payments-api-7c97f95dfb-q9l7k -n payments --previous --tail=100\nkubectl get deploy payments-api -n payments -o yaml | grep -A 8 livenessProbe","type":"text"}]},{"type":"paragraph","content":[{"text":"Then open ","type":"text"},{"text":"./references/common_issues.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" and apply the ","type":"text"},{"text":"CrashLoopBackOff","type":"text","marks":[{"type":"code_inline"}]},{"text":" solutions.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Example 2: Service DNS/Connectivity Failure","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"./scripts/network_debug.sh checkout checkout-api-75f49c9d8f-z6qtm\nkubectl get svc checkout-api -n checkout\nkubectl get endpoints checkout-api -n checkout\nkubectl get networkpolicies -n checkout","type":"text"}]},{"type":"paragraph","content":[{"text":"Then 
follow ","type":"text"},{"text":"Service Connectivity Workflow","type":"text","marks":[{"type":"code_inline"}]},{"text":" in ","type":"text"},{"text":"./references/troubleshooting_workflow.md","type":"text","marks":[{"type":"code_inline"}]},{"text":".","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Essential Manual Commands","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Pod Debugging","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# View pod status\nkubectl get pods -n \u003cnamespace> -o wide\n\n# Detailed pod information\nkubectl describe pod \u003cpod-name> -n \u003cnamespace>\n\n# View logs\nkubectl logs \u003cpod-name> -n \u003cnamespace>\nkubectl logs \u003cpod-name> -n \u003cnamespace> --previous # Previous container\nkubectl logs \u003cpod-name> -n \u003cnamespace> -c \u003ccontainer> # Specific container\n\n# Execute commands in pod\nkubectl exec \u003cpod-name> -n \u003cnamespace> -it -- /bin/sh\n\n# Get pod YAML\nkubectl get pod \u003cpod-name> -n \u003cnamespace> -o yaml","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Service and Network Debugging","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Check services\nkubectl get svc -n \u003cnamespace>\nkubectl describe svc \u003cservice-name> -n \u003cnamespace>\n\n# Check endpoints\nkubectl get endpoints -n \u003cnamespace>\n\n# Test DNS\nkubectl exec \u003cpod-name> -n \u003cnamespace> -- nslookup kubernetes.default\n\n# View events\nkubectl get events -n \u003cnamespace> --sort-by='.lastTimestamp'","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Resource Monitoring","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Node resources\nkubectl top nodes\nkubectl describe nodes\n\n# Pod resources\nkubectl top pods -n \u003cnamespace>\nkubectl top 
pod \u003cpod-name> -n \u003cnamespace> --containers","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Emergency Operations","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Restart deployment\nkubectl rollout restart deployment/\u003cname> -n \u003cnamespace>\n\n# Rollback deployment\nkubectl rollout undo deployment/\u003cname> -n \u003cnamespace>\n\n# Force delete stuck pod\nkubectl delete pod \u003cpod-name> -n \u003cnamespace> --force --grace-period=0\n\n# Drain node (maintenance)\nkubectl drain \u003cnode-name> --ignore-daemonsets --delete-emptydir-data\n\n# Cordon node (prevent scheduling)\nkubectl cordon \u003cnode-name>","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Completion Criteria","type":"text"}]},{"type":"paragraph","content":[{"text":"Troubleshooting session is complete when all are true:","type":"text"}]},{"type":"checkbox_list","attrs":{"id":null},"content":[{"type":"checkbox_item","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Cluster context and namespace are confirmed.","type":"text"}]}]},{"type":"checkbox_item","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Relevant diagnostic script output is captured.","type":"text"}]}]},{"type":"checkbox_item","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Root cause is identified and tied to evidence (events/logs/config/state).","type":"text"}]}]},{"type":"checkbox_item","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Any disruptive action was preceded by snapshot and rollback plan.","type":"text"}]}]},{"type":"checkbox_item","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Fix verification commands show healthy state.","type":"text"}]}]},{"type":"checkbox_item","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Reference path used 
(","type":"text"},{"text":"./references/troubleshooting_workflow.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" or ","type":"text"},{"text":"./references/common_issues.md","type":"text","marks":[{"type":"code_inline"}]},{"text":") is documented in notes.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Related Tools","type":"text"}]},{"type":"paragraph","content":[{"text":"Useful additional tools for Kubernetes debugging:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"kubectl-debug","type":"text","marks":[{"type":"strong"}]},{"text":": Advanced debugging plugin","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"stern","type":"text","marks":[{"type":"strong"}]},{"text":": Multi-pod log tailing","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"kubectx/kubens","type":"text","marks":[{"type":"strong"}]},{"text":": Context and namespace switching","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"k9s","type":"text","marks":[{"type":"strong"}]},{"text":": Terminal UI for Kubernetes","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"lens","type":"text","marks":[{"type":"strong"}]},{"text":": Desktop IDE for Kubernetes","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Prometheus/Grafana","type":"text","marks":[{"type":"strong"}]},{"text":": Monitoring and alerting","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Jaeger/Zipkin","type":"text","marks":[{"type":"strong"}]},{"text":": Distributed 
tracing","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}}]},"metadata":{"date":"2026-06-05","name":"k8s-debug","author":"@skillopedia","source":{"stars":224,"repo_name":"cc-devops-skills","origin_url":"https://github.com/akin-ozer/cc-devops-skills/blob/HEAD/devops-skills-plugin/skills/k8s-debug/SKILL.md","repo_owner":"akin-ozer","body_sha256":"776ac2c435256fdd46347af5deb268ba207bccc4b292b485d2ca11f102dc45b4","cluster_key":"036e7a8a9ec100e4a4263e54555590738ef81d736dfe2f3ba192ae0044df7209","clean_bundle":{"format":"clean-skill-bundle-v1","source":"akin-ozer/cc-devops-skills/devops-skills-plugin/skills/k8s-debug/SKILL.md","attachments":[{"id":"6daa29d7-658b-5188-a641-74891c0570c8","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/6daa29d7-658b-5188-a641-74891c0570c8/attachment.md","path":"references/common_issues.md","size":9337,"sha256":"737c2d0094922445e46ddf54434a6e5766d95085d11e8034aeef974717d12a1f","contentType":"text/markdown; charset=utf-8"},{"id":"8318ef4a-9a38-5968-8a3c-3e6669dafa34","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/8318ef4a-9a38-5968-8a3c-3e6669dafa34/attachment.md","path":"references/troubleshooting_workflow.md","size":9940,"sha256":"14ee2dcb20afd8d66d9167819f97a6e1601aba89b8d19a0fbc22e5745a986fb7","contentType":"text/markdown; charset=utf-8"},{"id":"ff03c7b8-a84f-50fe-9c1e-40beec80ae8a","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/ff03c7b8-a84f-50fe-9c1e-40beec80ae8a/attachment.sh","path":"scripts/cluster_health.sh","size":6602,"sha256":"ae0212b76ac1cfc62bc040168253a268811be624ca217c56cb2f2db4d30ac828","contentType":"application/x-sh; charset=utf-8"},{"id":"ad1be09f-07b0-5b65-8844-9a2aea59b911","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/ad1be09f-07b0-5b65-8844-9a2aea59b911/attachment.sh","path":"scripts/network_debug.sh","size":9298,"sha256":"cede308e036b03bca55f7fcd84ecb20625c1f9a349d852c0b03873f51534b2fa","contentType":"application/x-sh; 
charset=utf-8"},{"id":"3a9e8e54-883b-5dee-9edc-b23dd597b89c","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/3a9e8e54-883b-5dee-9edc-b23dd597b89c/attachment.py","path":"scripts/pod_diagnostics.py","size":8967,"sha256":"e31bc06934e033c29465ca18a292700a80438607abbeeac5a3573f61e847b163","contentType":"text/x-python; charset=utf-8"},{"id":"f73e27d1-9ee6-5e53-8d8d-a1812d2ab2d3","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/f73e27d1-9ee6-5e53-8d8d-a1812d2ab2d3/attachment.py","path":"tests/test_pod_diagnostics.py","size":7800,"sha256":"788c781382a0ac5a6964e40216a2dc45e33960c3fbb56933b1778b26845b093e","contentType":"text/x-python; charset=utf-8"},{"id":"5f44c905-3697-5b3e-b289-9806a21267eb","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/5f44c905-3697-5b3e-b289-9806a21267eb/attachment.sh","path":"tests/test_regressions.sh","size":10554,"sha256":"322004769597a2f1760d129cc3182b57049d0f1cec41dc0eb254143c1b8ad470","contentType":"application/x-sh; charset=utf-8"}],"bundle_sha256":"dd7f3732d76e6f8d775ba12b7e2ffbe2cdeaa50a0481b74f553c26c7da9a037c","attachment_count":7,"text_attachments":7,"attachment_storage":"skillopedia-attachments-v1","binary_attachments":0,"excluded_attachments":[]},"cluster_size":1,"skill_md_path":"devops-skills-plugin/skills/k8s-debug/SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"devops-infrastructure","category_label":"DevOps"},"exact_dupes_collapsed_into_this":0},"version":"v1","category":"devops-infrastructure","import_tag":"clean-skills-v1","description":"Diagnose and fix Kubernetes pods, CrashLoopBackOff, Pending, DNS, networking, storage, and rollout failures with kubectl."}},"renderedAt":1782986663942}

Kubernetes Debugging Skill Overview Systematic toolkit for debugging Kubernetes clusters, workloads, networking, and storage with a deterministic, safety-first workflow. Trigger Phrases Use this skill when requests resemble: - "My pod is in CrashLoopBackOff; help me find the root cause." - "Service DNS works in one pod but not another." - "Deployment rollout is stuck." - "Pods are Pending and not scheduling." - "Cluster health looks degraded after a change." - "PVC is pending and pods cannot mount storage." Prerequisites Run from the skill directory (devops-skills-plugin/skills/k8s-debug) so relative script paths work as written. Required - kubectl installed…