HyperPod Issue Report Collect diagnostic logs from HyperPod cluster nodes via SSM, store results in S3. Supports both EKS and Slurm clusters with auto-detection. Uses the bundled for reliable parallel collection. Prerequisites - AWS CLI configured with permissions: , , , , , - Python 3.8+ and uv (see uv installation docs for install options) - SSM Agent running on target nodes; node IAM roles need / on the report bucket - For EKS clusters: kubectl installed and configured (see Workflow step 2) Workflow 1. Gather Information Collect from the user: - Cluster identifier (required): accepts clust…

, '_').replace('`', '_').replace('\"', '_').replace(\"'\", '_')[:50]\n output_file = f\"command_{i:02d}_{safe_name}.txt\"\n\n # Use shlex.quote() to safely escape the command for display in echo\n quoted_cmd = shlex.quote(cmd)\n\n cmd_line = f\"{cmd} > \\\"${{OUTPUT_DIR}}/{output_file}\\\" 2>&1 || echo \\\"Command failed with exit code $?\\\" >> \\\"${{OUTPUT_DIR}}/{output_file}\\\"\"\n\n script_lines.extend([\n f\"# Command {i}\",\n f\"echo 'Running: '{quoted_cmd}\",\n cmd_line,\n \"\",\n ])\n \n # Add S3 upload logic with new filename format\n script_lines.extend([\n \"# Upload results to S3\",\n f\"S3_BUCKET={shlex.quote(self.s3_bucket)}\",\n f\"S3_PREFIX={shlex.quote(self.report_s3_key + '/instances')}\",\n \"\",\n \"echo \\\"Creating tarball...\\\"\",\n \"TARBALL=\\\"/tmp/${INSTANCE_GROUP}_${INSTANCE_ID}.tar.gz\\\"\",\n \"tar -czf \\\"${TARBALL}\\\" -C /tmp \\\"$(basename ${OUTPUT_DIR})\\\"\",\n \"if [ $? -ne 0 ]; then\",\n \" echo \\\"ERROR: Failed to create tarball\\\"\",\n \" exit 1\",\n \"fi\",\n \"\",\n \"echo \\\"Uploading to S3...\\\"\",\n \"aws s3 cp \\\"${TARBALL}\\\" \\\"s3://${S3_BUCKET}/${S3_PREFIX}/$(basename ${TARBALL})\\\"\",\n \"\",\n \"if [ $? 
-eq 0 ]; then\",\n \" echo \\\"Successfully uploaded report to s3://${S3_BUCKET}/${S3_PREFIX}/$(basename ${TARBALL})\\\"\",\n \" rm -rf \\\"${OUTPUT_DIR}\\\" \\\"${TARBALL}\\\"\",\n \" echo \\\"Report collection completed for ${INSTANCE_GROUP}/${INSTANCE_ID}\\\"\",\n \" exit 0\",\n \"else\",\n \" echo \\\"ERROR: Failed to upload to S3\\\"\",\n \" exit 1\",\n \"fi\",\n ])\n \n return '\\n'.join(script_lines)\n \n def get_hyperpod_ssm_target(self, instance_id: str, instance_group_name: str) -> str:\n \"\"\"Construct the HyperPod SSM target format.\"\"\"\n if not self.cluster_id:\n raise ValueError(\"Cluster ID is required for HyperPod SSM targets\")\n return f\"sagemaker-cluster:{self.cluster_id}_{instance_group_name}-{instance_id}\"\n \n def execute_collection_on_node(self, node: Dict, commands: List[str], script_s3_uri: str) -> Dict:\n \"\"\"Execute the collection script on a single node via SSM using pexpect.\"\"\"\n instance_id = node['InstanceId']\n instance_group = node.get('NodeGroup', 'unknown')\n \n # Start timing\n start_time = time.time()\n \n try:\n ssm_target = self.get_hyperpod_ssm_target(instance_id, instance_group)\n except ValueError as e:\n return {\n 'InstanceId': instance_id,\n 'NodeGroup': instance_group,\n 'Success': False,\n 'Error': str(e),\n 'ElapsedTime': time.time() - start_time\n }\n \n # Build the command to download and execute the script with environment variables\n commands_to_run = [\n f\"aws s3 cp {shlex.quote(script_s3_uri)} /tmp/collector_script.sh\",\n \"chmod +x /tmp/collector_script.sh\",\n f\"INSTANCE_GROUP={shlex.quote(instance_group)} INSTANCE_ID={shlex.quote(instance_id)} CLUSTER_TYPE={shlex.quote(self.cluster_type)} /tmp/collector_script.sh\"\n ]\n \n full_command = \" && \".join(commands_to_run)\n \n print(f\"Executing collection on {instance_id} ({instance_group})...\")\n \n child = None\n custom_prompt = \"PEXPECT_READY# \"\n \n try:\n ssm_command = ['aws', 'ssm', 'start-session', '--target', ssm_target]\n if 
self.region:\n ssm_command.extend(['--region', self.region])\n\n if self.debug:\n print(f\"[DEBUG] {instance_id}: SSM command: {ssm_command}\")\n print(f\"[DEBUG] {instance_id}: Full command: {full_command}\")\n\n # Use pexpect to handle the interactive session\n # Note: No default timeout set - each expect() call has explicit timeout\n child = pexpect.spawn(ssm_command[0], ssm_command[1:], encoding='utf-8')\n child.logfile_read = None\n \n # Wait for initial prompt (60 seconds to handle slow SSM session initialization)\n initial_prompt_patterns = [\n r'[\\$#]\\s+', # Standard shell prompt\n r'sh-\\d+\\.\\d+[\\$#]\\s*', # sh prompt\n pexpect.TIMEOUT\n ]\n \n prompt_index = child.expect(initial_prompt_patterns, timeout=SSM_PROMPT_TIMEOUT)\n \n if prompt_index == len(initial_prompt_patterns) - 1: # TIMEOUT\n # Get output for debugging\n output_sample = \"\"\n if child and hasattr(child, 'before') and child.before:\n # Show more output to help diagnose the issue\n output_sample = child.before.strip()\n if len(output_sample) > 1000:\n output_sample = output_sample[-1000:] # Last 1000 chars\n \n error_msg = (\n f\"Failed to detect shell prompt after 60 seconds.\\n\"\n f\"This may indicate:\\n\"\n f\" - Custom SSM session configuration interfering with prompt detection\\n\"\n f\" - Non-standard shell prompt format\\n\"\n f\" - SSM session initialization issues\\n\"\n )\n \n if output_sample:\n error_msg += f\"\\nSession output received:\\n{output_sample}\\n\"\n error_msg += (\n f\"\\nExpected prompt patterns: $ or # followed by space\\n\"\n f\"If your cluster uses custom SSM session commands or non-standard prompts,\\n\"\n f\"this tool may not be compatible.\"\n )\n else:\n error_msg += \"\\nNo output received from SSM session.\"\n \n return {\n 'InstanceId': instance_id,\n 'NodeGroup': instance_group,\n 'Success': False,\n 'Error': error_msg\n }\n \n # Set custom prompt\n child.sendline(f'export PS1=\"{custom_prompt}\"')\n child.sendline('echo \"PROMPT_SET_MARKER\"')\n 
child.expect('PROMPT_SET_MARKER', timeout=SSM_PROMPT_TIMEOUT)\n child.expect(custom_prompt, timeout=SSM_PROMPT_TIMEOUT)\n \n if self.debug:\n print(f\"[DEBUG] {instance_id}: Custom prompt set\")\n \n # Execute the command and capture exit code immediately\n child.sendline(f'{full_command}; EXIT_CODE=$?; echo \"EXIT_CODE:$EXIT_CODE\"')\n \n # Wait for command completion (15 minutes for script execution)\n child.expect(custom_prompt, timeout=SSM_SCRIPT_EXECUTION_TIMEOUT)\n \n # Extract output\n output = child.before\n exit_code = 1 # Default to failure\n \n if output:\n lines = output.split('\\n')\n cleaned_lines = []\n command_echo_removed = False\n \n for line in lines:\n line_stripped = line.strip()\n \n # Remove command echo\n if not command_echo_removed and full_command in line:\n command_echo_removed = True\n continue\n \n # Extract exit code\n if line_stripped.startswith('EXIT_CODE:'):\n try:\n exit_code = int(line_stripped.split(':')[1].strip())\n except (ValueError, IndexError):\n pass\n continue\n \n if line_stripped:\n cleaned_lines.append(line_stripped)\n \n output = '\\n'.join(cleaned_lines)\n else:\n output = \"\"\n \n # Close session\n try:\n child.sendline('exit')\n child.expect(pexpect.EOF, timeout=5)\n except Exception:\n try:\n child.kill(signal.SIGINT)\n except Exception: # nosec B110 - best-effort cleanup\n pass\n \n # Determine success based on exit code OR successful S3 upload message\n # Some nodes may not properly echo the EXIT_CODE line due to terminal issues\n success_indicators = [\n exit_code == 0,\n 'Successfully uploaded report to s3://' in output,\n 'upload: ../../tmp/' in output and '.tar.gz to s3://' in output\n ]\n \n if any(success_indicators):\n return {\n 'InstanceId': instance_id,\n 'NodeGroup': instance_group,\n 'Success': True,\n 'Output': output,\n 'ElapsedTime': time.time() - start_time\n }\n else:\n # Show last 15 lines of output which usually contain the error\n output_lines = output.split('\\n')\n error_context = 
'\\n'.join(output_lines[-15:]) if len(output_lines) > 15 else output\n \n return {\n 'InstanceId': instance_id,\n 'NodeGroup': instance_group,\n 'Success': False,\n 'Error': f\"Script execution failed (exit code: {exit_code})\\n{error_context}\",\n 'Output': output,\n 'ElapsedTime': time.time() - start_time\n }\n \n except pexpect.TIMEOUT:\n # Show more context about where the timeout occurred\n output_sample = \"\"\n if child and hasattr(child, 'before') and child.before:\n output_sample = child.before.strip()\n if len(output_sample) > 1000:\n output_sample = output_sample[-1000:] # Last 1000 chars\n \n error_msg = (\n f\"Operation timed out during command execution.\\n\"\n f\"This may indicate:\\n\"\n f\" - Command taking longer than expected to complete\\n\"\n f\" - Custom shell configuration interfering with output detection\\n\"\n f\" - Network or SSM session issues\\n\"\n )\n \n if output_sample:\n error_msg += f\"\\nLast output received:\\n{output_sample}\"\n else:\n error_msg += \"\\nNo output received.\"\n \n return {\n 'InstanceId': instance_id,\n 'NodeGroup': instance_group,\n 'Success': False,\n 'Error': error_msg,\n 'ElapsedTime': time.time() - start_time\n }\n \n except pexpect.EOF:\n output_sample = \"\"\n if child and hasattr(child, 'before') and child.before:\n output_sample = child.before.strip()\n if len(output_sample) > 500:\n output_sample = output_sample[-500:] # Last 500 chars\n \n error_msg = \"SSM session ended unexpectedly\"\n if output_sample:\n error_msg += f\"\\nLast output:\\n{output_sample}\"\n \n return {\n 'InstanceId': instance_id,\n 'NodeGroup': instance_group,\n 'Success': False,\n 'Error': error_msg,\n 'ElapsedTime': time.time() - start_time\n }\n \n except Exception as e:\n error_msg = f\"Error executing command: {str(e)}\"\n if self.debug:\n error_msg += f\"\\nTraceback: {traceback.format_exc()}\"\n return {\n 'InstanceId': instance_id,\n 'NodeGroup': instance_group,\n 'Success': False,\n 'Error': error_msg,\n 'ElapsedTime': 
time.time() - start_time\n }\n \n finally:\n if child and child.isalive():\n try:\n child.terminate(force=True)\n except Exception: # nosec B110 - best-effort cleanup\n pass\n \n def execute_with_retry(self, node: Dict, commands: List[str], script_s3_uri: str, max_retries: int = 3) -> Dict:\n \"\"\"Execute collection on a node with exponential backoff on throttling errors.\"\"\"\n for attempt in range(max_retries):\n result = self.execute_collection_on_node(node, commands, script_s3_uri)\n \n error_msg = result.get('Error', '')\n if 'ThrottlingException' in error_msg or 'Rate exceeded' in error_msg:\n if attempt \u003c max_retries - 1:\n wait_time = 2 ** attempt\n if self.debug:\n print(f\"[DEBUG] {node['InstanceId']}: Throttled, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})\")\n time.sleep(wait_time)\n continue\n \n return result\n \n return result\n\n def collect_reports(self, commands: List[str], instance_groups: Optional[List[str]] = None, instance_ids: Optional[List[str]] = None, max_workers: int = 16):\n \"\"\"Collect reports from all nodes, specific instance groups, or specific instance IDs.\n \n For Slurm clusters, instance_ids can be either:\n - Instance IDs: i-0123456789abcdef0\n - Slurm node names: ip-10-1-104-161\n \n Note: max_workers defaults to 16 to balance speed and avoid SSM throttling on large clusters.\n \"\"\"\n # Get cluster nodes\n self.nodes = self.get_cluster_nodes()\n \n if not self.nodes:\n print(\"No nodes found in cluster\")\n return\n \n # Collect kubectl information first (for EKS clusters)\n if self.cluster_type == 'eks':\n self.collect_kubectl_node_info()\n \n # Filter by specific instance IDs or Slurm node names if specified\n if instance_ids:\n # Resolve node identifiers (handles both instance IDs and Slurm node names)\n resolved_instance_ids = self.resolve_node_identifiers(instance_ids)\n \n if not resolved_instance_ids:\n print(f\"No valid nodes found from specified identifiers: {', '.join(instance_ids)}\")\n 
return\n \n self.nodes = [n for n in self.nodes if n.get('InstanceId') in resolved_instance_ids]\n if not self.nodes:\n print(f\"No nodes found with specified identifiers: {', '.join(instance_ids)}\")\n return\n \n # Show which requested identifiers were not found\n found_ids = {n.get('InstanceId') for n in self.nodes}\n missing_ids = set(resolved_instance_ids) - found_ids\n if missing_ids:\n print(f\"Warning: Instance IDs not found in cluster: {', '.join(missing_ids)}\")\n # Filter by instance groups if specified (only if instance_ids not specified)\n elif instance_groups:\n # Convert instance groups to lowercase for case-insensitive matching\n instance_groups_lower = [ig.lower() for ig in instance_groups]\n self.nodes = [n for n in self.nodes if n.get('NodeGroup', '').lower() in instance_groups_lower]\n if not self.nodes:\n print(f\"No nodes found in instance groups: {', '.join(instance_groups)}\")\n return\n print(f\"Filtering to instance groups: {', '.join(instance_groups)}\")\n \n print(f\"\\nCollecting reports from {len(self.nodes)} nodes\")\n print(f\"Cluster type: {self.cluster_type.upper()}\")\n print(f\"Report ID: {self.report_id}\")\n print(f\"S3 Location: s3://{self.s3_bucket}/{self.report_s3_key}/\")\n \n # Show what will be collected based on cluster type\n if self.cluster_type == 'eks':\n print(f\"Default collections: nvidia-smi, containerd status, kubelet status, EKS log collector, resource config, cluster logs, systemd services, disk usage\")\n elif self.cluster_type == 'slurm':\n print(f\"Default collections: nvidia-smi, nvidia-bug-report, sinfo, Slurm services, Slurm config, Slurm logs, system logs\")\n \n if commands:\n print(f\"Additional commands: {', '.join(commands)}\")\n print(\"-\" * 60)\n \n # Generate and upload the collector script once\n script_content = self.generate_collector_script(commands)\n script_key = f\"{self.report_s3_key}/collector_script.sh\"\n \n try:\n self.s3_client.put_object(\n Bucket=self.s3_bucket,\n 
Key=script_key,\n Body=script_content.encode('utf-8'),\n ContentType='text/x-shellscript'\n )\n script_s3_uri = f\"s3://{self.s3_bucket}/{script_key}\"\n print(f\"Uploaded collector script to: {script_s3_uri}\")\n except Exception as e:\n print(f\"Error uploading collector script: {e}\")\n return\n \n # Execute collection on all nodes using ThreadPoolExecutor\n results = []\n \n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n future_to_node = {\n executor.submit(self.execute_with_retry, node, commands, script_s3_uri): node\n for node in self.nodes\n }\n \n for future in as_completed(future_to_node):\n node = future_to_node[future]\n try:\n result = future.result()\n results.append(result)\n \n status = \"✓\" if result['Success'] else \"✗\"\n elapsed = result.get('ElapsedTime', 0)\n print(f\"[{status}] {result['InstanceId']} ({result['NodeGroup']}) - {elapsed:.1f}s\")\n \n if not result['Success']:\n error_msg = result.get('Error', 'Unknown error')\n # Print error details with indentation for readability\n for line in error_msg.split('\\n'):\n if line.strip():\n print(f\" {line}\")\n \n except Exception as e:\n print(f\"[✗] {node['InstanceId']}: Exception: {e}\")\n results.append({\n 'InstanceId': node['InstanceId'],\n 'NodeGroup': node.get('NodeGroup', 'unknown'),\n 'Success': False,\n 'Error': str(e),\n 'ElapsedTime': 0\n })\n \n # Save summary\n summary_saved = self.save_summary(results)\n\n print(\"-\" * 60)\n print(f\"\\nReport collection completed!\")\n print(f\"Instance reports uploaded to: s3://{self.s3_bucket}/{self.report_s3_key}/instances/\")\n if summary_saved:\n print(f\"Summary: s3://{self.s3_bucket}/{self.report_s3_key}/summary.json\")\n else:\n print(\"Warning: Summary upload failed — see error above\")\n \n # Print statistics\n successful = sum(1 for r in results if r['Success'])\n failed = len(results) - successful\n print(f\"\\nStatistics:\")\n print(f\" Total nodes: {len(results)}\")\n print(f\" Successful: {successful}\")\n 
print(f\" Failed: {failed}\")\n \n # Offer to download results\n self.offer_download_results()\n \n def offer_download_results(self):\n \"\"\"Ask user if they want to download results from S3.\"\"\"\n print(\"\\n\" + \"=\" * 60)\n print(\"Download Results\")\n print(\"=\" * 60)\n \n try:\n response = input(\"\\nWould you like to download all results from S3 to the current directory? (y/n): \").strip().lower()\n \n if response in ['y', 'yes']:\n download_dir = self.download_results_from_s3()\n \n if download_dir:\n # Ask about creating zip archive\n response = input(\"\\nWould you like to create a zip archive of the downloaded results? (y/n): \").strip().lower()\n \n if response in ['y', 'yes']:\n self.create_zip_archive(download_dir)\n else:\n print(\"\\nSkipping download. You can download manually using:\")\n print(f\" aws s3 sync s3://{self.s3_bucket}/{self.report_s3_key}/ ./{self.cluster_name}_{self.report_id}/\")\n \n except KeyboardInterrupt:\n print(\"\\n\\nDownload cancelled by user.\")\n except Exception as e:\n print(f\"\\nError during download prompt: {e}\")\n \n def download_results_from_s3(self) -> Optional[str]:\n \"\"\"Download all results from S3 to local directory.\n \n Returns:\n str: Path to download directory if successful, None otherwise\n \"\"\"\n # Create download directory\n download_dir = f\"{self.cluster_name}_{self.report_id}\"\n \n print(f\"\\nDownloading results to: ./{download_dir}/\")\n print(f\"Source: s3://{self.s3_bucket}/{self.report_s3_key}/\")\n \n try:\n # List all objects in the S3 prefix\n paginator = self.s3_client.get_paginator('list_objects_v2')\n pages = paginator.paginate(Bucket=self.s3_bucket, Prefix=self.report_s3_key)\n \n files_to_download = []\n for page in pages:\n if 'Contents' in page:\n for obj in page['Contents']:\n key = obj['Key']\n # Skip the prefix itself (directory marker)\n if key != self.report_s3_key and key != f\"{self.report_s3_key}/\":\n files_to_download.append(key)\n \n if not files_to_download:\n 
print(\"No files found to download.\")\n return None\n \n print(f\"Found {len(files_to_download)} files to download...\")\n \n # Download each file\n downloaded = 0\n failed = 0\n \n for key in files_to_download:\n # Calculate relative path (remove the report_s3_key prefix)\n relative_path = key[len(self.report_s3_key):].lstrip('/')\n local_path = os.path.join(download_dir, relative_path)\n \n # Create parent directory if needed\n local_dir = os.path.dirname(local_path)\n if local_dir:\n os.makedirs(local_dir, exist_ok=True)\n \n try:\n # Download file\n self.s3_client.download_file(self.s3_bucket, key, local_path)\n downloaded += 1\n \n # Show progress for every 5 files or last file\n if downloaded % 5 == 0 or downloaded == len(files_to_download):\n print(f\" Downloaded {downloaded}/{len(files_to_download)} files...\")\n \n except Exception as e:\n print(f\" Failed to download {relative_path}: {e}\")\n failed += 1\n \n print(f\"\\n✓ Download completed!\")\n print(f\" Downloaded: {downloaded} files\")\n if failed > 0:\n print(f\" Failed: {failed} files\")\n print(f\" Location: ./{download_dir}/\")\n \n return download_dir\n \n except Exception as e:\n print(f\"\\nError downloading results: {e}\")\n if self.debug:\n traceback.print_exc()\n return None\n \n def create_zip_archive(self, directory: str):\n \"\"\"Create a zip archive of the downloaded results.\n \n Args:\n directory: Path to directory to archive\n \"\"\"\n zip_filename = f\"{directory}.zip\"\n \n print(f\"\\nCreating zip archive: {zip_filename}\")\n \n try:\n with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:\n # Walk through directory\n file_count = 0\n for root, dirs, files in os.walk(directory):\n for file in files:\n file_path = os.path.join(root, file)\n # Calculate archive name (relative to directory)\n arcname = os.path.relpath(file_path, os.path.dirname(directory))\n zipf.write(file_path, arcname)\n file_count += 1\n \n # Show progress\n if file_count % 5 == 0:\n print(f\" 
Archived {file_count} files...\")\n \n # Get zip file size\n zip_size = os.path.getsize(zip_filename)\n zip_size_mb = zip_size / (1024 * 1024)\n \n print(f\"\\n✓ Zip archive created!\")\n print(f\" File: {zip_filename}\")\n print(f\" Size: {zip_size_mb:.2f} MB\")\n print(f\" Files: {file_count}\")\n \n # Ask if user wants to delete the uncompressed directory\n response = input(f\"\\nWould you like to delete the uncompressed directory '{directory}'? (y/n): \").strip().lower()\n \n if response in ['y', 'yes']:\n shutil.rmtree(directory)\n print(f\"✓ Deleted directory: {directory}\")\n else:\n print(f\"Keeping directory: {directory}\")\n \n except Exception as e:\n print(f\"\\nError creating zip archive: {e}\")\n if self.debug:\n traceback.print_exc()\n \n def save_summary(self, results: List[Dict]) -> bool:\n \"\"\"Save collection summary to S3. Returns True on success.\"\"\"\n summary = {\n 'cluster_name': self.cluster_name,\n 'cluster_id': self.cluster_id,\n 'report_id': self.report_id,\n 'timestamp': datetime.now(timezone.utc).isoformat(),\n 'total_nodes': len(results),\n 'successful': sum(1 for r in results if r['Success']),\n 'failed': sum(1 for r in results if not r['Success']),\n 'results': results\n }\n\n summary_key = f\"{self.report_s3_key}/summary.json\"\n\n try:\n self.s3_client.put_object(\n Bucket=self.s3_bucket,\n Key=summary_key,\n Body=json.dumps(summary, indent=2).encode('utf-8'),\n ContentType='application/json'\n )\n print(f\"Summary saved to: s3://{self.s3_bucket}/{summary_key}\")\n return True\n except Exception as e:\n print(f\"Error saving summary: {e}\")\n return False\n \n def verify_kubectl_config(self) -> bool:\n \"\"\"Verify kubectl is configured for the EKS cluster.\"\"\"\n if not self.eks_cluster_name:\n print(\"Warning: EKS cluster name not available, skipping kubectl verification\")\n return False\n \n try:\n # Check if kubectl is installed\n result = subprocess.run(['kubectl', 'version', '--client'], # nosec B603 B607\n 
capture_output=True, text=True, timeout=10)\n if result.returncode != 0:\n print(\"\\n\" + \"!\" * 60)\n print(\"ERROR: kubectl is not installed or not in PATH\")\n print(\"!\" * 60)\n return False\n \n # Extract just the version line\n version_line = result.stdout.strip().split('\\n')[0] if result.stdout else \"kubectl installed\"\n print(f\"kubectl version: {version_line}\")\n \n # Check current context\n result = subprocess.run(['kubectl', 'config', 'current-context'], # nosec B603 B607\n capture_output=True, text=True, timeout=10)\n if result.returncode == 0:\n current_context = result.stdout.strip()\n print(f\"Current kubectl context: {current_context}\")\n \n # Check if context matches EKS cluster\n if self.eks_cluster_name in current_context:\n print(f\"✓ kubectl is configured for EKS cluster: {self.eks_cluster_name}\")\n return True\n else:\n # Extract region from EKS cluster ARN\n arn_parts = self.eks_cluster_arn.split(':') if self.eks_cluster_arn else []\n if len(arn_parts) \u003c= 3:\n print(f\"Error: Malformed EKS cluster ARN: {self.eks_cluster_arn}\")\n return False\n region = arn_parts[3]\n \n print(\"\\n\" + \"!\" * 60)\n print(f\"ERROR: kubectl context does not match EKS cluster\")\n print(f\"Current context: {current_context}\")\n print(f\"Expected cluster: {self.eks_cluster_name}\")\n print(\"!\" * 60)\n print(\"\\nTo configure kubectl for this EKS cluster, run:\")\n print(f\" aws eks update-kubeconfig --name {self.eks_cluster_name} --region {region}\")\n return False\n else:\n # Extract region from EKS cluster ARN\n region = self.eks_cluster_arn.split(':')[3] if self.eks_cluster_arn else 'REGION'\n \n print(\"\\n\" + \"!\" * 60)\n print(\"ERROR: No kubectl context configured\")\n print(\"!\" * 60)\n print(\"\\nTo configure kubectl for this EKS cluster, run:\")\n print(f\" aws eks update-kubeconfig --name {self.eks_cluster_name} --region {region}\")\n return False\n \n except subprocess.TimeoutExpired:\n print(\"Warning: kubectl command timed 
out\")\n return False\n except FileNotFoundError:\n print(\"\\n\" + \"!\" * 60)\n print(\"ERROR: kubectl not found in PATH\")\n print(\"!\" * 60)\n return False\n except Exception as e:\n print(f\"Warning: Error verifying kubectl config: {e}\")\n return False\n\n @staticmethod\n def _save_kubectl_result(result: subprocess.CompletedProcess,\n name: str, description: str,\n kubectl_output_dir: str, elapsed: float,\n successful: int, failed: int) -> tuple:\n \"\"\"Save kubectl output and update counters. Returns (successful, failed).\"\"\"\n output_file = os.path.join(kubectl_output_dir, f'{name}.txt')\n if result.returncode == 0:\n if result.stdout.strip():\n with open(output_file, 'w', encoding='utf-8') as f:\n f.write(result.stdout)\n print(f\" Collecting: {description}... ✓ ({elapsed:.1f}s)\")\n successful += 1\n else:\n with open(output_file, 'w', encoding='utf-8') as f:\n f.write(\"No resources found\\n\")\n print(f\" Collecting: {description}... ✓ (empty, {elapsed:.1f}s)\")\n successful += 1\n else:\n with open(output_file, 'w', encoding='utf-8') as f:\n f.write(f\"Error: {result.stderr}\\n\")\n print(f\" Collecting: {description}... 
✗ ({result.stderr.strip()[:50]}, {elapsed:.1f}s)\")\n failed += 1\n return successful, failed\n\n def collect_kubectl_node_info(self):\n \"\"\"Collect kubectl describe node information for all nodes.\"\"\"\n if self.cluster_type != 'eks':\n print(\"Skipping kubectl collection - not an EKS cluster\")\n return\n\n if not self.eks_cluster_name:\n print(\"Skipping kubectl collection - EKS cluster name not available\")\n return\n\n print(\"\\n\" + \"=\" * 60)\n print(\"Collecting kubectl node information...\")\n print(\"=\" * 60)\n\n # Verify kubectl configuration - exit if not configured\n if not self.verify_kubectl_config():\n print(\"\\n\" + \"!\" * 60)\n print(\"ERROR: kubectl must be configured for EKS clusters\")\n print(\"!\" * 60)\n print(\"\\nPlease configure kubectl and re-run the tool.\\n\")\n sys.exit(1)\n\n kubectl_output_dir = None\n tarball_path = None\n try:\n # Create output directory\n kubectl_output_dir = tempfile.mkdtemp(prefix='kubectl_output_')\n \n # Each subprocess.run uses static string arguments so security\n # linters can verify no dynamic command injection is possible.\n print(\"Collecting 15 Kubernetes resource types...\")\n successful = 0\n failed = 0\n timeout = KUBECTL_TIMEOUT\n\n # High Priority - Essential for troubleshooting\n t = time.time()\n successful, failed = self._save_kubectl_result(\n subprocess.run(['kubectl', 'describe', 'nodes'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'nodes_describe', 'Node descriptions (capacity, conditions, pods)',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n t = time.time()\n successful, failed = self._save_kubectl_result(\n subprocess.run(['kubectl', 'get', 'pods', '-A', '-o', 'wide'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'pods_all_namespaces', 'All pods across namespaces (wide output)',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n t = time.time()\n successful, failed = self._save_kubectl_result(\n 
subprocess.run(['kubectl', 'describe', 'pods', '-A'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'pods_describe_all_namespaces', 'Detailed pod descriptions (all namespaces)',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n t = time.time()\n successful, failed = self._save_kubectl_result(\n subprocess.run(['kubectl', 'get', 'events', '-A', '--sort-by=.lastTimestamp'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'events_all_namespaces', 'Cluster events sorted by timestamp',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n t = time.time()\n successful, failed = self._save_kubectl_result(\n subprocess.run(['kubectl', 'get', 'pvc', '-A', '-o', 'wide'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'pvcs_all_namespaces', 'PersistentVolumeClaims (storage)',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n t = time.time()\n successful, failed = self._save_kubectl_result(\n subprocess.run(['kubectl', 'describe', 'pvc', '-A'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'pvcs_describe_all_namespaces', 'Detailed PVC descriptions',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n t = time.time()\n successful, failed = self._save_kubectl_result(\n subprocess.run(['kubectl', 'get', 'svc', '-A', '-o', 'wide'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'services_all_namespaces', 'Services (network endpoints)',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n t = time.time()\n successful, failed = self._save_kubectl_result(\n subprocess.run(['kubectl', 'describe', 'svc', '-A'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'services_describe_all_namespaces', 'Detailed service descriptions',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n # Medium Priority - Very useful\n t = time.time()\n successful, failed = 
self._save_kubectl_result(\n subprocess.run(['kubectl', 'get', 'deployments', '-A', '-o', 'wide'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'deployments_all_namespaces', 'Deployments',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n t = time.time()\n successful, failed = self._save_kubectl_result(\n subprocess.run(['kubectl', 'get', 'statefulsets', '-A', '-o', 'wide'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'statefulsets_all_namespaces', 'StatefulSets',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n t = time.time()\n successful, failed = self._save_kubectl_result(\n subprocess.run(['kubectl', 'get', 'daemonsets', '-A', '-o', 'wide'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'daemonsets_all_namespaces', 'DaemonSets',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n t = time.time()\n successful, failed = self._save_kubectl_result(\n subprocess.run(['kubectl', 'get', 'configmaps', '-A'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'configmaps_all_namespaces', 'ConfigMaps (metadata only)',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n t = time.time()\n successful, failed = self._save_kubectl_result(\n subprocess.run(['kubectl', 'get', 'secrets', '-A'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'secrets_all_namespaces', 'Secrets (metadata only, no content)',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n t = time.time()\n successful, failed = self._save_kubectl_result(\n subprocess.run(['kubectl', 'get', 'resourcequota', '-A'], # nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'resourcequotas_all_namespaces', 'Resource quotas',\n kubectl_output_dir, time.time() - t, successful, failed)\n\n t = time.time()\n successful, failed = self._save_kubectl_result(\n subprocess.run(['kubectl', 'get', 'networkpolicies', '-A'], # 
nosec B603 B607\n capture_output=True, text=True, timeout=timeout),\n 'networkpolicies_all_namespaces', 'Network policies',\n kubectl_output_dir, time.time() - t, successful, failed)\n \n print(f\"\\nCollection summary: {successful} successful, {failed} failed\")\n \n # Create tarball with files at root level (no wrapper directory)\n print(\"\\nCreating kubectl output tarball...\")\n tarball_fd, tarball_path = tempfile.mkstemp(suffix='_kubectl_resources.tar.gz')\n os.close(tarball_fd)\n \n with tarfile.open(tarball_path, 'w:gz') as tar:\n # Add each file directly to the tarball root (no parent directory)\n for filename in os.listdir(kubectl_output_dir):\n file_path = os.path.join(kubectl_output_dir, filename)\n tar.add(file_path, arcname=filename)\n \n print(f\"Created tarball: {tarball_path}\")\n \n # Upload to S3\n s3_key = f\"{self.report_s3_key}/kubectl_resources.tar.gz\"\n print(f\"Uploading to S3: s3://{self.s3_bucket}/{s3_key}\")\n \n self.s3_client.upload_file(tarball_path, self.s3_bucket, s3_key)\n \n print(f\"✓ Successfully uploaded kubectl resource information to S3\")\n print(f\" Location: s3://{self.s3_bucket}/{s3_key}\")\n\n except Exception as e:\n print(f\"Error collecting kubectl information: {e}\")\n if self.debug:\n traceback.print_exc()\n raise\n finally:\n # Cleanup temp files regardless of success or failure\n if kubectl_output_dir and os.path.isdir(kubectl_output_dir):\n shutil.rmtree(kubectl_output_dir, ignore_errors=True)\n if tarball_path and os.path.exists(tarball_path):\n os.remove(tarball_path)\n\n\ndef main():\n # Check platform compatibility\n if platform.system() == 'Windows':\n print(\"=\" * 70)\n print(\"ERROR: Windows is not supported\")\n print(\"=\" * 70)\n print()\n print(\"This tool uses pexpect for interactive SSM sessions, which has\")\n print(\"different behavior on Windows compared to macOS and Linux.\")\n print()\n print(\"Supported platforms:\")\n print(\" - macOS\")\n print(\" - Linux\")\n print()\n print(\"Please run 
this tool from a macOS or Linux machine, or use WSL\")\n print(\"(Windows Subsystem for Linux) if you're on Windows.\")\n print()\n sys.exit(1)\n \n parser = argparse.ArgumentParser(\n description='HyperPod Issue Report Collector - Supports both EKS and Slurm clusters',\n formatter_class=argparse.RawDescriptionHelpFormatter,\n epilog=\"\"\"\nExamples:\n # Basic usage - auto-detects cluster type\n python hyperpod_issue_report.py --cluster my-cluster --region us-west-2 --s3-path s3://my-bucket\n\n # With custom prefix and additional commands\n python hyperpod_issue_report.py --cluster my-cluster --region us-west-2 --s3-path s3://my-bucket/diagnostics \\\\\n --command \"df -h\" --command \"free -h\"\n\n # Target specific instance groups\n python hyperpod_issue_report.py --cluster my-cluster --region us-west-2 --s3-path s3://my-bucket \\\\\n --instance-groups worker-group-1 worker-group-2\n\n # Target specific nodes (instance IDs, EKS names, or Slurm names)\n python hyperpod_issue_report.py --cluster my-cluster --region us-west-2 --s3-path s3://my-bucket \\\\\n --nodes i-abc123 hyperpod-i-044bbf66a68558e87 ip-10-1-104-161\n \"\"\"\n )\n \n parser.add_argument('--cluster', '-c', required=True, help='HyperPod cluster name (EKS or Slurm)')\n parser.add_argument('--region', '-r', help='AWS region (uses default boto3 region if not specified)')\n parser.add_argument('--s3-path', '-s', required=True, help='S3 path for storing reports (e.g., s3://bucket-name/prefix or s3://bucket-name)')\n parser.add_argument('--command', '-cmd', action='append', help='Additional command to execute on nodes (can be specified multiple times)')\n parser.add_argument('--instance-groups', '-g', nargs='+', help='Target specific instance groups (e.g., --instance-groups worker1 worker2)')\n parser.add_argument('--max-workers', '-w', type=int, default=16, help='Maximum concurrent SSM sessions (default: 16, reduce if hitting throttling)')\n parser.add_argument('--nodes', '-n', nargs='+', help='Target 
specific nodes: instance IDs (i-*), EKS node names (hyperpod-i-*), or Slurm node names (ip-*)')\n parser.add_argument('--debug', '-d', action='store_true', help='Enable debug mode')\n \n args = parser.parse_args()\n \n # Validate mutually exclusive options\n if args.instance_groups and args.nodes:\n print(\"Error: --instance-groups and --nodes cannot be used together\")\n sys.exit(1)\n \n try:\n collector = HyperPodIssueReportCollector(\n cluster_name=args.cluster,\n s3_path=args.s3_path,\n region=args.region,\n debug=args.debug\n )\n \n # User-specified commands\n commands = []\n \n # Add any user-specified commands\n if args.command:\n commands.extend(args.command)\n \n collector.collect_reports(\n commands=commands,\n instance_groups=args.instance_groups,\n instance_ids=args.nodes,\n max_workers=args.max_workers\n )\n \n except KeyboardInterrupt:\n print(\"\\n\\nInterrupted by user. Exiting...\")\n sys.exit(1)\n except Exception as e:\n print(f\"\\nError: {e}\")\n if args.debug:\n traceback.print_exc()\n sys.exit(1)\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":68446,"content_sha256":"fd8a0b962ca9cb2e33d35894e36a5d6b551c343707d01ee54b2c81ea229d5bd5"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"HyperPod Issue Report","type":"text"}]},{"type":"paragraph","content":[{"text":"Collect diagnostic logs from HyperPod cluster nodes via SSM, store results in S3. Supports both EKS and Slurm clusters with auto-detection. 
Uses the bundled ","type":"text"},{"text":"scripts/hyperpod_issue_report.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" for reliable parallel collection.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Prerequisites","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"AWS CLI configured with permissions: ","type":"text"},{"text":"sagemaker:DescribeCluster","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"sagemaker:ListClusterNodes","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"ssm:StartSession","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"s3:PutObject","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"s3:GetObject","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"eks:DescribeCluster","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Python 3.8+ and ","type":"text"},{"text":"uv","type":"text","marks":[{"type":"link","attrs":{"href":"https://docs.astral.sh/uv/","title":null}}]},{"text":" (see ","type":"text"},{"text":"uv installation docs","type":"text","marks":[{"type":"link","attrs":{"href":"https://docs.astral.sh/uv/getting-started/installation/","title":null}}]},{"text":" for install options)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"SSM Agent running on target nodes; node IAM roles need ","type":"text"},{"text":"s3:GetObject","type":"text","marks":[{"type":"code_inline"}]},{"text":"/","type":"text"},{"text":"s3:PutObject","type":"text","marks":[{"type":"code_inline"}]},{"text":" on the report bucket","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"For EKS clusters: kubectl installed and configured (see 
Workflow step 2)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Workflow","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"1. Gather Information","type":"text"}]},{"type":"paragraph","content":[{"text":"Collect from the user:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Cluster identifier","type":"text","marks":[{"type":"strong"}]},{"text":" (required): accepts cluster name or full cluster ARN (e.g., ","type":"text"},{"text":"arn:aws:sagemaker:us-west-2:123456789012:cluster/abc123","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"AWS region","type":"text","marks":[{"type":"strong"}]},{"text":" (required unless extractable from ARN)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"S3 path","type":"text","marks":[{"type":"strong"}]},{"text":" for report storage (required, e.g. ","type":"text"},{"text":"s3://bucket/prefix","type":"text","marks":[{"type":"code_inline"}]},{"text":"). 
If the user doesn't have a bucket, create one (e.g., ","type":"text"},{"text":"s3://hyperpod-diagnostics-\u003caccount-id>-\u003cregion>","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Issue description","type":"text","marks":[{"type":"strong"}]},{"text":" (optional)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Target scope","type":"text","marks":[{"type":"strong"}]},{"text":": all nodes, specific instance groups, or specific node IDs (optional)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Additional commands","type":"text","marks":[{"type":"strong"}]},{"text":" to run on nodes (optional)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"2. Verify Environment","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"aws sts get-caller-identity\naws sagemaker describe-cluster --cluster-name \u003cname-or-arn> --region \u003cregion>","type":"text"}]},{"type":"paragraph","content":[{"text":"If the S3 bucket doesn't exist, create it:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"aws s3 mb s3://\u003cbucket-name> --region \u003cregion>","type":"text"}]},{"type":"paragraph","content":[{"text":"For EKS clusters","type":"text","marks":[{"type":"strong"}]},{"text":" (check ","type":"text"},{"text":"Orchestrator.Eks","type":"text","marks":[{"type":"code_inline"}]},{"text":" in describe-cluster output):","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Ensure kubectl is installed (","type":"text"},{"text":"which kubectl","type":"text","marks":[{"type":"code_inline"}]},{"text":"). 
If missing, install it for the current platform.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Configure kubeconfig using the EKS cluster name from the describe-cluster response:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"aws eks update-kubeconfig --name \u003ceks-cluster-name> --region \u003cregion>","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"3. Run the Collection Script","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"uv run scripts/hyperpod_issue_report.py \\\n --cluster \u003ccluster-name-or-arn> \\\n --region \u003cregion> \\\n --s3-path s3://\u003cbucket>[/prefix]","type":"text"}]},{"type":"paragraph","content":[{"text":"Use ","type":"text"},{"text":"--help","type":"text","marks":[{"type":"code_inline"}]},{"text":" for all options including ","type":"text"},{"text":"--instance-groups","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--nodes","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--command","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--max-workers","type":"text","marks":[{"type":"code_inline"}]},{"text":", and ","type":"text"},{"text":"--debug","type":"text","marks":[{"type":"code_inline"}]},{"text":". Note: ","type":"text"},{"text":"--instance-groups","type":"text","marks":[{"type":"code_inline"}]},{"text":" and ","type":"text"},{"text":"--nodes","type":"text","marks":[{"type":"code_inline"}]},{"text":" are mutually exclusive. 
Node identifiers accept instance IDs (","type":"text"},{"text":"i-*","type":"text","marks":[{"type":"code_inline"}]},{"text":"), EKS names (","type":"text"},{"text":"hyperpod-i-*","type":"text","marks":[{"type":"code_inline"}]},{"text":"), or Slurm names (","type":"text"},{"text":"ip-*","type":"text","marks":[{"type":"code_inline"}]},{"text":").","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"4. Present Results","type":"text"}]},{"type":"paragraph","content":[{"text":"After collection, the script shows statistics and offers interactive download. Report the S3 location and offer to:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Download the report locally","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Help analyze collected diagnostics (see ","type":"text"},{"text":"references/collection-details.md","type":"text","marks":[{"type":"link","attrs":{"href":"references/collection-details.md","title":null}}]},{"text":" for what's in each file)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Prepare a summary for AWS Support","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Troubleshooting","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"See ","type":"text"},{"text":"references/troubleshooting.md","type":"text","marks":[{"type":"link","attrs":{"href":"references/troubleshooting.md","title":null}}]},{"text":" for error handling, large cluster tuning, and known 
limitations.","type":"text"}]}]},"metadata":{"date":"2026-06-05","name":"hyperpod-issue-report","author":"@skillopedia","source":{"stars":765,"repo_name":"agent-plugins","origin_url":"https://github.com/awslabs/agent-plugins/blob/HEAD/plugins/sagemaker-ai/skills/hyperpod-issue-report/SKILL.md","repo_owner":"awslabs","body_sha256":"d02223f618f20526b2f602a82b400b3dec427d4607571e2906f1762717999936","cluster_key":"2cbb4edb9cc545b101c64b5460c2d13e80a66087be931c2aefcc2b151a9a9da7","clean_bundle":{"format":"clean-skill-bundle-v1","source":"awslabs/agent-plugins/plugins/sagemaker-ai/skills/hyperpod-issue-report/SKILL.md","attachments":[{"id":"fbf42c0e-9f31-59a2-87d5-26d64cc84b26","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/fbf42c0e-9f31-59a2-87d5-26d64cc84b26/attachment.md","path":"references/collection-details.md","size":3839,"sha256":"f6bed21b40179f80474f517559026b80e61e89783c0af65118e5c2443b57e86f","contentType":"text/markdown; charset=utf-8"},{"id":"4d1a9b31-cc53-5615-8845-607bbb532f1f","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/4d1a9b31-cc53-5615-8845-607bbb532f1f/attachment.md","path":"references/troubleshooting.md","size":3622,"sha256":"4760a6987f8149f77b6e6494ef5f090434bf28ff35e8397636150d167665d84e","contentType":"text/markdown; charset=utf-8"},{"id":"f06b4e32-4470-5ca5-9ea3-1c6ba1da27e3","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/f06b4e32-4470-5ca5-9ea3-1c6ba1da27e3/attachment.py","path":"scripts/hyperpod_issue_report.py","size":68446,"sha256":"fd8a0b962ca9cb2e33d35894e36a5d6b551c343707d01ee54b2c81ea229d5bd5","contentType":"text/x-python; 
charset=utf-8"}],"bundle_sha256":"1d2925a4c3995e2db5b7617b8bd0143f30b11169eb148540d9110eaf8312ef05","attachment_count":3,"text_attachments":3,"attachment_storage":"skillopedia-attachments-v1","binary_attachments":0,"excluded_attachments":[]},"cluster_size":1,"skill_md_path":"plugins/sagemaker-ai/skills/hyperpod-issue-report/SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"devops-infrastructure","category_label":"DevOps"},"exact_dupes_collapsed_into_this":0},"version":"v1","category":"devops-infrastructure","metadata":{"version":"1.0.0"},"import_tag":"clean-skills-v1","description":"Generate comprehensive issue reports from HyperPod clusters (EKS and Slurm) by collecting diagnostic logs and configurations for troubleshooting and AWS Support cases. Use when users need to collect diagnostics from HyperPod cluster nodes, generate issue reports for AWS Support, investigate node failures or performance problems, document cluster state, or create diagnostic snapshots. Triggers on requests involving issue reports, diagnostic collection, support case preparation, or cluster troubleshooting that requires gathering logs and system information from multiple nodes."}},"renderedAt":1782979894081}

HyperPod Issue Report Collect diagnostic logs from HyperPod cluster nodes via SSM, store results in S3. Supports both EKS and Slurm clusters with auto-detection. Uses the bundled scripts/hyperpod_issue_report.py for reliable parallel collection. Prerequisites - AWS CLI configured with permissions: sagemaker:DescribeCluster, sagemaker:ListClusterNodes, ssm:StartSession, s3:PutObject, s3:GetObject, eks:DescribeCluster - Python 3.8+ and uv (see uv installation docs for install options) - SSM Agent running on target nodes; node IAM roles need s3:GetObject/s3:PutObject on the report bucket - For EKS clusters: kubectl installed and configured (see Workflow step 2) Workflow 1. Gather Information Collect from the user: - Cluster identifier (required): accepts clust…