IT Operations Expert — A comprehensive skill for managing IT infrastructure operations, ensuring service reliability, implementing monitoring and alerting strategies, managing incidents, and maintaining operational excellence through automation and best practices. Core Principles: 1. Service Reliability First - Proactive Monitoring: Implement comprehensive observability before incidents occur - Incident Management: Structured response processes with clear escalation paths - SLA/SLO Management: Define and maintain service level objectives aligned with business needs - Continuous Improvement:…

\\n\\t' # Set internal field separator\n\n# Configuration\nreadonly SCRIPT_DIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nreadonly LOG_FILE=\"/var/log/health_check.log\"\nreadonly EMAIL_TO=\"[email protected]\"\nreadonly DISK_THRESHOLD=80\nreadonly MEMORY_THRESHOLD=85\nreadonly CPU_THRESHOLD=90\n\n# Colors for output\nreadonly RED='\\033[0;31m'\nreadonly GREEN='\\033[0;32m'\nreadonly YELLOW='\\033[1;33m'\nreadonly NC='\\033[0m' # No Color\n\n# Logging function\nlog() {\n local level=$1\n shift\n local message=\"$@\"\n local timestamp=$(date '+%Y-%m-%d %H:%M:%S')\n echo \"[${timestamp}] [${level}] ${message}\" | tee -a \"${LOG_FILE}\"\n}\n\n# Error handling\nerror_exit() {\n log \"ERROR\" \"$1\"\n exit 1\n}\n\n# Cleanup on exit\ncleanup() {\n log \"INFO\" \"Cleaning up...\"\n # Remove temp files, etc.\n}\ntrap cleanup EXIT\n\n# Check if running as root\ncheck_root() {\n if [[ $EUID -ne 0 ]]; then\n error_exit \"This script must be run as root\"\n fi\n}\n\n# Check disk space\ncheck_disk_space() {\n log \"INFO\" \"Checking disk space...\"\n local status=0\n\n while read -r line; do\n local usage=$(echo \"$line\" | awk '{print $5}' | sed 's/%//')\n local mount=$(echo \"$line\" | awk '{print $6}')\n\n if [[ $usage -ge $DISK_THRESHOLD ]]; then\n log \"WARN\" \"Disk usage on ${mount} is ${usage}% (threshold: ${DISK_THRESHOLD}%)\"\n status=1\n else\n log \"INFO\" \"Disk usage on ${mount} is ${usage}% - OK\"\n fi\n done \u003c \u003c(df -h | grep -vE '^Filesystem|tmpfs|cdrom' | awk '{print $0}')\n\n return $status\n}\n\n# Check memory usage\ncheck_memory() {\n log \"INFO\" \"Checking memory usage...\"\n\n local total_mem=$(free -m | awk '/^Mem:/{print $2}')\n local used_mem=$(free -m | awk '/^Mem:/{print $3}')\n local usage_pct=$((used_mem * 100 / total_mem))\n\n if [[ $usage_pct -ge $MEMORY_THRESHOLD ]]; then\n log \"WARN\" \"Memory usage is ${usage_pct}% (threshold: ${MEMORY_THRESHOLD}%)\"\n return 1\n else\n log \"INFO\" \"Memory usage is ${usage_pct}% - 
OK\"\n return 0\n fi\n}\n\n# Check CPU load\ncheck_cpu() {\n log \"INFO\" \"Checking CPU load...\"\n\n local load_avg=$(uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | sed 's/,//')\n local cpu_count=$(nproc)\n local load_pct=$(echo \"scale=2; ($load_avg / $cpu_count) * 100\" | bc)\n local load_pct_int=${load_pct%.*}\n\n if [[ $load_pct_int -ge $CPU_THRESHOLD ]]; then\n log \"WARN\" \"CPU load is ${load_pct}% (threshold: ${CPU_THRESHOLD}%)\"\n return 1\n else\n log \"INFO\" \"CPU load is ${load_pct}% - OK\"\n return 0\n fi\n}\n\n# Check critical services\ncheck_services() {\n log \"INFO\" \"Checking critical services...\"\n local services=(\"sshd\" \"nginx\" \"postgresql\")\n local status=0\n\n for service in \"${services[@]}\"; do\n if systemctl is-active --quiet \"$service\"; then\n log \"INFO\" \"Service ${service} is running - OK\"\n else\n log \"ERROR\" \"Service ${service} is NOT running\"\n status=1\n fi\n done\n\n return $status\n}\n\n# Check network connectivity\ncheck_network() {\n log \"INFO\" \"Checking network connectivity...\"\n local hosts=(\"8.8.8.8\" \"1.1.1.1\")\n local status=0\n\n for host in \"${hosts[@]}\"; do\n if ping -c 3 -W 5 \"$host\" &>/dev/null; then\n log \"INFO\" \"Network connectivity to ${host} - OK\"\n else\n log \"ERROR\" \"Cannot reach ${host}\"\n status=1\n fi\n done\n\n return $status\n}\n\n# Generate report\ngenerate_report() {\n local overall_status=$1\n\n cat \u003c\u003cEOF > /tmp/health_report.txt\nServer Health Check Report\n==========================\nHostname: $(hostname)\nDate: $(date)\nStatus: $([[ $overall_status -eq 0 ]] && echo \"HEALTHY\" || echo \"ISSUES DETECTED\")\n\nDetails:\n$(tail -n 50 \"${LOG_FILE}\")\n\nEOF\n\n # Email report if issues detected\n if [[ $overall_status -ne 0 ]]; then\n mail -s \"ALERT: Health Check Failed on $(hostname)\" \"${EMAIL_TO}\" \u003c /tmp/health_report.txt\n fi\n}\n\n# Main execution\nmain() {\n log \"INFO\" \"Starting server health check...\"\n\n local 
overall_status=0\n\n check_root\n check_disk_space || overall_status=1\n check_memory || overall_status=1\n check_cpu || overall_status=1\n check_services || overall_status=1\n check_network || overall_status=1\n\n if [[ $overall_status -eq 0 ]]; then\n log \"INFO\" \"All health checks passed\"\n else\n log \"WARN\" \"Some health checks failed\"\n fi\n\n generate_report $overall_status\n\n log \"INFO\" \"Health check complete\"\n exit $overall_status\n}\n\n# Run main function\nmain \"$@\"\n```\n\n### Python Automation Scripts\n\n```python\n#!/usr/bin/env python3\n\"\"\"\nUser account management automation\n\nAutomates user provisioning across multiple systems\n\"\"\"\n\nimport argparse\nimport logging\nimport sys\nimport subprocess\nimport json\nfrom typing import Dict, List, Optional\nfrom pathlib import Path\nimport yaml\n\n# Configure logging\nlogging.basicConfig(\n level=logging.INFO,\n format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',\n handlers=[\n logging.FileHandler('/var/log/user_management.log'),\n logging.StreamHandler(sys.stdout)\n ]\n)\nlogger = logging.getLogger(__name__)\n\n\nclass UserManagementError(Exception):\n \"\"\"Custom exception for user management errors\"\"\"\n pass\n\n\nclass UserManager:\n \"\"\"Manages user accounts across systems\"\"\"\n\n def __init__(self, config_file: str):\n self.config = self._load_config(config_file)\n self.dry_run = False\n\n def _load_config(self, config_file: str) -> Dict:\n \"\"\"Load configuration from YAML file\"\"\"\n try:\n with open(config_file, 'r') as f:\n return yaml.safe_load(f)\n except Exception as e:\n logger.error(f\"Failed to load config: {e}\")\n raise UserManagementError(f\"Config load failed: {e}\")\n\n def _run_command(self, cmd: List[str], check: bool = True) -> subprocess.CompletedProcess:\n \"\"\"Execute shell command with error handling\"\"\"\n logger.debug(f\"Executing command: {' '.join(cmd)}\")\n\n if self.dry_run:\n logger.info(f\"[DRY RUN] Would execute: {' 
'.join(cmd)}\")\n return subprocess.CompletedProcess(cmd, 0, '', '')\n\n try:\n result = subprocess.run(\n cmd,\n check=check,\n capture_output=True,\n text=True,\n timeout=30\n )\n return result\n except subprocess.CalledProcessError as e:\n logger.error(f\"Command failed: {e.stderr}\")\n raise UserManagementError(f\"Command failed: {e}\")\n except subprocess.TimeoutExpired:\n logger.error(f\"Command timed out: {' '.join(cmd)}\")\n raise UserManagementError(\"Command timeout\")\n\n def user_exists(self, username: str) -> bool:\n \"\"\"Check if user exists\"\"\"\n result = self._run_command(['id', username], check=False)\n return result.returncode == 0\n\n def create_user(\n self,\n username: str,\n full_name: str,\n email: str,\n groups: Optional[List[str]] = None,\n ssh_public_key: Optional[str] = None\n ) -> None:\n \"\"\"Create user account with specified attributes\"\"\"\n\n logger.info(f\"Creating user: {username}\")\n\n if self.user_exists(username):\n logger.warning(f\"User {username} already exists\")\n return\n\n # Create user\n cmd = ['useradd', '-m', '-s', '/bin/bash']\n\n if full_name:\n cmd.extend(['-c', full_name])\n\n if groups:\n cmd.extend(['-G', ','.join(groups)])\n\n cmd.append(username)\n\n self._run_command(cmd)\n\n # Set up SSH key\n if ssh_public_key:\n self._setup_ssh_key(username, ssh_public_key)\n\n # Send welcome email (pseudo-code)\n # self._send_welcome_email(username, email)\n\n logger.info(f\"User {username} created successfully\")\n\n def _setup_ssh_key(self, username: str, public_key: str) -> None:\n \"\"\"Set up SSH public key for user\"\"\"\n\n logger.info(f\"Setting up SSH key for {username}\")\n\n # Create .ssh directory\n ssh_dir = Path(f\"/home/{username}/.ssh\")\n ssh_dir.mkdir(mode=0o700, exist_ok=True)\n\n # Write authorized_keys\n authorized_keys = ssh_dir / \"authorized_keys\"\n authorized_keys.write_text(public_key + \"\\n\")\n authorized_keys.chmod(0o600)\n\n # Set ownership\n self._run_command(['chown', '-R', 
f'{username}:{username}', str(ssh_dir)])\n\n def delete_user(self, username: str, remove_home: bool = False) -> None:\n \"\"\"Delete user account\"\"\"\n\n logger.info(f\"Deleting user: {username}\")\n\n if not self.user_exists(username):\n logger.warning(f\"User {username} does not exist\")\n return\n\n cmd = ['userdel']\n if remove_home:\n cmd.append('-r')\n cmd.append(username)\n\n self._run_command(cmd)\n\n logger.info(f\"User {username} deleted successfully\")\n\n def modify_user_groups(self, username: str, groups: List[str]) -> None:\n \"\"\"Modify user's group membership\"\"\"\n\n logger.info(f\"Modifying groups for {username}\")\n\n if not self.user_exists(username):\n raise UserManagementError(f\"User {username} does not exist\")\n\n # Set supplementary groups\n cmd = ['usermod', '-G', ','.join(groups), username]\n self._run_command(cmd)\n\n logger.info(f\"Groups updated for {username}\")\n\n def list_users(self) -> List[Dict[str, str]]:\n \"\"\"List all users on the system\"\"\"\n\n logger.info(\"Listing all users\")\n\n users = []\n with open('/etc/passwd', 'r') as f:\n for line in f:\n parts = line.strip().split(':')\n if int(parts[2]) >= 1000: # Only regular users\n users.append({\n 'username': parts[0],\n 'uid': parts[2],\n 'home': parts[5],\n 'shell': parts[6]\n })\n\n return users\n\n def bulk_create_users(self, users_file: str) -> None:\n \"\"\"Create multiple users from CSV file\"\"\"\n\n logger.info(f\"Bulk creating users from {users_file}\")\n\n import csv\n\n with open(users_file, 'r') as f:\n reader = csv.DictReader(f)\n for row in reader:\n try:\n self.create_user(\n username=row['username'],\n full_name=row.get('full_name', ''),\n email=row.get('email', ''),\n groups=row.get('groups', '').split(',') if row.get('groups') else None,\n ssh_public_key=row.get('ssh_public_key')\n )\n except Exception as e:\n logger.error(f\"Failed to create user {row['username']}: {e}\")\n\n\ndef main():\n parser = argparse.ArgumentParser(description='User 
account management automation')\n parser.add_argument('--config', default='/etc/user_mgmt/config.yaml', help='Config file path')\n parser.add_argument('--dry-run', action='store_true', help='Dry run mode')\n\n subparsers = parser.add_subparsers(dest='command', help='Command to execute')\n\n # Create user\n create_parser = subparsers.add_parser('create', help='Create user')\n create_parser.add_argument('username', help='Username')\n create_parser.add_argument('--full-name', help='Full name')\n create_parser.add_argument('--email', help='Email address')\n create_parser.add_argument('--groups', help='Comma-separated groups')\n create_parser.add_argument('--ssh-key', help='SSH public key')\n\n # Delete user\n delete_parser = subparsers.add_parser('delete', help='Delete user')\n delete_parser.add_argument('username', help='Username')\n delete_parser.add_argument('--remove-home', action='store_true', help='Remove home directory')\n\n # List users\n list_parser = subparsers.add_parser('list', help='List users')\n\n # Bulk create\n bulk_parser = subparsers.add_parser('bulk-create', help='Bulk create users')\n bulk_parser.add_argument('csv_file', help='CSV file with user data')\n\n args = parser.parse_args()\n\n # Initialize manager\n manager = UserManager(args.config)\n manager.dry_run = args.dry_run\n\n # Execute command\n try:\n if args.command == 'create':\n groups = args.groups.split(',') if args.groups else None\n manager.create_user(\n args.username,\n args.full_name or '',\n args.email or '',\n groups,\n args.ssh_key\n )\n\n elif args.command == 'delete':\n manager.delete_user(args.username, args.remove_home)\n\n elif args.command == 'list':\n users = manager.list_users()\n for user in users:\n print(f\"{user['username']}\\t{user['uid']}\\t{user['home']}\")\n\n elif args.command == 'bulk-create':\n manager.bulk_create_users(args.csv_file)\n\n else:\n parser.print_help()\n sys.exit(1)\n\n except UserManagementError as e:\n logger.error(f\"Operation failed: {e}\")\n 
sys.exit(1)\n\n\nif __name__ == '__main__':\n main()\n```\n\n## Configuration Management\n\n### Ansible Playbooks\n\n```yaml\n# playbook.yml - Web server setup playbook\n\n---\n- name: Configure web servers\n hosts: webservers\n become: yes\n vars:\n nginx_port: 80\n app_user: webapp\n app_path: /var/www/myapp\n\n tasks:\n # System updates\n - name: Update apt cache\n apt:\n update_cache: yes\n cache_valid_time: 3600\n when: ansible_os_family == \"Debian\"\n\n # Install packages\n - name: Install required packages\n apt:\n name:\n - nginx\n - python3-pip\n - git\n - ufw\n state: present\n\n # Create application user\n - name: Create application user\n user:\n name: \"{{ app_user }}\"\n shell: /bin/bash\n create_home: yes\n system: no\n\n # Configure firewall\n - name: Configure UFW\n ufw:\n rule: allow\n port: \"{{ item }}\"\n proto: tcp\n loop:\n - 22\n - 80\n - 443\n\n - name: Enable UFW\n ufw:\n state: enabled\n policy: deny\n\n # Deploy application\n - name: Create application directory\n file:\n path: \"{{ app_path }}\"\n state: directory\n owner: \"{{ app_user }}\"\n group: \"{{ app_user }}\"\n mode: '0755'\n\n - name: Clone application repository\n git:\n repo: 'https://github.com/example/myapp.git'\n dest: \"{{ app_path }}\"\n version: main\n become_user: \"{{ app_user }}\"\n notify: restart nginx\n\n # Configure Nginx\n - name: Deploy Nginx configuration\n template:\n src: templates/nginx.conf.j2\n dest: /etc/nginx/sites-available/myapp\n owner: root\n group: root\n mode: '0644'\n notify: restart nginx\n\n - name: Enable Nginx site\n file:\n src: /etc/nginx/sites-available/myapp\n dest: /etc/nginx/sites-enabled/myapp\n state: link\n notify: restart nginx\n\n - name: Remove default Nginx site\n file:\n path: /etc/nginx/sites-enabled/default\n state: absent\n notify: restart nginx\n\n # Install application dependencies\n - name: Install Python dependencies\n pip:\n requirements: \"{{ app_path }}/requirements.txt\"\n virtualenv: \"{{ app_path }}/venv\"\n 
virtualenv_command: python3 -m venv\n become_user: \"{{ app_user }}\"\n\n # Configure systemd service\n - name: Deploy systemd service file\n template:\n src: templates/myapp.service.j2\n dest: /etc/systemd/system/myapp.service\n owner: root\n group: root\n mode: '0644'\n notify: restart myapp\n\n - name: Enable and start application service\n systemd:\n name: myapp\n enabled: yes\n state: started\n daemon_reload: yes\n\n # Monitoring\n - name: Install node_exporter\n include_role:\n name: node_exporter\n\n handlers:\n - name: restart nginx\n service:\n name: nginx\n state: restarted\n\n - name: restart myapp\n service:\n name: myapp\n state: restarted\n\n# Nginx template (templates/nginx.conf.j2)\n---\nserver {\n listen {{ nginx_port }};\n server_name {{ ansible_hostname }};\n\n location / {\n proxy_pass http://127.0.0.1:8000;\n proxy_set_header Host $host;\n proxy_set_header X-Real-IP $remote_addr;\n proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n }\n\n location /static {\n alias {{ app_path }}/static;\n }\n}\n```\n\n### Ansible Inventory\n\n```ini\n# inventory/production.ini\n\n[webservers]\nweb01.example.com ansible_host=10.0.10.20\nweb02.example.com ansible_host=10.0.10.21\nweb03.example.com ansible_host=10.0.10.22\n\n[databases]\ndb01.example.com ansible_host=10.0.30.10\ndb02.example.com ansible_host=10.0.30.11\n\n[monitoring]\nmon01.example.com ansible_host=10.0.99.10\n\n[all:vars]\nansible_user=ansible\nansible_become=yes\nansible_python_interpreter=/usr/bin/python3\n\n[webservers:vars]\nnginx_port=80\nenvironment=production\n```\n\n### Running Ansible\n\n```bash\n# Check syntax\nansible-playbook playbook.yml --syntax-check\n\n# Dry run (check mode)\nansible-playbook -i inventory/production.ini playbook.yml --check\n\n# Run playbook\nansible-playbook -i inventory/production.ini playbook.yml\n\n# Run specific tags\nansible-playbook -i inventory/production.ini playbook.yml --tags \"nginx,app\"\n\n# Limit to specific hosts\nansible-playbook -i 
inventory/production.ini playbook.yml --limit web01.example.com\n\n# Run ad-hoc command\nansible webservers -i inventory/production.ini -m shell -a \"uptime\"\nansible webservers -i inventory/production.ini -m apt -a \"name=nginx state=latest\" --become\n```\n\n## Orchestration Tools\n\n### Terraform for Infrastructure Orchestration\n\nSee [infrastructure.md](infrastructure.md#infrastructure-as-code) for detailed Terraform examples.\n\n### Kubernetes Operators (Advanced Automation)\n\n```yaml\n# Custom Kubernetes Operator for database backups\n# backup-operator.yaml\n\napiVersion: v1\nkind: ServiceAccount\nmetadata:\n name: backup-operator\n namespace: operators\n\n---\napiVersion: rbac.authorization.k8s.io/v1\nkind: ClusterRole\nmetadata:\n name: backup-operator\nrules:\n - apiGroups: [\"\"]\n resources: [\"pods\", \"secrets\"]\n verbs: [\"get\", \"list\", \"create\"]\n - apiGroups: [\"batch\"]\n resources: [\"cronjobs\", \"jobs\"]\n verbs: [\"get\", \"list\", \"create\", \"update\", \"delete\"]\n\n---\napiVersion: rbac.authorization.k8s.io/v1\nkind: ClusterRoleBinding\nmetadata:\n name: backup-operator\nroleRef:\n apiGroup: rbac.authorization.k8s.io\n kind: ClusterRole\n name: backup-operator\nsubjects:\n - kind: ServiceAccount\n name: backup-operator\n namespace: operators\n\n---\napiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: backup-operator\n namespace: operators\nspec:\n replicas: 1\n selector:\n matchLabels:\n app: backup-operator\n template:\n metadata:\n labels:\n app: backup-operator\n spec:\n serviceAccountName: backup-operator\n containers:\n - name: operator\n image: myregistry/backup-operator:latest\n env:\n - name: WATCH_NAMESPACE\n value: \"\"\n - name: POD_NAME\n valueFrom:\n fieldRef:\n fieldPath: metadata.name\n - name: OPERATOR_NAME\n value: \"backup-operator\"\n```\n\n## CI/CD for Infrastructure\n\n### GitOps Workflow\n\n```yaml\n# GitOps workflow for infrastructure changes\n\nStep 1: Developer makes change\n - Edit Terraform/Ansible 
files\n - Commit to feature branch\n - Open pull request\n\nStep 2: Automated validation (CI)\n - terraform fmt -check (code formatting)\n - terraform validate (syntax check)\n - tflint (linting)\n - terraform plan (dry run)\n - Security scan (Checkov, tfsec)\n - Cost estimation (Infracost)\n\nStep 3: Peer review\n - Code review by team member\n - Review terraform plan output\n - Approve or request changes\n\nStep 4: Merge to main\n - PR approved and merged\n - Triggers deployment pipeline\n\nStep 5: Automated deployment (CD)\n - terraform apply (auto-approved for specific changes)\n - Or manual approval for high-risk changes\n - Post-deployment validation\n - Notify team in Slack\n\nStep 6: Monitoring\n - Monitor for issues\n - Auto-rollback on failure\n - Update status page\n```\n\n### GitHub Actions for Infrastructure\n\n```yaml\n# .github/workflows/terraform.yml\n\nname: Terraform CI/CD\n\non:\n pull_request:\n branches: [main]\n paths:\n - 'terraform/**'\n push:\n branches: [main]\n paths:\n - 'terraform/**'\n\nenv:\n TF_VERSION: 1.6.0\n AWS_REGION: us-east-1\n\njobs:\n validate:\n runs-on: ubuntu-latest\n steps:\n - name: Checkout code\n uses: actions/checkout@v3\n\n - name: Setup Terraform\n uses: hashicorp/setup-terraform@v2\n with:\n terraform_version: ${{ env.TF_VERSION }}\n\n - name: Terraform Format Check\n run: terraform fmt -check -recursive\n working-directory: ./terraform\n\n - name: Terraform Init\n run: terraform init\n working-directory: ./terraform\n\n - name: Terraform Validate\n run: terraform validate\n working-directory: ./terraform\n\n - name: TFLint\n uses: terraform-linters/setup-tflint@v3\n with:\n tflint_version: latest\n\n - name: Run TFLint\n run: tflint --recursive\n working-directory: ./terraform\n\n plan:\n runs-on: ubuntu-latest\n needs: validate\n if: github.event_name == 'pull_request'\n steps:\n - name: Checkout code\n uses: actions/checkout@v3\n\n - name: Setup Terraform\n uses: hashicorp/setup-terraform@v2\n with:\n 
terraform_version: ${{ env.TF_VERSION }}\n\n - name: Configure AWS Credentials\n uses: aws-actions/configure-aws-credentials@v2\n with:\n aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}\n aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}\n aws-region: ${{ env.AWS_REGION }}\n\n - name: Terraform Init\n run: terraform init\n working-directory: ./terraform\n\n - name: Terraform Plan\n run: terraform plan -out=tfplan\n working-directory: ./terraform\n\n - name: Comment PR with Plan\n uses: actions/github-script@v6\n with:\n script: |\n const fs = require('fs');\n const plan = fs.readFileSync('./terraform/tfplan', 'utf8');\n github.rest.issues.createComment({\n issue_number: context.issue.number,\n owner: context.repo.owner,\n repo: context.repo.repo,\n body: `## Terraform Plan\\n\\`\\`\\`\\n${plan}\\n\\`\\`\\``\n });\n\n apply:\n runs-on: ubuntu-latest\n needs: validate\n if: github.event_name == 'push' && github.ref == 'refs/heads/main'\n steps:\n - name: Checkout code\n uses: actions/checkout@v3\n\n - name: Setup Terraform\n uses: hashicorp/setup-terraform@v2\n with:\n terraform_version: ${{ env.TF_VERSION }}\n\n - name: Configure AWS Credentials\n uses: aws-actions/configure-aws-credentials@v2\n with:\n aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}\n aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}\n aws-region: ${{ env.AWS_REGION }}\n\n - name: Terraform Init\n run: terraform init\n working-directory: ./terraform\n\n - name: Terraform Apply\n run: terraform apply -auto-approve\n working-directory: ./terraform\n\n - name: Notify Slack\n uses: 8398a7/action-slack@v3\n with:\n status: ${{ job.status }}\n text: 'Terraform applied successfully'\n webhook_url: ${{ secrets.SLACK_WEBHOOK }}\n if: always()\n```\n\n## Runbook Automation\n\n### ChatOps Integration\n\n```python\n# Slack bot for runbook automation\n\nfrom slack_bolt import App\nfrom slack_bolt.adapter.socket_mode import SocketModeHandler\nimport subprocess\nimport logging\n\napp = 
App(token=\"xoxb-your-token\")\n\nlogging.basicConfig(level=logging.INFO)\n\n# Command: restart service\n@app.command(\"/restart-service\")\ndef restart_service_command(ack, command, respond):\n ack()\n\n service_name = command['text']\n\n # Validate service name (whitelist)\n allowed_services = ['nginx', 'postgresql', 'redis']\n\n if service_name not in allowed_services:\n respond(f\"❌ Service '{service_name}' is not allowed. Allowed: {', '.join(allowed_services)}\")\n return\n\n # Confirm with user\n respond(f\"Restarting service: {service_name}...\")\n\n try:\n # Execute restart command\n result = subprocess.run(\n ['ansible', 'webservers', '-m', 'service', '-a', f'name={service_name} state=restarted'],\n capture_output=True,\n text=True,\n timeout=30\n )\n\n if result.returncode == 0:\n respond(f\"✅ Service {service_name} restarted successfully\\n```{result.stdout}```\")\n else:\n respond(f\"❌ Failed to restart {service_name}\\n```{result.stderr}```\")\n\n except Exception as e:\n respond(f\"❌ Error: {str(e)}\")\n\n# Command: check server health\n@app.command(\"/health-check\")\ndef health_check_command(ack, command, respond):\n ack()\n\n hostname = command['text'] or 'all'\n\n respond(f\"Running health check on {hostname}...\")\n\n try:\n result = subprocess.run(\n ['ansible', hostname, '-m', 'shell', '-a', '/opt/scripts/health_check.sh'],\n capture_output=True,\n text=True,\n timeout=60\n )\n\n respond(f\"```{result.stdout}```\")\n\n except Exception as e:\n respond(f\"❌ Error: {str(e)}\")\n\n# Interactive approval flow\n@app.action(\"approve_deployment\")\ndef handle_approval(ack, body, client):\n ack()\n\n user = body[\"user\"][\"id\"]\n deployment_id = body[\"actions\"][0][\"value\"]\n\n # Execute deployment\n result = execute_deployment(deployment_id)\n\n client.chat_postMessage(\n channel=body[\"channel\"][\"id\"],\n text=f\"\u003c@{user}> approved deployment {deployment_id}. 
Status: {result}\"\n )\n\ndef execute_deployment(deployment_id):\n # Deployment logic here\n return \"Success\"\n\nif __name__ == \"__main__\":\n handler = SocketModeHandler(app, \"xapp-your-app-token\")\n handler.start()\n```\n\n## Self-Healing Systems\n\n### Auto-Remediation Framework\n\n```python\n# Self-healing automation framework\n\nimport time\nimport logging\nfrom prometheus_client import start_http_server, Counter, Gauge\nimport requests\n\nlogging.basicConfig(level=logging.INFO)\nlogger = logging.getLogger(__name__)\n\n# Metrics\nremediation_counter = Counter('remediations_total', 'Total remediations', ['type', 'status'])\nsystem_health = Gauge('system_health', 'System health status (1=healthy, 0=unhealthy)')\n\n\nclass SelfHealingAgent:\n \"\"\"Auto-remediation agent that monitors and fixes common issues\"\"\"\n\n def __init__(self):\n self.checks = [\n self.check_disk_space,\n self.check_service_health,\n self.check_memory_usage,\n ]\n\n def check_disk_space(self):\n \"\"\"Check disk space and clean up if needed\"\"\"\n import shutil\n\n stat = shutil.disk_usage('/')\n usage_pct = (stat.used / stat.total) * 100\n\n if usage_pct > 85:\n logger.warning(f\"Disk usage high: {usage_pct:.1f}%\")\n self.cleanup_disk()\n remediation_counter.labels(type='disk_cleanup', status='executed').inc()\n return False\n\n return True\n\n def cleanup_disk(self):\n \"\"\"Clean up disk space\"\"\"\n import subprocess\n\n logger.info(\"Cleaning up disk space...\")\n\n # Clean apt cache\n subprocess.run(['apt-get', 'clean'], check=False)\n\n # Clean old logs\n subprocess.run(['find', '/var/log', '-name', '*.gz', '-mtime', '+30', '-delete'], check=False)\n\n # Clean temp files\n subprocess.run(['find', '/tmp', '-mtime', '+7', '-delete'], check=False)\n\n logger.info(\"Disk cleanup complete\")\n\n def check_service_health(self):\n \"\"\"Check if critical services are running\"\"\"\n import subprocess\n\n services = ['nginx', 'postgresql']\n all_healthy = True\n\n for service in 
services:\n result = subprocess.run(\n ['systemctl', 'is-active', service],\n capture_output=True,\n text=True\n )\n\n if result.returncode != 0:\n logger.warning(f\"Service {service} is not running\")\n self.restart_service(service)\n all_healthy = False\n\n return all_healthy\n\n def restart_service(self, service_name):\n \"\"\"Restart a failed service\"\"\"\n import subprocess\n\n logger.info(f\"Restarting service: {service_name}\")\n\n try:\n subprocess.run(['systemctl', 'restart', service_name], check=True, timeout=30)\n logger.info(f\"Service {service_name} restarted successfully\")\n remediation_counter.labels(type='service_restart', status='success').inc()\n\n except Exception as e:\n logger.error(f\"Failed to restart {service_name}: {e}\")\n remediation_counter.labels(type='service_restart', status='failed').inc()\n\n def check_memory_usage(self):\n \"\"\"Check memory usage and kill memory hogs if needed\"\"\"\n import psutil\n\n mem = psutil.virtual_memory()\n usage_pct = mem.percent\n\n if usage_pct > 90:\n logger.warning(f\"Memory usage critical: {usage_pct:.1f}%\")\n self.kill_memory_hogs()\n remediation_counter.labels(type='memory_cleanup', status='executed').inc()\n return False\n\n return True\n\n def kill_memory_hogs(self):\n \"\"\"Kill processes using excessive memory\"\"\"\n import psutil\n\n logger.info(\"Identifying memory hogs...\")\n\n # Get processes sorted by memory usage\n processes = []\n for proc in psutil.process_iter(['pid', 'name', 'memory_percent']):\n try:\n processes.append(proc.info)\n except (psutil.NoSuchProcess, psutil.AccessDenied):\n pass\n\n processes.sort(key=lambda x: x['memory_percent'], reverse=True)\n\n # Kill top memory consumer (with safety checks)\n for proc in processes[:3]: # Check top 3\n # Don't kill critical processes\n if proc['name'] in ['systemd', 'init', 'sshd']:\n continue\n\n if proc['memory_percent'] > 20: # Using more than 20% memory\n logger.warning(f\"Killing memory hog: {proc['name']} (PID 
{proc['pid']})\")\n try:\n psutil.Process(proc['pid']).kill()\n break\n except Exception as e:\n logger.error(f\"Failed to kill process: {e}\")\n\n def run(self, interval=60):\n \"\"\"Run continuous health checks\"\"\"\n logger.info(\"Starting self-healing agent...\")\n\n while True:\n try:\n all_healthy = True\n\n for check in self.checks:\n if not check():\n all_healthy = False\n\n system_health.set(1 if all_healthy else 0)\n\n time.sleep(interval)\n\n except Exception as e:\n logger.error(f\"Error in health check loop: {e}\")\n time.sleep(interval)\n\n\nif __name__ == '__main__':\n # Start Prometheus metrics server\n start_http_server(9101)\n\n # Run agent\n agent = SelfHealingAgent()\n agent.run(interval=60)\n```\n\n## Toil Reduction\n\n### Measuring Toil\n\n```yaml\nToil Definition (Google SRE):\n - Manual: Requires human intervention\n - Repetitive: Done over and over\n - Automatable: Can be automated\n - Tactical: Interrupt-driven, reactive\n - No enduring value: Doesn't improve system\n - Scales linearly: Grows with service\n\nExamples of Toil:\n - Manually restarting services\n - Copying files between servers\n - Running deployment scripts manually\n - Responding to recurring alerts\n - Manual user provisioning\n - Generating reports manually\n\nTarget: \u003c 50% of time on toil (Google SRE recommendation)\n```\n\n### Toil Reduction Strategy\n\n```yaml\nStep 1: Identify Toil (Weekly)\n - Track time spent on tasks\n - Categorize: Toil vs Engineering work\n - Prioritize by time impact\n\n Tool: Time tracking spreadsheet\n Columns: Task, Time Spent, Frequency, Automatable?, Priority\n\nStep 2: Quantify Impact\n - Calculate time spent per month\n - Estimate automation effort\n - Calculate ROI (see ROI calculator above)\n\nStep 3: Automate High-Impact Toil\n - Start with highest ROI\n - Build automation incrementally\n - Test thoroughly\n - Document automation\n\nStep 4: Measure Improvement\n - Track toil percentage monthly\n - Celebrate wins\n - Share 
automations across team\n\nStep 5: Prevent New Toil\n - Question new manual processes\n - Design for automation from start\n - Code review includes automation check\n```\n\nThis comprehensive automation guide provides everything needed to reduce toil and improve operational efficiency through automation.\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":40343,"content_sha256":"4903c9d1a65ecfd96ff0708a8bda5322d49440d20683756719d5198659992def"},{"filename":"reference/backup-recovery.md","content":"# Backup and Disaster Recovery\n\nComprehensive guide to backup strategies, disaster recovery planning, business continuity, and data protection for IT operations.\n\n## Table of Contents\n- [Backup Strategy](#backup-strategy)\n- [Backup Types](#backup-types)\n- [Backup Tools](#backup-tools)\n- [Disaster Recovery Planning](#disaster-recovery-planning)\n- [Business Continuity](#business-continuity)\n- [Recovery Testing](#recovery-testing)\n- [Cloud Backup Solutions](#cloud-backup-solutions)\n- [Database Backups](#database-backups)\n- [Backup Monitoring](#backup-monitoring)\n\n## Backup Strategy\n\n### 3-2-1 Backup Rule\n\n```yaml\nThe Gold Standard: 3-2-1 Rule\n\n3 Copies of Data:\n - 1 Production copy\n - 2 Backup copies\n\n2 Different Media Types:\n - Local storage (NAS, SAN)\n - Cloud storage (S3, Azure Blob)\n - Or: Disk + Tape\n\n1 Offsite Copy:\n - Geographic separation\n - Protection against site disasters\n - Cloud storage or remote datacenter\n\nExample Implementation:\n Production: Database server (primary)\n Backup 1: Local NAS (hourly snapshots)\n Backup 2: Cloud storage S3 (daily backups)\n Result: 3 copies, 2 media (disk + cloud), 1 offsite (cloud)\n```\n\n### Backup Policy Framework\n\n```yaml\nRPO (Recovery Point Objective):\n Definition: Maximum acceptable data loss (time)\n Question: \"How much data can we afford to lose?\"\n\n Examples:\n Critical databases: RPO = 15 minutes (need transaction log backups)\n File servers: RPO = 
24 hours (daily backups acceptable)\n Development servers: RPO = 7 days (weekly backups)\n\nRTO (Recovery Time Objective):\n Definition: Maximum acceptable downtime (time)\n Question: \"How quickly must we recover?\"\n\n Examples:\n E-commerce site: RTO = 1 hour (hot standby, fast recovery)\n Internal tools: RTO = 8 hours (restore from backup)\n Archive data: RTO = 72 hours (restore from tape/glacier)\n\nRetention Policy:\n Daily backups: Keep 7 days\n Weekly backups: Keep 4 weeks\n Monthly backups: Keep 12 months\n Yearly backups: Keep 7 years (compliance)\n\n Grandfather-Father-Son (GFS) Rotation:\n Son: Daily backups (7 days)\n Father: Weekly backups (4 weeks)\n Grandfather: Monthly backups (12 months)\n```\n\n### Backup Matrix\n\n| System | Criticality | RPO | RTO | Backup Frequency | Retention | Method |\n|--------|-------------|-----|-----|------------------|-----------|--------|\n| Production Database | Critical | 15 min | 1 hour | Continuous (transaction logs) + Daily full | 30 days | Replication + Snapshots |\n| Application Servers | High | 1 hour | 4 hours | Hourly incremental | 7 days | Agent-based |\n| File Servers | Medium | 24 hours | 8 hours | Daily | 30 days | Filesystem snapshots |\n| Development | Low | 7 days | 24 hours | Weekly | 14 days | Full backup |\n| Workstations | Low | N/A | N/A | User responsibility | N/A | Cloud sync |\n\n## Backup Types\n\n### Full Backup\n\n```yaml\nDescription:\n - Complete copy of all data\n - Self-contained (no dependencies)\n\nPros:\n - Simplest to restore (single backup set)\n - Fastest restore time\n - No dependency on other backups\n\nCons:\n - Slowest backup time\n - Largest storage requirement\n - Most network bandwidth\n\nUse Case:\n - Weekly or monthly baseline\n - Small datasets (\u003c 1 TB)\n - High-priority systems\n\nTime Required:\n - 1 TB database: 2-4 hours (to disk)\n - 10 TB file server: 20-40 hours\n```\n\n### Incremental Backup\n\n```yaml\nDescription:\n - Only backs up changes since last 
backup (full or incremental)\n - Creates chain of dependencies\n\nPros:\n - Fastest backup time\n - Smallest storage requirement\n - Least network bandwidth\n\nCons:\n - Slowest restore (need full + all incrementals)\n - Higher restore complexity\n - Chain dependency (missing link = data loss)\n\nUse Case:\n - Daily/hourly backups\n - Large datasets with small changes\n - Bandwidth-constrained environments\n\nTime Required:\n - Daily changes (10 GB): 5-15 minutes\n\nRestore Process:\n 1. Restore full backup (baseline)\n 2. Apply incremental 1\n 3. Apply incremental 2\n 4. ... apply all incrementals in order\n```\n\n### Differential Backup\n\n```yaml\nDescription:\n - Backs up changes since last FULL backup\n - Each differential is cumulative\n\nPros:\n - Faster restore than incremental (only need full + latest differential)\n - Simpler dependency chain\n - Easier to manage than incremental\n\nCons:\n - Slower than incremental (growing backup size)\n - More storage than incremental\n\nUse Case:\n - Compromise between full and incremental\n - Weekly full + daily differentials\n\nTime Required:\n - Day 1 differential: 10 GB (15 min)\n - Day 2 differential: 20 GB (30 min)\n - Day 6 differential: 60 GB (90 min)\n\nRestore Process:\n 1. Restore full backup\n 2. 
Apply latest differential only\n```\n\n### Snapshot Backup\n\n```yaml\nDescription:\n - Point-in-time copy using storage features\n - Copy-on-write or redirect-on-write\n - Nearly instantaneous\n\nPros:\n - Very fast to create (seconds)\n - Minimal performance impact\n - Multiple snapshots (hourly, daily)\n - Fast rollback\n\nCons:\n - Depends on source storage (not offsite)\n - Storage overhead grows over time\n - Limited retention (storage capacity)\n\nUse Case:\n - Frequent recovery points (hourly)\n - VM backups\n - Database consistency points\n - Pre-change snapshots\n\nExamples:\n - LVM snapshots (Linux)\n - ZFS snapshots\n - VMware snapshots\n - AWS EBS snapshots\n```\n\n### Continuous Data Protection (CDP)\n\n```yaml\nDescription:\n - Real-time or near real-time replication\n - Every change is captured\n - Can recover to any point in time\n\nPros:\n - RPO near zero (\u003c 1 minute)\n - Granular recovery (any point in time)\n - No backup windows\n\nCons:\n - Most expensive\n - Complex to implement\n - High bandwidth requirements\n\nUse Case:\n - Mission-critical databases\n - Financial systems\n - Zero data loss requirement\n\nExamples:\n - Database replication (PostgreSQL streaming replication)\n - Storage replication (DRBD, ZFS replication)\n - Application-level replication\n```\n\n## Backup Tools\n\n### Open Source Backup Tools\n\n**Rsync (File-level)**:\n```bash\n#!/bin/bash\n# Rsync backup script with rotation\n\nSOURCE=\"/var/www\"\nDEST=\"/backup/www\"\nDATE=$(date +%Y%m%d_%H%M%S)\nBACKUP_DIR=\"${DEST}/${DATE}\"\nLATEST_LINK=\"${DEST}/latest\"\n\n# Create backup with hard links to previous backup (space-efficient)\nrsync -avH \\\n --delete \\\n --link-dest=\"${LATEST_LINK}\" \\\n \"${SOURCE}/\" \\\n \"${BACKUP_DIR}/\"\n\n# Update latest symlink\nrm -f \"${LATEST_LINK}\"\nln -s \"${BACKUP_DIR}\" \"${LATEST_LINK}\"\n\n# Retention: Keep 7 daily backups\nfind \"${DEST}\" -maxdepth 1 -type d -mtime +7 -exec rm -rf {} \\;\n\necho \"Backup completed: 
${BACKUP_DIR}\"\n```\n\n**Restic (Encrypted, deduplicated backups)**:\n```bash\n#!/bin/bash\n# Restic backup to S3\n\nexport RESTIC_REPOSITORY=\"s3:s3.amazonaws.com/my-backup-bucket\"\nexport RESTIC_PASSWORD=\"your-encryption-password\"\nexport AWS_ACCESS_KEY_ID=\"your-access-key\"\nexport AWS_SECRET_ACCESS_KEY=\"your-secret-key\"\n\n# Initialize repository (first time only)\n# restic init\n\n# Backup\nrestic backup /var/www /etc --exclude=\"*.log\" --tag daily\n\n# Verify backup\nrestic check\n\n# List snapshots\nrestic snapshots\n\n# Retention: Keep last 7 daily, 4 weekly, 12 monthly\nrestic forget --keep-daily 7 --keep-weekly 4 --keep-monthly 12 --prune\n\n# Restore (to different location)\n# restic restore latest --target /tmp/restore\n```\n\n**Borg Backup (Deduplicated, compressed)**:\n```bash\n#!/bin/bash\n# Borg backup script\n\nREPO=\"/backup/borg-repo\"\nHOSTNAME=$(hostname)\n\n# Initialize repository (first time only)\n# borg init --encryption=repokey ${REPO}\n\n# Create backup\nborg create \\\n --verbose \\\n --stats \\\n --compression lz4 \\\n ${REPO}::${HOSTNAME}-$(date +%Y%m%d_%H%M%S) \\\n /var/www \\\n /etc \\\n --exclude '/var/www/cache/*' \\\n --exclude '*.tmp'\n\n# Prune old backups\nborg prune \\\n --verbose \\\n --list \\\n ${REPO} \\\n --prefix ${HOSTNAME}- \\\n --keep-daily 7 \\\n --keep-weekly 4 \\\n --keep-monthly 6\n\n# List backups\nborg list ${REPO}\n\n# Restore\n# borg extract ${REPO}::${HOSTNAME}-20250115_120000\n```\n\n**Bacula (Enterprise backup suite)**:\n```conf\n# /etc/bacula/bacula-dir.conf - Director configuration\n\nDirector {\n Name = backup-dir\n DIRport = 9101\n QueryFile = \"/etc/bacula/query.sql\"\n WorkingDirectory = \"/var/lib/bacula\"\n PidDirectory = \"/var/run/bacula\"\n Maximum Concurrent Jobs = 20\n Password = \"director-password\"\n Messages = Daemon\n}\n\n# File sets\nFileSet {\n Name = \"WebServer Files\"\n Include {\n Options {\n signature = MD5\n compression = GZIP\n }\n File = /var/www\n File = /etc/nginx\n }\n 
Exclude {\n File = /var/www/cache\n }\n}\n\n# Job definition\nJob {\n Name = \"WebServerBackup\"\n Type = Backup\n Level = Incremental\n Client = webserver01-fd\n FileSet = \"WebServer Files\"\n Schedule = \"WeeklyCycle\"\n Storage = File\n Messages = Standard\n Pool = Default\n Priority = 10\n Write Bootstrap = \"/var/lib/bacula/%c.bsr\"\n}\n\n# Schedule\nSchedule {\n Name = \"WeeklyCycle\"\n Run = Full 1st sun at 23:05\n Run = Differential 2nd-5th sun at 23:05\n Run = Incremental mon-sat at 23:05\n}\n\n# Client\nClient {\n Name = webserver01-fd\n Address = 10.0.10.20\n FDPort = 9102\n Catalog = MyCatalog\n Password = \"client-password\"\n File Retention = 30 days\n Job Retention = 6 months\n AutoPrune = yes\n}\n```\n\n### Commercial Backup Solutions\n\n```yaml\nVeeam Backup & Replication:\n Best For: VMware, Hyper-V environments\n Features:\n - VM-aware backups\n - Instant VM recovery\n - Replication\n - Cloud backups\n Pricing: $$\n\nCommvault:\n Best For: Large enterprises\n Features:\n - Multi-platform (VM, physical, cloud)\n - Compliance and e-discovery\n - Data classification\n Pricing: $$\n\nAcronis Cyber Backup:\n Best For: MSPs, SMBs\n Features:\n - Image-based backups\n - Anti-ransomware\n - Cloud integration\n Pricing: $\n\nRubrik:\n Best For: Modern enterprises\n Features:\n - Cloud-native architecture\n - Instant recovery\n - Policy-driven automation\n Pricing: $$\n```\n\n## Disaster Recovery Planning\n\n### DR Site Types\n\n```yaml\nCold Site:\n Description: Empty datacenter space with basic infrastructure\n RTO: Days to weeks\n Cost: $\n Use Case: Non-critical systems, tight budget\n\n Includes:\n - Physical space\n - Power and cooling\n - Network connectivity\n Does NOT Include:\n - Hardware\n - Software\n - Data\n\nWarm Site:\n Description: Partially equipped datacenter with some systems ready\n RTO: Hours to days\n Cost: $\n Use Case: Medium criticality, balanced budget\n\n Includes:\n - Hardware installed\n - Software installed\n - Network 
configured\n - Older backups restored\n Missing:\n - Latest data (must restore from backup)\n\nHot Site:\n Description: Fully equipped datacenter with live replication\n RTO: Minutes to hours\n Cost: $$\n Use Case: Critical systems, zero data loss\n\n Includes:\n - Identical hardware\n - Real-time data replication\n - Ready to take over immediately\n - Regular testing\n\nActive-Active (No DR site):\n Description: Multiple production sites, all serving traffic\n RTO: Seconds to minutes\n Cost: $$$\n Use Case: Mission-critical, global services\n\n Includes:\n - Load balanced across sites\n - Automatic failover\n - No \"DR site\" (all are production)\n```\n\n### DR Plan Template\n\n```markdown\n# Disaster Recovery Plan\n\n## 1. Scope and Objectives\n\n### Systems Covered\n- Production database cluster\n- Application servers (web tier)\n- API gateway\n- Authentication service\n\n### Recovery Objectives\n- RTO: 4 hours\n- RPO: 1 hour\n- Maximum Tolerable Downtime: 24 hours\n\n## 2. Roles and Responsibilities\n\n| Role | Name | Phone | Email | Responsibility |\n|------|------|-------|-------|----------------|\n| DR Coordinator | John Doe | +1-555-0100 | john.doe@example.com | Overall coordination |\n| Infrastructure Lead | Jane Smith | +1-555-0101 | jane.smith@example.com | Server recovery |\n| Database Lead | Bob Wilson | +1-555-0102 | bob.wilson@example.com | Database recovery |\n| Application Lead | Alice Johnson | +1-555-0103 | alice.johnson@example.com | Application recovery |\n| Communications Lead | Carol Martinez | +1-555-0104 | carol.martinez@example.com | Stakeholder updates |\n\n## 3. Emergency Contact List\n\n### Internal Contacts\n- CTO: +1-555-0200\n- VP Engineering: +1-555-0201\n- On-Call Engineer: PagerDuty escalation\n\n### External Contacts\n- AWS Support: 1-877-632-3000\n- DNS Provider (Cloudflare): support ticket\n- ISP: 1-800-xxx-xxxx\n\n## 4. 
DR Invocation Criteria\n\nInvoke DR plan if:\n- Primary datacenter is inaccessible (fire, flood, power outage > 4 hours)\n- Catastrophic system failure (ransomware, data corruption)\n- Prolonged network outage (> 2 hours)\n- Legal/safety order to evacuate\n\nDecision Maker: CTO or VP Engineering\n\n## 5. Recovery Procedures\n\n### Phase 1: Assessment (0-30 minutes)\n1. Assess extent of disaster\n2. Activate DR team (conference call)\n3. Declare disaster (DR Coordinator)\n4. Notify stakeholders\n5. Update status page\n\n### Phase 2: Failover to DR Site (30 minutes - 2 hours)\n1. Verify DR site accessibility\n2. Restore latest backups to DR site\n - Database: Restore from S3 (1 hour)\n - Application: Deploy from Git (30 minutes)\n3. Update DNS to point to DR site (5 minutes + TTL propagation)\n4. Validate connectivity and functionality\n\n### Phase 3: Service Validation (2-3 hours)\n1. Run smoke tests\n2. Verify database integrity\n3. Test critical user workflows\n4. Monitor error rates and performance\n\n### Phase 4: Operations at DR Site (3-4 hours)\n1. Begin normal operations from DR site\n2. Continuous monitoring\n3. Communicate to users: \"Services restored\"\n\n### Phase 5: Return to Primary (Days/Weeks)\n1. Repair/rebuild primary site\n2. Replicate data back to primary\n3. Scheduled failback (low-traffic window)\n4. Validate primary site\n5. Return to normal operations\n\n## 6. Step-by-Step Recovery\n\n### Database Recovery\n```bash\n# 1. Restore database from S3 backup\naws s3 cp s3://backups/db-latest.sql.gz /tmp/\ngunzip /tmp/db-latest.sql.gz\n\n# 2. Create new database\ncreatedb production\n\n# 3. Restore data\npsql production \u003c /tmp/db-latest.sql\n\n# 4. Verify data\npsql production -c \"SELECT COUNT(*) FROM users;\"\npsql production -c \"SELECT MAX(created_at) FROM orders;\"\n\n# 5. Update connection string in application\n# Edit /etc/app/config.yaml\n# DB_HOST: dr-db.example.com\n```\n\n### Application Recovery\n```bash\n# 1. 
Pull latest code\ncd /opt/app\ngit fetch origin\ngit checkout production\n\n# 2. Install dependencies\npip install -r requirements.txt\n\n# 3. Update configuration for DR environment\ncp config/dr.yaml config/production.yaml\n\n# 4. Start application\nsystemctl start myapp\n\n# 5. Verify\ncurl https://dr.example.com/health\n```\n\n### DNS Failover\n```bash\n# Update DNS to point to DR site\n# Example: Cloudflare API\n\ncurl -X PUT \"https://api.cloudflare.com/client/v4/zones/{zone_id}/dns_records/{record_id}\" \\\n -H \"Authorization: Bearer {api_token}\" \\\n -H \"Content-Type: application/json\" \\\n --data '{\n \"type\": \"A\",\n \"name\": \"www\",\n \"content\": \"203.0.113.100\",\n \"ttl\": 300,\n \"proxied\": false\n }'\n```\n\n## 7. Communication Plan\n\n### Internal Communication\n- Slack channel: #incident-dr\n- Conference bridge: Zoom link\n- Update frequency: Every 30 minutes\n\n### External Communication\n- Status page: status.example.com\n- Twitter: @example_status\n- Email: [email protected]\n- Update frequency: Every hour or when status changes\n\n### Communication Template\n```\nSubject: Service Disruption - Disaster Recovery Activated\n\nWe are experiencing a major service disruption due to [reason].\n\nCurrent Status: Disaster recovery procedures are in progress.\n\nImpact: All services are currently unavailable.\n\nETA: We expect to restore services within 4 hours.\n\nWe will provide updates every hour.\n\nFor more information: https://status.example.com\n```\n\n## 8. Testing Schedule\n\n- Tabletop Exercise: Quarterly\n- DR Drill (partial): Bi-annually\n- Full DR Test: Annually\n\n## 9. Document Maintenance\n\n- Review: Quarterly\n- Owner: DR Coordinator\n- Last Updated: 2025-01-15\n- Next Review: 2025-04-15\n```\n\n## Business Continuity\n\n### Business Impact Analysis (BIA)\n\n```yaml\nPurpose: Identify critical business functions and their dependencies\n\nProcess:\n 1. 
Identify Business Functions:\n - Customer transactions (e-commerce)\n - Customer support (ticket system)\n - Employee email (email server)\n - Payroll (HR system)\n\n 2. Assess Impact of Downtime:\n Per Hour Downtime:\n - E-commerce: $10,000 revenue loss + reputation damage\n - Customer support: Customer satisfaction impact\n - Email: Productivity loss\n - Payroll: Minimal (unless near payday)\n\n 3. Determine RTO/RPO:\n E-commerce: RTO 1 hour, RPO 5 minutes\n Customer support: RTO 4 hours, RPO 1 hour\n Email: RTO 8 hours, RPO 4 hours\n Payroll: RTO 24 hours, RPO 24 hours\n\n 4. Identify Dependencies:\n E-commerce depends on:\n - Web servers\n - Database\n - Payment gateway (external)\n - Inventory system\n\n 5. Develop Recovery Strategies:\n E-commerce: Hot DR site with real-time replication\n Customer support: Warm DR site, daily backups\n Email: Cloud-based (Office 365) - already resilient\n Payroll: Weekly backups, manual processing possible\n```\n\n### BCP vs DR\n\n```yaml\nBusiness Continuity Plan (BCP):\n Scope: Entire business operations\n Focus: Keeping business running during disruption\n Includes:\n - Alternate work locations\n - Manual processes\n - Third-party vendors\n - Supply chain\n - Communication plans\n\n Example: COVID-19 pandemic\n - Work from home policy\n - VPN capacity expansion\n - Video conferencing tools\n - Policy changes\n\nDisaster Recovery Plan (DR):\n Scope: IT systems and data\n Focus: Restoring technology infrastructure\n Includes:\n - Server recovery\n - Data restoration\n - Network failover\n - Application recovery\n\n Example: Datacenter fire\n - Failover to DR site\n - Restore from backups\n - DNS updates\n\nRelationship: DR is a subset of BCP\n```\n\n## Recovery Testing\n\n### Testing Levels\n\n```yaml\nLevel 1: Tabletop Exercise (Quarterly)\n Duration: 2 hours\n Participants: DR team, stakeholders\n Process:\n - Present disaster scenario\n - Walk through DR plan\n - Discuss roles and procedures\n - Identify gaps and 
improvements\n\n No actual systems affected\n\nLevel 2: Partial DR Test (Bi-annually)\n Duration: 4 hours\n Participants: DR team\n Process:\n - Restore backups to DR environment\n - Validate data integrity\n - Test application startup\n - No traffic cutover\n\n Production unaffected\n\nLevel 3: Full DR Test (Annually)\n Duration: 1 day\n Participants: All teams\n Process:\n - Complete failover to DR site\n - Cutover traffic (planned maintenance window)\n - Run in DR mode for 4-8 hours\n - Failback to primary\n\n Brief production impact during cutover\n\nLevel 4: Surprise DR Test (Optional)\n Duration: Variable\n Participants: All teams\n Process:\n - Unannounced DR invocation\n - Tests team readiness\n - Identifies training gaps\n\n High stress, maximum learning\n```\n\n### Test Checklist\n\n```markdown\n# DR Test Checklist\n\n## Pre-Test (1 week before)\n- [ ] Schedule test date and time\n- [ ] Notify all stakeholders\n- [ ] Verify DR site readiness\n- [ ] Verify backup integrity\n- [ ] Review DR procedures with team\n- [ ] Prepare test scenarios\n- [ ] Set up monitoring and logging\n\n## During Test\n- [ ] Start timer (measure RTO)\n- [ ] Activate DR team\n- [ ] Begin recovery procedures\n- [ ] Document all actions taken\n- [ ] Document any deviations from plan\n- [ ] Record issues encountered\n- [ ] Capture screenshots/logs\n\n## Validation\n- [ ] Database connectivity\n- [ ] Application functionality\n- [ ] User authentication\n- [ ] External integrations\n- [ ] Performance metrics\n- [ ] Data integrity checks\n\n## Post-Test\n- [ ] Calculate actual RTO/RPO\n- [ ] Debrief with team (within 48 hours)\n- [ ] Document lessons learned\n- [ ] Create action items for improvements\n- [ ] Update DR plan\n- [ ] Update runbooks\n- [ ] Report results to management\n\n## Metrics to Capture\n- Time to detection: _____ minutes\n- Time to activation: _____ minutes\n- Time to recovery: _____ minutes\n- Data loss: _____ minutes/records\n- Issues encountered: _____\n- Success 
rate: _____%\n```\n\n## Cloud Backup Solutions\n\n### AWS Backup\n\n```yaml\n# AWS Backup plan (CloudFormation)\n\nResources:\n BackupVault:\n Type: AWS::Backup::BackupVault\n Properties:\n BackupVaultName: ProductionBackupVault\n EncryptionKeyArn: !GetAtt BackupKey.Arn\n\n BackupKey:\n Type: AWS::KMS::Key\n Properties:\n Description: Encryption key for backups\n KeyPolicy:\n Statement:\n - Sid: Enable IAM User Permissions\n Effect: Allow\n Principal:\n AWS: !Sub 'arn:aws:iam::${AWS::AccountId}:root'\n Action: 'kms:*'\n Resource: '*'\n\n BackupPlan:\n Type: AWS::Backup::BackupPlan\n Properties:\n BackupPlan:\n BackupPlanName: DailyBackupPlan\n BackupPlanRule:\n - RuleName: DailyBackup\n TargetBackupVault: !Ref BackupVault\n ScheduleExpression: \"cron(0 2 * * ? *)\" # 2 AM daily\n StartWindowMinutes: 60\n CompletionWindowMinutes: 120\n Lifecycle:\n DeleteAfterDays: 30\n MoveToColdStorageAfterDays: 7\n\n BackupSelection:\n Type: AWS::Backup::BackupSelection\n Properties:\n BackupPlanId: !Ref BackupPlan\n BackupSelection:\n SelectionName: ProductionResources\n IamRoleArn: !GetAtt BackupRole.Arn\n Resources:\n - !Sub 'arn:aws:ec2:${AWS::Region}:${AWS::AccountId}:instance/*'\n - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:db:*'\n ListOfTags:\n - ConditionType: STRINGEQUALS\n ConditionKey: Environment\n ConditionValue: Production\n```\n\n## Database Backups\n\n### PostgreSQL Backup\n\n```bash\n#!/bin/bash\n# PostgreSQL backup script with PITR (Point-in-Time Recovery)\n\n# Configuration\nDB_NAME=\"production\"\nDB_USER=\"postgres\"\nBACKUP_DIR=\"/backup/postgres\"\nS3_BUCKET=\"s3://my-db-backups\"\nDATE=$(date +%Y%m%d_%H%M%S)\n\n# Full backup (daily)\npg_dump -U ${DB_USER} -Fc ${DB_NAME} | gzip > ${BACKUP_DIR}/full_${DATE}.dump.gz\n\n# Upload to S3\naws s3 cp ${BACKUP_DIR}/full_${DATE}.dump.gz ${S3_BUCKET}/full/\n\n# Continuous archiving (transaction logs)\n# In postgresql.conf:\n# wal_level = replica\n# archive_mode = on\n# archive_command = 'aws s3 cp %p 
s3://my-db-backups/wal/%f'\n\n# Restore process:\n# 1. Restore from full backup (the dump is gzip-wrapped, so decompress it first)\n# gunzip -c /backup/full_latest.dump.gz | pg_restore -U postgres -d production\n#\n# 2. Create recovery.conf for PITR (PostgreSQL 11 and earlier; on 12+ put these\n# settings in postgresql.conf and create an empty recovery.signal file instead)\n# NOTE: WAL replay (PITR) requires a physical base backup (e.g. pg_basebackup);\n# a pg_dump archive restored in step 1 cannot be rolled forward with WAL.\n# cat > /var/lib/postgresql/data/recovery.conf \u003c\u003cEOF\n# restore_command = 'aws s3 cp s3://my-db-backups/wal/%f %p'\n# recovery_target_time = '2025-01-15 14:30:00'\n# EOF\n#\n# 3. Start PostgreSQL (will replay WAL logs)\n```\n\n### MySQL Backup\n\n```bash\n#!/bin/bash\n# MySQL/MariaDB backup with binlog\n\nDB_NAME=\"production\"\nBACKUP_DIR=\"/backup/mysql\"\nDATE=$(date +%Y%m%d_%H%M%S)\n\n# Full backup with mysqldump\nmysqldump \\\n --single-transaction \\\n --routines \\\n --triggers \\\n --events \\\n --flush-logs \\\n --master-data=2 \\\n ${DB_NAME} | gzip > ${BACKUP_DIR}/full_${DATE}.sql.gz\n\n# Binary log backups (continuous)\n# In my.cnf:\n# log_bin = /var/log/mysql/mysql-bin\n# expire_logs_days = 7\n# sync_binlog = 1\n\n# Backup binary logs\nmysqlbinlog /var/log/mysql/mysql-bin.* | gzip > ${BACKUP_DIR}/binlog_${DATE}.sql.gz\n\n# Point-in-Time Recovery:\n# 1. Restore full backup\n# gunzip \u003c full_20250115_020000.sql.gz | mysql production\n#\n# 2. 
Apply binary logs up to specific time (run mysqlbinlog on the raw binary log\n# files; the binlog_*.sql.gz dumps are already-decoded SQL and cannot be filtered)\n# mysqlbinlog --stop-datetime=\"2025-01-15 14:30:00\" /var/log/mysql/mysql-bin.[0-9]* | mysql production\n```\n\n## Backup Monitoring\n\n### Backup Health Checks\n\n```python\n#!/usr/bin/env python3\n\"\"\"\nBackup monitoring and alerting\n\"\"\"\n\nimport boto3\nimport datetime\nimport smtplib\nfrom email.mime.text import MIMEText\n\ndef check_aws_backup_jobs():\n \"\"\"Check AWS Backup job status\"\"\"\n client = boto3.client('backup')\n\n # Get backup jobs from last 24 hours\n end_time = datetime.datetime.now()\n start_time = end_time - datetime.timedelta(days=1)\n\n response = client.list_backup_jobs(\n ByCreatedAfter=start_time,\n ByCreatedBefore=end_time\n )\n\n failed_jobs = []\n for job in response['BackupJobs']:\n if job['State'] == 'FAILED':\n failed_jobs.append({\n 'BackupJobId': job['BackupJobId'],\n 'ResourceArn': job['ResourceArn'],\n 'StatusMessage': job.get('StatusMessage', 'Unknown error')\n })\n\n return failed_jobs\n\ndef check_backup_age(bucket, prefix, max_age_hours=25):\n \"\"\"Check if backups are recent\"\"\"\n s3 = boto3.client('s3')\n\n response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)\n\n if 'Contents' not in response:\n return [f\"No backups found in {bucket}/{prefix}\"]\n\n # Get most recent backup\n latest = max(response['Contents'], key=lambda x: x['LastModified'])\n age = datetime.datetime.now(datetime.timezone.utc) - latest['LastModified']\n age_hours = age.total_seconds() / 3600\n\n if age_hours > max_age_hours:\n return [f\"Latest backup is {age_hours:.1f} hours old (threshold: {max_age_hours})\"]\n\n return []\n\ndef send_alert(subject, body):\n \"\"\"Send email alert\"\"\"\n msg = MIMEText(body)\n msg['Subject'] = subject\n msg['From'] = 'backup-monitor@example.com'\n msg['To'] = 'ops-team@example.com'\n\n s = smtplib.SMTP('localhost')\n s.send_message(msg)\n s.quit()\n\ndef main():\n issues = []\n\n # Check AWS Backup jobs\n failed_jobs = check_aws_backup_jobs()\n if failed_jobs:\n issues.append(f\"Failed 
backup jobs: {len(failed_jobs)}\")\n for job in failed_jobs:\n issues.append(f\" - {job['ResourceArn']}: {job['StatusMessage']}\")\n\n # Check backup age\n age_issues = check_backup_age('my-backups', 'database/')\n issues.extend(age_issues)\n\n # Alert if issues found\n if issues:\n send_alert(\n 'Backup Health Check FAILED',\n 'Backup issues detected:\\n\\n' + '\\n'.join(issues)\n )\n print('ALERT:', '\\n'.join(issues))\n else:\n print('All backup checks passed')\n\nif __name__ == '__main__':\n main()\n```\n\nThis comprehensive backup and disaster recovery guide provides all the necessary knowledge and procedures for protecting data and ensuring business continuity.\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":26233,"content_sha256":"577f1644cf072b742ed35c5a46ef21feb320b1bbbfabd86075a67fa8d1db91c6"},{"filename":"reference/incident-management.md","content":"# Incident Management\n\nComprehensive guide to incident response, root cause analysis, post-mortems, and building resilient incident management processes.\n\n## Table of Contents\n- [Incident Lifecycle](#incident-lifecycle)\n- [Severity Classification](#severity-classification)\n- [Incident Response Roles](#incident-response-roles)\n- [Communication Protocols](#communication-protocols)\n- [Root Cause Analysis](#root-cause-analysis)\n- [Post-Incident Reviews](#post-incident-reviews)\n- [On-Call Management](#on-call-management)\n- [Incident Management Tools](#incident-management-tools)\n- [Runbook Development](#runbook-development)\n- [Metrics and Improvement](#metrics-and-improvement)\n\n## Incident Lifecycle\n\n### Incident States\n\n```\nDETECTED → ACKNOWLEDGED → INVESTIGATING → IDENTIFIED → RESOLVING → RESOLVED → CLOSED\n\nDetection:\n - Automated alert fired\n - User report received\n - Proactive monitoring identified anomaly\n\nAcknowledgment:\n - On-call engineer confirms receipt\n - Target: \u003c 5 minutes for P1\n\nInvestigation:\n - Gather symptoms and evidence\n - 
Check recent changes\n - Review logs and metrics\n - Identify affected components\n\nIdentification:\n - Root cause hypothesis formed\n - Scope of impact determined\n - Fix or workaround identified\n\nResolution:\n - Implement fix or workaround\n - Validate service restoration\n - Monitor for recurrence\n\nClosure:\n - Confirm no further impact\n - Document resolution\n - Schedule post-incident review\n```\n\n### Incident Response Workflow\n\n```yaml\nPhase 1: Detection & Triage (0-5 minutes)\n Actions:\n - Alert fires or user reports issue\n - On-call acknowledges within 5 minutes\n - Initial severity assessment\n - Create incident ticket\n - Page additional responders if needed\n\n Key Questions:\n - What is the user-facing impact?\n - How many users/customers affected?\n - Is data at risk?\n - Is this getting worse?\n\nPhase 2: Investigation (5-30 minutes)\n Actions:\n - Join incident war room (Slack/Zoom)\n - Assign Incident Commander role\n - Review recent changes (deployments, configs)\n - Check monitoring dashboards\n - Query logs for errors\n - Trace affected requests\n - Form initial hypothesis\n\n Key Questions:\n - What changed recently?\n - What do logs show?\n - Are dependencies healthy?\n - Can we reproduce the issue?\n\nPhase 3: Mitigation (30-60 minutes)\n Actions:\n - Implement fix or workaround\n - Roll back recent changes if needed\n - Scale resources if capacity issue\n - Enable feature flags to disable problematic code\n - Communicate status to stakeholders\n\n Decision Framework:\n - Fast workaround vs proper fix\n - Rollback vs roll forward\n - Partial restoration vs full fix\n\nPhase 4: Recovery (60+ minutes)\n Actions:\n - Validate service metrics returned to normal\n - Confirm user-facing functionality restored\n - Monitor for recurrence (30-60 min)\n - Gradual rollback of workarounds if applied\n - Clear alerts\n\n Success Criteria:\n - Error rate \u003c threshold\n - Latency within SLO\n - No new related alerts\n - User reports 
stopped\n\nPhase 5: Post-Incident (24-72 hours)\n Actions:\n - Schedule post-incident review within 48 hours\n - Complete incident report\n - Create follow-up tickets for fixes\n - Update runbooks\n - Share learnings with team\n```\n\n## Severity Classification\n\n### Priority Definitions\n\n```yaml\nPriority 1 (Critical):\n Description: Complete service outage or critical functionality unavailable\n User Impact: All or most users cannot use the service\n Business Impact: Revenue loss > $10K/hour, major SLA breach\n Data Risk: Active data loss or corruption\n\n Response:\n - Acknowledge: \u003c 5 minutes\n - Initial Response: Immediate (24/7)\n - Communication: Every 30 minutes\n - Escalation: Immediate to leadership\n - All hands on deck: Yes\n\n Examples:\n - Website completely down (503 errors)\n - Database unavailable\n - Payment processing offline\n - Security breach in progress\n - Data corruption affecting production\n\nPriority 2 (High):\n Description: Major functionality degraded or affecting many users\n User Impact: Significant portion of users impacted\n Business Impact: Revenue loss $1K-$10K/hour, SLA at risk\n Data Risk: Potential data issues if not resolved\n\n Response:\n - Acknowledge: \u003c 15 minutes\n - Initial Response: \u003c 30 minutes (during business hours)\n - Communication: Every 1-2 hours\n - Escalation: After 1 hour to team lead\n - All hands: If not resolved in 2 hours\n\n Examples:\n - Critical API endpoint timing out frequently\n - Significant performance degradation (5x normal latency)\n - Key feature unavailable (e.g., checkout flow broken)\n - Elevated error rates (> 5%)\n\nPriority 3 (Medium):\n Description: Partial degradation or minor functionality impaired\n User Impact: Some users affected, workaround available\n Business Impact: Minimal revenue impact\n Data Risk: No data at risk\n\n Response:\n - Acknowledge: \u003c 4 hours (business hours)\n - Initial Response: Same business day\n - Communication: Daily updates\n - 
Escalation: If not resolved in 1 business day\n - All hands: No\n\n Examples:\n - Non-critical feature broken\n - Performance degradation in secondary service\n - Intermittent errors affecting \u003c 1% of requests\n - UI display issues\n\nPriority 4 (Low):\n Description: Minor issues, cosmetic problems, or enhancement requests\n User Impact: Minimal or no user impact\n Business Impact: No business impact\n Data Risk: None\n\n Response:\n - Acknowledge: \u003c 1 business day\n - Initial Response: Next sprint/cycle\n - Communication: Via regular channels\n - Escalation: No escalation needed\n - All hands: No\n\n Examples:\n - Typo in UI\n - Logging issues\n - Minor visual inconsistencies\n - Feature enhancement requests\n```\n\n### Severity Decision Tree\n\n```\nStart: Issue Detected\n |\n v\n[Is service completely down?] → YES → P1\n |\n NO\n v\n[Are multiple users unable to complete critical tasks?] → YES → P2\n |\n NO\n v\n[Is there a workaround available?] → NO → P2\n |\n YES\n v\n[Does it affect revenue or security?] → YES → P2\n |\n NO\n v\n[Are only a few users affected?] → YES → P3\n |\n NO\n v\n[Is it a cosmetic or minor issue?] 
→ YES → P4\n```\n\n## Incident Response Roles\n\n### Role Definitions\n\n```yaml\nIncident Commander (IC):\n Responsibilities:\n - Overall incident leadership\n - Coordinate all responders\n - Make key decisions (rollback, escalate, etc.)\n - Own communication to stakeholders\n - Ensure post-incident review happens\n\n Skills Required:\n - Calm under pressure\n - Strong communication\n - Technical breadth (not necessarily depth)\n - Decision-making ability\n\n Actions:\n - Declare severity and activate response\n - Delegate tasks to responders\n - Maintain incident timeline\n - Communicate status updates\n - Call for additional resources\n - Declare incident resolved\n\n Rotation: Senior engineers, on-call leads\n\nTechnical Lead (TL):\n Responsibilities:\n - Lead technical investigation\n - Propose fixes or workarounds\n - Coordinate debugging efforts\n - Make technical decisions\n\n Skills Required:\n - Deep technical knowledge of systems\n - Debugging expertise\n - System architecture understanding\n\n Actions:\n - Analyze logs, metrics, traces\n - Reproduce issues\n - Test hypotheses\n - Implement fixes\n - Validate resolution\n\n Rotation: Subject matter experts, senior engineers\n\nCommunications Lead (Comms):\n Responsibilities:\n - Internal stakeholder updates\n - External customer communication\n - Status page updates\n - Executive briefings\n\n Skills Required:\n - Clear written communication\n - Ability to translate technical to business impact\n - Stakeholder management\n\n Actions:\n - Post updates to status page\n - Send email updates to customers\n - Update internal stakeholders\n - Prepare executive summary\n - Manage support ticket responses\n\n Rotation: Product managers, customer success, tech leads\n\nScribe:\n Responsibilities:\n - Document all actions taken\n - Maintain incident timeline\n - Record decisions and rationale\n - Note important findings\n\n Skills Required:\n - Attention to detail\n - Fast typing\n - Ability to summarize technical 
discussions\n\n Actions:\n - Log every action in incident ticket\n - Timestamp all events\n - Record chat/call highlights\n - Capture relevant screenshots\n - Document who did what when\n\n Rotation: Junior engineers, on-call secondary\n\nSubject Matter Expert (SME):\n Responsibilities:\n - Provide specialized knowledge\n - Answer specific technical questions\n - Assist with debugging\n\n Skills Required:\n - Deep expertise in specific system/component\n\n Actions:\n - Answer questions about their system\n - Review code/configs\n - Provide context on recent changes\n - Suggest diagnostic steps\n\n Rotation: Engineers owning specific services\n```\n\n### Role Assignment\n\n```yaml\nP1 Incident:\n Roles Required:\n - Incident Commander (required)\n - Technical Lead (required)\n - Scribe (required)\n - Communications Lead (required)\n - SMEs (2-3, as needed)\n\n Activation:\n - IC self-designates or is assigned by on-call manager\n - IC assigns other roles explicitly\n - Clear handoff if roles change\n\nP2 Incident:\n Roles Required:\n - Incident Commander (required)\n - Technical Lead (same person as IC okay)\n - Scribe (optional but recommended)\n - SMEs (1-2, as needed)\n\n Activation:\n - On-call assumes IC role\n - Pulls in SMEs as needed\n\nP3/P4 Incident:\n Roles Required:\n - Single responder handles all roles\n\n Activation:\n - Assigned to team member\n - No formal structure needed\n```\n\n## Communication Protocols\n\n### Communication Channels\n\n```yaml\nWar Room (Slack Channel):\n Purpose: Real-time coordination during active incident\n Naming: #incident-YYYY-MM-DD-description\n Participants: All responders\n Content:\n - Findings and hypotheses\n - Actions being taken\n - Requests for help\n - Decisions made\n Rules:\n - Stay on topic (incident only)\n - Use threads for side discussions\n - No blame or speculation about causes\n\n Example:\n #incident-2025-01-15-api-outage\n\nStatus Page (External):\n Purpose: Customer-facing incident communication\n 
URL: status.company.com\n Update Frequency:\n - P1: Every 15-30 minutes\n - P2: Every 1-2 hours\n - P3: Daily\n Content Template:\n - What is affected\n - Current status\n - Estimated resolution time (if known)\n - Workarounds (if available)\n - Next update time\n\nInternal Stakeholder Updates:\n Purpose: Keep leadership and affected teams informed\n Channels:\n - Email to incidents@company.com\n - Slack to #incidents\n - Direct message to executives (P1 only)\n Update Frequency:\n - P1: Every 30 minutes\n - P2: Every 2 hours\n - P3: End of day\n Content Template:\n - Impact summary (users affected, revenue impact)\n - Current status\n - Actions being taken\n - Estimated resolution\n\nCustomer Support:\n Purpose: Equip support team to handle customer inquiries\n Channels:\n - Update internal wiki/knowledge base\n - Post in #support channel\n - Provide canned responses\n Update Frequency:\n - Immediately when incident declared\n - When status changes\n - When resolved\n```\n\n### Communication Templates\n\n```markdown\n# Initial Incident Notification (Internal)\n\nSubject: [P1] API Service Outage - Investigating\n\nSEVERITY: Priority 1 (Critical)\nSTATUS: Investigating\nIMPACT: All API requests failing with 503 errors, affecting 100% of users\nSTARTED: 2025-01-15 14:32 UTC\nINCIDENT COMMANDER: Jane Doe\n\nCURRENT SITUATION:\n- API gateway returning 503 errors\n- All downstream services appear healthy\n- Investigating gateway configuration\n\nACTIONS TAKEN:\n- Incident declared at 14:35 UTC\n- War room established: #incident-2025-01-15-api-outage\n- Team paged and responding\n\nNEXT STEPS:\n- Reviewing recent deployments\n- Checking gateway logs\n- Preparing rollback plan\n\nNEXT UPDATE: 15:00 UTC (in 30 minutes)\n\n---\n\n# Status Update (Internal)\n\nSubject: [P1] API Service Outage - Root Cause Identified\n\nSEVERITY: Priority 1 (Critical)\nSTATUS: Identified → Resolving\nIMPACT: All API requests failing, affecting 100% of users\nDURATION: 25 minutes\nINCIDENT 
COMMANDER: Jane Doe\n\nROOT CAUSE:\n- Deployment at 14:28 UTC introduced configuration error\n- API gateway max connections set to 10 (should be 10000)\n\nACTIONS TAKEN:\n- Root cause identified via gateway logs\n- Rollback initiated at 14:52 UTC\n- Rollback completed at 14:55 UTC\n- Service returning to normal\n\nCURRENT STATUS:\n- Error rate dropping from 100% to 5%\n- Monitoring for full recovery\n- ETA to full resolution: 15:05 UTC\n\nNEXT UPDATE: 15:10 UTC\n\n---\n\n# Resolution Notification (Internal)\n\nSubject: [P1] API Service Outage - RESOLVED\n\nSEVERITY: Priority 1 (Critical) → RESOLVED\nSTATUS: Resolved\nIMPACT: All API requests were failing for 28 minutes\nDURATION: 28 minutes (14:32 - 15:00 UTC)\nINCIDENT COMMANDER: Jane Doe\n\nRESOLUTION:\n- Service fully restored at 15:00 UTC\n- Root cause: Configuration error in deployment\n- Fix: Rolled back to previous version\n\nIMPACT SUMMARY:\n- Duration: 28 minutes\n- Users Affected: ~50,000 (100% of active users)\n- Failed Requests: ~2.1 million\n- Revenue Impact: ~$4,700 (estimated)\n\nFOLLOW-UP ACTIONS:\n- Post-incident review scheduled: 2025-01-16 10:00 UTC\n- Ticket created to add validation to deployment pipeline\n- Runbook updated with troubleshooting steps\n\nThank you to all responders: Jane Doe, John Smith, Alice Johnson\n\n---\n\n# Customer Status Page Update\n\nTitle: API Service Disruption\n\n[Resolved] - Jan 15, 15:00 UTC\nWe have resolved the issue affecting API requests. All services are now operating normally. We apologize for the disruption and are conducting a thorough review to prevent similar issues.\n\n[Update] - Jan 15, 14:55 UTC\nWe have identified the root cause and implemented a fix. Service is returning to normal. We expect full resolution within 5 minutes.\n\n[Investigating] - Jan 15, 14:35 UTC\nWe are currently investigating an issue affecting API requests. Users may experience errors when accessing our service. Our team is actively working on a resolution. 
We will provide an update in 30 minutes.\n\nWORKAROUND: None available at this time.\n\nIf you have questions, please contact [email protected].\n```\n\n## Root Cause Analysis\n\n### The 5 Whys Technique\n\n```yaml\nExample: Website Down\n\nSymptom: Website returning 503 errors\n\nWhy 1: Why is the website down?\n → The load balancer is marking all backend servers as unhealthy\n\nWhy 2: Why is the load balancer marking servers as unhealthy?\n → Health check requests are timing out after 2 seconds\n\nWhy 3: Why are health checks timing out?\n → Application is taking 5+ seconds to respond to health checks\n\nWhy 4: Why is the application slow to respond?\n → Database connection pool is exhausted (all connections in use)\n\nWhy 5: Why is the connection pool exhausted?\n → Recent deployment increased default connection timeout from 5s to 60s,\n causing connections to be held longer\n\nROOT CAUSE: Configuration change in deployment increased connection timeout,\ncausing connection pool exhaustion and cascading failure.\n\nCORRECTIVE ACTIONS:\n- Immediate: Rollback deployment\n- Short-term: Add connection pool monitoring and alerting\n- Long-term: Review and test all timeout configurations, add capacity planning\n```\n\n### Fishbone (Ishikawa) Diagram\n\n```\n Website Down (503 Errors)\n ↑\n ┌──────────────┬───────────────────┼───────────────────┬──────────────┐\n │ │ │ │ │\n METHODS MACHINES MATERIALS MANPOWER ENVIRONMENT\n │ │ │ │ │\n Deployment Load Balancer Configuration On-call team Traffic spike\n process │ │ │ │\n │ Health checks Timeout settings Understaffed Peak hours\n No staging not tuned │ Inexperienced │\n validation │ Connection pool │ Sudden load\n │ Timeout: 2s size: 10 No runbook │\n No rollback │ │ │ DDoS attack?\n plan Backend slow Default settings No training │\n │ │ not reviewed │ Geographic\n Auto-deploy App response: 5s │ Poor documentation event\n │ │ │ │ │\n └──────────────┴───────────────────┴───────────────────┴──────────────┘\n ↓\n 
Contributing Factors Analysis\n```\n\n### Fault Tree Analysis\n\n```\n Website Unavailable\n ↑\n ┌─────────┴─────────┐\n │ OR │\n ┌───────────┴───────┐ ┌───┴──────────┐\n Load Balancer Backend Servers DNS Failure\n Failure Down (rare)\n ↑ ↑\n │ ┌─────────┴─────────┐\n │ │ AND │\n │ ┌─────┴────┐ ┌────┴─────┐\n │ All Servers Health Check\n │ Unhealthy Failing\n │ ↑ ↑\n │ │ │\n Config Database Response Time\n Error Overload > Timeout\n ↑ ↑ ↑\n │ │ ┌────┴────┐\n Deploy Connection │ OR │\n Failed Pool Full ┌───┴───┐ ┌───┴───┐\n ↑ │ │ │ │\n Timeout Slow Database\n Too High Query Connection\n Pool Size\n Too Small\n\nROOT CAUSE PATH (highlighted):\nWebsite Unavailable ← Backend Servers Down ← All Servers Unhealthy\n← Health Check Failing ← Response Time > Timeout ← Connection Pool Full\n← Timeout Too High ← Deploy Failed (config error)\n```\n\n### Timeline Analysis\n\n```yaml\nIncident Timeline: 2025-01-15 API Outage\n\n14:28:00 - Deployment started (v2.3.1)\n14:30:00 - Deployment completed successfully\n14:32:00 - First spike in error rate (5% → 20%)\n14:32:30 - Error rate continues climbing (20% → 50%)\n14:33:00 - Complete outage (100% errors)\n14:33:15 - Automated alert fired: \"High error rate\"\n14:33:45 - PagerDuty page sent to on-call\n14:35:00 - On-call acknowledged alert [MTTA: 1m 45s]\n14:36:00 - Incident declared P1, war room created\n14:37:00 - Dashboard review shows all backend servers marked unhealthy\n14:38:00 - Health check logs show timeouts\n14:40:00 - Application logs show slow responses on /health endpoint\n14:42:00 - Database connection pool exhaustion identified\n14:45:00 - Recent deployment suspected\n14:48:00 - Config diff reveals timeout change: 5s → 60s\n14:50:00 - Decision made to rollback\n14:52:00 - Rollback initiated\n14:55:00 - Rollback completed\n14:56:00 - Error rate dropping (100% → 10%)\n14:58:00 - Error rate normal (\u003c 1%)\n15:00:00 - Monitoring for 5 minutes, no new errors\n15:05:00 - Incident declared resolved [MTTR: 30 
minutes]\n\nKey Insights:\n- Detection delay: 1 minute 15 seconds (first error to alert)\n- Response time: 2 minutes 45 seconds (alert to war room)\n- Investigation time: 12 minutes (war room to root cause)\n- Resolution time: 5 minutes (decision to rollback complete)\n- Recovery time: 3 minutes (rollback complete to normal error rate)\n\nContributing Factors:\n- ✓ Automated alerting worked\n- ✓ On-call response was fast\n- ✗ No pre-deployment config validation\n- ✗ Timeout change not reviewed in code review\n- ✗ Health check timeout not considered during testing\n- ✗ No staging environment to catch issue\n```\n\n## Post-Incident Reviews\n\n### Blameless Post-Mortem Principles\n\n```yaml\nCore Values:\n 1. No Blame or Punishment:\n - Focus on systems and processes, not individuals\n - Assume everyone acted with best intentions\n - Create psychological safety for honest discussion\n\n 2. Learning Over Judgment:\n - Goal is to improve, not to find fault\n - Celebrate what went well\n - Identify opportunities for improvement\n\n 3. Systems Thinking:\n - Complex systems have complex failures\n - Multiple contributing factors, not single root cause\n - Focus on increasing system resilience\n\n 4. Actionable Outcomes:\n - Every insight must lead to action item\n - Action items must have owners and due dates\n - Track action items to completion\n\n 5. Shared Learning:\n - Share findings with entire organization\n - Build institutional knowledge\n - Prevent similar incidents elsewhere\n```\n\n### Post-Incident Review Template\n\n```markdown\n# Post-Incident Review: API Outage - 2025-01-15\n\n## Metadata\n- **Date**: 2025-01-15\n- **Duration**: 28 minutes (14:32 - 15:00 UTC)\n- **Severity**: P1 (Critical)\n- **Incident Commander**: Jane Doe\n- **Services Affected**: API Gateway, All API Endpoints\n- **Users Impacted**: ~50,000 (100% of active users)\n\n## Executive Summary\nOn January 15, 2025, our API service experienced a complete outage lasting 28 minutes. 
A configuration error in a routine deployment caused the database connection pool to be exhausted, leading to health check failures and all backend servers being marked unhealthy by the load balancer. The issue was resolved by rolling back the deployment. No data was lost, but approximately $4,700 in revenue was lost.\n\n## What Happened (Timeline)\n[Detailed timeline from Timeline Analysis section above]\n\n## Impact Assessment\n\n### User Impact\n- **Affected Users**: 50,000 active users (100%)\n- **User Experience**: Complete inability to access any API functionality\n- **Customer Complaints**: 127 support tickets filed\n- **Duration**: 28 minutes\n\n### Business Impact\n- **Revenue Loss**: ~$4,700 (estimated from transaction volume)\n- **SLO Impact**: The outage consumed 67% of the monthly error budget (99.9% availability SLO)\n- **Reputation**: High-profile users tweeted about outage\n- **Support Cost**: ~40 hours of support time responding to tickets\n\n### Technical Impact\n- **Failed Requests**: ~2.1 million\n- **Data Loss**: None\n- **Services Affected**: All API endpoints\n- **Downstream Dependencies**: Mobile app, web app, third-party integrations\n\n## Root Cause Analysis\n\n### Immediate Cause\nLoad balancer marked all backend servers as unhealthy because health check requests exceeded the 2-second timeout.\n\n### Contributing Factors\n1. **Configuration Change**: Deployment v2.3.1 changed database connection timeout from 5s to 60s\n2. **Connection Pool Exhaustion**: Longer timeouts caused connections to be held longer, exhausting the pool (max 10 connections)\n3. **Slow Health Checks**: With no available connections, health check endpoint took 5+ seconds to respond\n4. **Health Check Timeout**: Load balancer timeout (2s) was lower than application response time (5s)\n5. 
**Lack of Validation**: Configuration change not flagged in code review or tested in staging\n\n### Root Cause\nA configuration change increasing database timeout was not properly reviewed or tested, leading to connection pool exhaustion and cascading failure when deployed to production.\n\n## What Went Well\n\n### Detection\n✓ Automated monitoring detected issue within 3 minutes of complete outage\n✓ Alert fired appropriately with correct severity\n✓ On-call engineer acknowledged within 2 minutes\n\n### Response\n✓ Incident Commander immediately declared P1 and activated full response\n✓ War room established quickly with all necessary responders\n✓ Clear role assignments (IC, TL, Scribe, Comms)\n✓ Excellent communication throughout incident\n✓ Rollback decision made decisively once root cause identified\n\n### Recovery\n✓ Rollback executed cleanly without issues\n✓ Service recovered fully within 5 minutes of rollback\n✓ No data loss or corruption\n✓ Post-incident review scheduled promptly\n\n## What Could Be Improved\n\n### Prevention\n✗ Configuration changes should be validated automatically\n✗ Code review didn't catch the impact of timeout change\n✗ No staging environment to test configuration changes\n✗ Connection pool size (10) too small for production load\n\n### Detection\n✗ 3-minute delay between first errors and alert (gradual degradation not caught)\n✗ No alerting on connection pool saturation\n✗ Health check failures not alerted separately\n\n### Response\n✗ Took 15 minutes to identify root cause (need better debugging tools)\n✗ No runbook for \"all servers unhealthy\" scenario\n✗ Rollback procedure not documented (relied on tribal knowledge)\n\n### Systemic Issues\n✗ No automated rollback on deployment failures\n✗ Configuration changes deployed same as code changes (should have different process)\n✗ No capacity planning for connection pools\n✗ Health check timeout not aligned with application timeouts\n\n## Action Items\n\n### Immediate (1 week)\n- [ ] 
**Add connection pool monitoring** [Owner: John Smith] [Due: 2025-01-22]\n - Alert at 70% utilization (warning)\n - Alert at 85% utilization (critical)\n\n- [ ] **Increase connection pool size** [Owner: Alice Johnson] [Due: 2025-01-22]\n - Calculate appropriate size based on load testing\n - Implement in production (target: 50-100 connections)\n\n- [ ] **Create runbook for \"All servers unhealthy\"** [Owner: Jane Doe] [Due: 2025-01-22]\n - Document diagnostic steps\n - Include rollback procedure\n - Add to on-call documentation\n\n### Short-term (1 month)\n- [ ] **Implement configuration validation** [Owner: Platform Team] [Due: 2025-02-15]\n - Add pre-deployment checks for timeout values\n - Validate connection pool size vs timeout settings\n - Block deployments that fail validation\n\n- [ ] **Set up staging environment** [Owner: DevOps Team] [Due: 2025-02-15]\n - Production-like configuration\n - Automated deployment testing\n - Required step before production deployment\n\n- [ ] **Align health check timeouts** [Owner: Infrastructure Team] [Due: 2025-02-15]\n - Load balancer timeout should be > app timeout\n - Document timeout hierarchy\n - Automate timeout configuration\n\n- [ ] **Implement gradual rollout** [Owner: Platform Team] [Due: 2025-02-28]\n - Canary deployments (10% → 50% → 100%)\n - Automatic rollback on error rate increase\n - Deployment gates based on metrics\n\n### Long-term (3 months)\n- [ ] **Separate config deployment pipeline** [Owner: Architecture Team] [Due: 2025-04-15]\n - Configuration changes reviewed by ops team\n - Gradual rollout for config changes\n - Different approval process than code\n\n- [ ] **Implement synthetic monitoring** [Owner: Observability Team] [Due: 2025-04-15]\n - Proactive health checks from external monitors\n - Alert before complete outage\n - Detect gradual degradation earlier\n\n- [ ] **Capacity planning framework** [Owner: SRE Team] [Due: 2025-04-30]\n - Document sizing guidelines for connection pools\n - Load 
testing requirements\n - Automated capacity recommendations\n\n## Lessons Learned\n\n1. **Configuration is Code**: Configuration changes should be treated with the same rigor as code changes, including review, testing, and validation.\n\n2. **Test in Staging**: A production-like staging environment would have caught this issue before it reached production.\n\n3. **Cascading Failures**: Small changes (timeout adjustment) can have large, unexpected effects. Better understanding of system interactions is needed.\n\n4. **Alerting Gaps**: We alerted on symptoms (errors) but not leading indicators (connection pool saturation). Adding more proactive monitoring would enable earlier intervention.\n\n5. **Response Worked Well**: Despite the outage, our incident response process performed admirably. Clear roles, good communication, and decisive action led to fast resolution.\n\n## Appendix\n\n### Supporting Data\n- [Link to Grafana Dashboard during incident]\n- [Link to error logs]\n- [Link to deployment change log]\n- [Link to war room Slack thread]\n\n### Glossary\n- **MTTA**: Mean Time to Acknowledge\n- **MTTR**: Mean Time to Recovery\n- **SLO**: Service Level Objective\n\n### Related Incidents\n- 2024-11-03: Database connection pool exhaustion (different cause)\n- 2024-09-12: Health check timeout issues on Redis\n\n### Review Attendees\n- Jane Doe (Incident Commander)\n- John Smith (Technical Lead)\n- Alice Johnson (Engineering Manager)\n- Bob Wilson (Product Manager)\n- Carol Martinez (Customer Success)\n```\n\n### Post-Incident Review Meeting Agenda\n\n```yaml\nDuration: 60 minutes\n\n00:00-00:05 (5 min): Intro and Ground Rules\n - Reminder: Blameless, focus on learning\n - Goal: Improve systems and processes\n - Everyone encouraged to participate\n\n00:05-00:15 (10 min): Timeline Walkthrough\n - Incident Commander presents timeline\n - Highlight key events\n - Clarifying questions only (no analysis yet)\n\n00:15-00:25 (10 min): What Went Well\n - Celebrate successes\n 
- What should we keep doing?\n - What practices helped us respond effectively?\n\n00:25-00:40 (15 min): What Could Be Improved\n - Open discussion\n - What could we have done differently?\n - What prevented faster detection/resolution?\n - Surface systemic issues\n\n00:40-00:55 (15 min): Action Items\n - Brainstorm improvements\n - Prioritize by impact and effort\n - Assign owners and due dates\n - Ensure items are specific and actionable\n\n00:55-01:00 (5 min): Wrap-up\n - Review action items\n - Schedule follow-ups\n - Thank participants\n```\n\n## On-Call Management\n\n### On-Call Rotation Best Practices\n\n```yaml\nRotation Schedule:\n Primary On-Call:\n Duration: 1 week (Monday-Monday)\n Responsibilities: First responder for all alerts\n Compensation: Stipend + time off\n\n Secondary On-Call:\n Duration: 1 week (Monday-Monday)\n Responsibilities: Backup for primary, escalation target\n Compensation: Stipend\n\n Rotation Size:\n Minimum: 4 engineers (2 weeks between shifts)\n Recommended: 6-8 engineers (4-6 weeks between shifts)\n Maximum: 12 engineers (risk of losing context)\n\nOn-Call Eligibility:\n Requirements:\n - Completed onboarding (30+ days)\n - Shadowed 2+ on-call shifts\n - Can access production systems\n - Familiar with monitoring and alerting\n - Completed incident response training\n\n Opt-out Reasons:\n - Vacation (blackout dates)\n - Personal circumstances\n - Heavy project deadlines (pre-approved)\n\nSchedule Management:\n Tool: PagerDuty, OpsGenie, or similar\n Visibility: Published 6 weeks in advance\n Changes: Self-service swap with approval\n Coverage: 24/7 for P1/P2, business hours for P3/P4\n\nEscalation Policy:\n Level 1: Primary On-Call (0-5 min)\n Level 2: Secondary On-Call (5-15 min)\n Level 3: Team Lead (15-30 min)\n Level 4: Engineering Manager (30-60 min)\n Level 5: VP Engineering (60+ min, P1 only)\n```\n\n### On-Call Runbook\n\n```markdown\n# On-Call Engineer Guide\n\n## Before Your Shift\n\n### 48 Hours Before\n- [ ] Review the 
on-call schedule\n- [ ] Identify your backup (secondary on-call)\n- [ ] Block calendar for any potential incident response\n- [ ] Review recent incidents and ongoing issues\n\n### 24 Hours Before\n- [ ] Test laptop and VPN access\n- [ ] Test PagerDuty app notifications\n- [ ] Ensure mobile phone is charged\n- [ ] Review monitoring dashboards\n- [ ] Check for scheduled deployments or maintenance\n\n### Start of Shift\n- [ ] Post in #on-call channel: \"Starting on-call shift\"\n- [ ] Review open incidents and alerts\n- [ ] Check upcoming changes in deployment calendar\n- [ ] Skim through recent post-mortems\n- [ ] Verify access to all critical systems\n\n## During Your Shift\n\n### When Alert Fires\n1. **Acknowledge** (within 5 minutes for P1/P2)\n - Open PagerDuty alert\n - Click \"Acknowledge\"\n - Alert stops paging\n\n2. **Initial Assessment** (first 5 minutes)\n - Read alert description\n - Check alert dashboard link\n - Assess severity (is P1 correct?)\n - Check recent changes (deployment, config)\n\n3. 
**Decide Next Steps**\n - If **clear fix**: Implement and monitor\n - If **quick rollback**: Execute rollback\n - If **unclear**: Declare incident and get help\n\n### When to Declare Incident\nDeclare incident (create war room) if:\n- You're unsure how to fix (need help)\n- User impact is significant\n- Will take > 30 minutes to resolve\n- Multiple systems affected\n\n### When to Escalate\nEscalate to secondary on-call if:\n- You're overwhelmed (multiple alerts)\n- You need specific expertise\n- You're stuck (30+ min no progress)\n\nEscalate to manager if:\n- P1 incident lasting > 1 hour\n- User data at risk\n- Security incident\n- Need executive decision\n\n### Alert Hygiene\nAfter each alert:\n- [ ] Update incident ticket with resolution\n- [ ] Mark alert as \"resolved\" in PagerDuty\n- [ ] If false positive: Create ticket to tune alert\n- [ ] If new issue: Create ticket to fix root cause\n- [ ] Update runbook if you learned something new\n\n## End of Shift\n\n### Handoff Checklist\n- [ ] Post in #on-call: \"Ending on-call shift\"\n- [ ] List open incidents and their status\n- [ ] Note any ongoing issues or concerns\n- [ ] Mention scheduled changes in next 24 hours\n- [ ] Brief the incoming on-call and confirm they have taken over\n\n### Feedback and Improvement\n- [ ] Log toil reduction opportunities\n- [ ] Update runbooks based on what you learned\n- [ ] File tickets for alert improvements\n- [ ] Provide feedback on on-call process\n\n## Common Scenarios\n\n### Scenario: High Error Rate Alert\n1. Check dashboard: Which service? Which endpoint?\n2. Check recent deployments: Anything in last hour?\n3. Check logs: What errors are users seeing?\n4. If recent deployment: Consider rollback\n5. If not recent: Investigate dependencies\n\n### Scenario: High Latency Alert\n1. Check dashboard: Which percentile? How high?\n2. Check database: Slow queries? Connection pool full?\n3. Check dependencies: External APIs slow?\n4. Check traffic: Unusual spike in requests?\n5. 
Consider scaling if capacity issue\n\n### Scenario: Service Down Alert\n1. Check monitoring: Complete outage or partial?\n2. Check infrastructure: Servers running? Network okay?\n3. Check recent changes: Deployment? Config change?\n4. Restart if safe (stateless services)\n5. Rollback if recent deployment\n\n## Emergency Contacts\n\nPrimary Escalation:\n- Secondary On-Call: [PagerDuty escalation]\n- Team Lead: [Phone number]\n- Engineering Manager: [Phone number]\n\nSMEs (Subject Matter Experts):\n- Database: [Name, phone]\n- Networking: [Name, phone]\n- Security: [Name, phone]\n- Cloud Infrastructure: [Name, phone]\n\nExternal:\n- Cloud Provider Support: [Phone, ticket system]\n- Third-party Vendor Support: [Phone, ticket system]\n\n## Useful Links\n\nDashboards:\n- [Overall System Health Dashboard]\n- [Service-Specific Dashboards]\n- [Infrastructure Dashboard]\n\nRunbooks:\n- [Runbook Index]\n- [Common Incident Scenarios]\n- [Rollback Procedures]\n\nTools:\n- [PagerDuty Incidents]\n- [Grafana Dashboards]\n- [Log Aggregation (ELK/Splunk)]\n- [Deployment Tool]\n- [ChatOps (Slack)]\n```\n\n## Incident Management Tools\n\n### Tool Comparison\n\n| Feature | PagerDuty | Opsgenie | Splunk On-Call | Incident.io | FireHydrant |\n|---------|-----------|----------|----------------|-------------|-------------|\n| **Alerting** | ✓✓✓ | ✓✓✓ | ✓✓✓ | ✓✓ | ✓✓ |\n| **On-Call Scheduling** | ✓✓✓ | ✓✓✓ | ✓✓✓ | ✓✓ | ✓✓ |\n| **Incident Timeline** | ✓✓ | ✓✓ | ✓ | ✓✓✓ | ✓✓✓ |\n| **Status Page Integration** | ✓✓✓ | ✓✓ | ✓✓ | ✓✓✓ | ✓✓✓ |\n| **Post-Mortem Templates** | ✓ | ✓ | ✓ | ✓✓✓ | ✓✓✓ |\n| **Slack Integration** | ✓✓✓ | ✓✓✓ | ✓✓✓ | ✓✓✓ | ✓✓✓ |\n| **Pricing** | $$ | $$ | $$ | $$ | $$ |\n| **Best For** | Mature teams | Mid-size teams | Splunk users | Modern incident mgmt | Modern incident mgmt |\n\n### PagerDuty Configuration Example\n\n```python\n# PagerDuty API - Create Incident\nimport requests\nimport json\n\nPAGERDUTY_API_KEY = \"YOUR_API_KEY\"\nPAGERDUTY_EMAIL = \"[email 
protected]\"\n\ndef create_incident(title, description, urgency=\"high\", service_id=\"SERVICE_ID\"):\n \"\"\"Create a PagerDuty incident\"\"\"\n\n url = \"https://api.pagerduty.com/incidents\"\n headers = {\n \"Authorization\": f\"Token token={PAGERDUTY_API_KEY}\",\n \"Content-Type\": \"application/json\",\n \"From\": PAGERDUTY_EMAIL\n }\n\n payload = {\n \"incident\": {\n \"type\": \"incident\",\n \"title\": title,\n \"service\": {\n \"id\": service_id,\n \"type\": \"service_reference\"\n },\n \"urgency\": urgency,\n \"body\": {\n \"type\": \"incident_body\",\n \"details\": description\n }\n }\n }\n\n response = requests.post(url, headers=headers, json=payload)\n\n if response.status_code == 201:\n incident = response.json()[\"incident\"]\n print(f\"Incident created: {incident['html_url']}\")\n return incident\n else:\n print(f\"Error: {response.status_code}\")\n print(response.text)\n return None\n\n# Example: Create incident from monitoring alert\nif __name__ == \"__main__\":\n create_incident(\n title=\"High Error Rate on API\",\n description=\"Error rate exceeded 5% for 10 minutes. 
Dashboard: https://grafana.example.com/...\",\n urgency=\"high\"\n )\n```\n\n## Runbook Development\n\n### Runbook Template\n\n```markdown\n# Runbook: [Service Name] - [Scenario]\n\n## Service Overview\n- **Service**: API Gateway\n- **Team**: Backend Team\n- **On-Call**: #backend-oncall\n- **SME**: John Smith ([email protected])\n\n## Purpose\nThis runbook covers troubleshooting and recovery procedures for the API Gateway service.\n\n## Architecture\n```\n[Include architecture diagram or ASCII art]\n\nExternal Clients → API Gateway → Backend Services → Database\n ↓\n Rate Limiter\n Auth Service\n```\n\n## SLIs/SLOs\n- **Availability**: 99.9% (43 minutes downtime/month)\n- **Latency (p95)**: \u003c 500ms\n- **Error Rate**: \u003c 0.1%\n\n## Common Issues\n\n### Issue 1: High Error Rate (5xx Errors)\n\n**Symptoms**:\n- Alert: \"HighErrorRate\" firing\n- Dashboard shows error rate > 5%\n- Users reporting \"Service Unavailable\" errors\n\n**Possible Causes**:\n1. Backend services down or unhealthy\n2. Database connection issues\n3. Recent deployment issue\n4. Upstream dependency failure\n\n**Diagnostic Steps**:\n```bash\n# 1. Check backend service health\nkubectl get pods -n backend\nkubectl describe pod \u003cpod-name> -n backend\n\n# 2. Check API Gateway logs\nkubectl logs -f deployment/api-gateway -n gateway --tail=100\n\n# 3. Check recent deployments\nkubectl rollout history deployment/api-gateway -n gateway\n\n# 4. Check database connections\n# (Connect to app pod and run)\nkubectl exec -it \u003cpod-name> -n backend -- /bin/sh\nnetstat -an | grep 5432 | grep ESTABLISHED | wc -l\n\n# 5. 
Check upstream dependencies\ncurl https://auth-service/health\ncurl https://payment-service/health\n```\n\n**Resolution Steps**:\n\nIf recent deployment (last 30 minutes):\n```bash\n# Rollback deployment\nkubectl rollout undo deployment/api-gateway -n gateway\nkubectl rollout status deployment/api-gateway -n gateway\n\n# Verify error rate dropping\n# Check dashboard: https://grafana.example.com/d/api-gateway\n```\n\nIf database connection issue:\n```bash\n# Restart API Gateway pods (will reset connection pools)\nkubectl rollout restart deployment/api-gateway -n gateway\n\n# Monitor for improvement\nwatch kubectl get pods -n gateway\n```\n\nIf upstream dependency down:\n```bash\n# Check status pages of dependencies\n# Escalate to owning team\n# Consider enabling fallback mode if available\n```\n\n**Escalation**:\n- If not resolved in 15 minutes: Page secondary on-call\n- If backend services issue: Page backend team\n- If database issue: Page database team\n\n### Issue 2: High Latency\n\n**Symptoms**:\n- Alert: \"HighLatency\" firing\n- Dashboard shows p95 latency > 1000ms\n- Users reporting slow responses\n\n**Possible Causes**:\n1. Database slow queries\n2. High traffic / insufficient capacity\n3. Downstream service latency\n4. Memory/CPU saturation\n\n**Diagnostic Steps**:\n```bash\n# 1. Check pod resources\nkubectl top pods -n gateway\n\n# 2. Check HPA status (auto-scaling)\nkubectl get hpa -n gateway\n\n# 3. Check slow queries\n# (Connect to database)\nSELECT pid, query, query_start, state\nFROM pg_stat_activity\nWHERE state != 'idle'\nAND (now() - query_start) > interval '5 seconds'\nORDER BY query_start;\n\n# 4. 
Check downstream services\ncurl -w \"@curl-format.txt\" https://service-a/health\ncurl -w \"@curl-format.txt\" https://service-b/health\n```\n\n**Resolution Steps**:\n\nIf capacity issue (CPU/memory high):\n```bash\n# Scale up deployment\nkubectl scale deployment/api-gateway -n gateway --replicas=10\n\n# Or wait for HPA to scale (if configured)\nkubectl get hpa -n gateway -w\n```\n\nIf slow database queries:\n```sql\n-- Kill long-running query (use with caution)\nSELECT pg_terminate_backend(pid)\nFROM pg_stat_activity\nWHERE pid = \u003cproblematic_pid>;\n```\n\n### Issue 3: Complete Service Outage\n\n**Symptoms**:\n- Alert: \"ServiceDown\" firing\n- Dashboard shows 0 requests/sec\n- All health checks failing\n\n**Immediate Actions**:\n1. Declare P1 incident\n2. Create war room: #incident-YYYY-MM-DD-api-outage\n3. Page backup on-call and team lead\n\n**Diagnostic Steps**:\n```bash\n# 1. Check if pods are running\nkubectl get pods -n gateway\n\n# 2. Check deployment status\nkubectl get deployment api-gateway -n gateway\n\n# 3. Check recent changes\ngit log --since=\"1 hour ago\" --oneline\n\n# 4. 
Check infrastructure (nodes, network)\nkubectl get nodes\nkubectl describe node \u003cnode-name>\n```\n\n**Resolution Steps**:\n[Detailed recovery steps based on cause]\n\n## Related Runbooks\n- [Database Troubleshooting Runbook](link)\n- [Kubernetes Troubleshooting Runbook](link)\n- [Rollback Procedures](link)\n\n## Useful Dashboards\n- [API Gateway Dashboard](https://grafana.example.com/d/api-gateway)\n- [Backend Services Dashboard](https://grafana.example.com/d/backend)\n- [Infrastructure Dashboard](https://grafana.example.com/d/infrastructure)\n\n## Useful Commands\n\n```bash\n# Check logs\nkubectl logs -f deployment/api-gateway -n gateway --tail=100\n\n# Get shell in pod\nkubectl exec -it \u003cpod-name> -n gateway -- /bin/bash\n\n# Port forward to local machine\nkubectl port-forward deployment/api-gateway 8080:8080 -n gateway\n\n# Describe resource\nkubectl describe pod \u003cpod-name> -n gateway\n\n# Check events\nkubectl get events -n gateway --sort-by='.lastTimestamp'\n```\n\n## Recent Changes\n- 2025-01-10: Added HPA configuration (autoscaling)\n- 2024-12-15: Increased connection pool size to 50\n- 2024-11-20: Updated rollback procedure\n\n## Document Info\n- **Last Updated**: 2025-01-15\n- **Owner**: Backend Team\n- **Review Cycle**: Monthly\n```\n\n## Metrics and Improvement\n\n### Key Metrics to Track\n\n```yaml\nIncident Metrics:\n MTTA (Mean Time to Acknowledge):\n Definition: Time from alert to acknowledgment\n Target: \u003c 5 minutes for P1, \u003c 15 minutes for P2\n Calculation: Sum(ack_time - alert_time) / Count(incidents)\n\n MTTI (Mean Time to Identify):\n Definition: Time from acknowledgment to root cause identified\n Target: \u003c 30 minutes for P1, \u003c 2 hours for P2\n Calculation: Sum(identified_time - ack_time) / Count(incidents)\n\n MTTR (Mean Time to Recovery):\n Definition: Time from alert to resolution\n Target: \u003c 1 hour for P1, \u003c 4 hours for P2\n Calculation: Sum(resolved_time - alert_time) / Count(incidents)\n\n MTBF 
(Mean Time Between Failures):\n Definition: Time between incidents\n Target: > 720 hours (30 days)\n Calculation: Total operational time / Count(incidents)\n\nQuality Metrics:\n Incident Recurrence Rate:\n Definition: % of incidents that recur within 90 days\n Target: \u003c 10%\n Calculation: Recurring incidents / Total incidents × 100\n\n Action Item Completion Rate:\n Definition: % of post-incident action items completed on time\n Target: > 90%\n Calculation: Completed on time / Total action items × 100\n\n Runbook Coverage:\n Definition: % of services with up-to-date runbooks\n Target: 100%\n Calculation: Services with runbooks / Total services × 100\n\nOn-Call Metrics:\n Alert Volume:\n Definition: Number of alerts per on-call shift\n Target: \u003c 20 per week\n Measurement: Count by week\n\n False Positive Rate:\n Definition: % of alerts that don't require action\n Target: \u003c 20%\n Calculation: False positives / Total alerts × 100\n\n After-Hours Pages:\n Definition: Pages outside business hours\n Target: \u003c 5 per week\n Measurement: Count by time of day\n```\n\n### Continuous Improvement Process\n\n```yaml\nWeekly:\n Alert Hygiene Review:\n - Review all alerts from past week\n - Identify false positives (> 20% = tune or remove)\n - Update alert thresholds\n - Create tickets for recurring issues\n\n On-Call Feedback:\n - Collect feedback from outgoing on-call\n - Identify toil reduction opportunities\n - Update runbooks based on learnings\n\nMonthly:\n Incident Retrospective:\n - Review all incidents from past month\n - Analyze trends (common causes, affected services)\n - Track MTTA, MTTI, MTTR trends\n - Review action item completion rate\n\n Runbook Audit:\n - Review and update all runbooks\n - Test procedures\n - Remove outdated information\n\n Training:\n - Onboard new on-call engineers\n - Incident response drills/simulations\n - Share learnings from recent incidents\n\nQuarterly:\n Metrics Review:\n - Present incident metrics to leadership\n - 
Track progress on reduction targets\n - Identify systemic issues\n - Celebrate improvements\n\n Process Improvements:\n - Review incident management process\n - Gather team feedback\n - Implement process changes\n - Update documentation\n```\n\nThis comprehensive incident management guide provides all the tools and processes needed for effective incident response and continuous improvement.\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":46805,"content_sha256":"9851e1c7159bf6634bb0c3dd264b0d18e6a6a90ebb02b82814ff3a80401885d7"},{"filename":"reference/infrastructure.md","content":"# Infrastructure Management\n\nComprehensive guide to server management, network operations, capacity planning, and infrastructure operations for IT teams.\n\n## Table of Contents\n- [Server Management](#server-management)\n- [Network Operations](#network-operations)\n- [Capacity Planning](#capacity-planning)\n- [Storage Management](#storage-management)\n- [Virtualization](#virtualization)\n- [Cloud Infrastructure](#cloud-infrastructure)\n- [Infrastructure as Code](#infrastructure-as-code)\n- [Patching and Updates](#patching-and-updates)\n- [Performance Optimization](#performance-optimization)\n- [Cost Management](#cost-management)\n\n## Server Management\n\n### Server Lifecycle\n\n```yaml\nPhase 1: Procurement\n Actions:\n - Define requirements (CPU, RAM, storage, network)\n - Select vendor (Dell, HP, Lenovo, etc.)\n - Purchase or lease decision\n - Order hardware\n Timeline: 4-12 weeks\n\nPhase 2: Provisioning\n Actions:\n - Receive and inventory hardware\n - Rack and cable servers\n - Install operating system\n - Apply baseline configuration\n - Install monitoring agents\n - Document in CMDB\n Timeline: 1-2 days per server\n\nPhase 3: Deployment\n Actions:\n - Install application software\n - Configure networking and firewall rules\n - Set up backups\n - Load balancer configuration\n - Run acceptance tests\n - Hand off to application team\n Timeline: 2-5 
days\n\nPhase 4: Operations (2-5 years)\n Actions:\n - Monitor performance and health\n - Apply security patches\n - Perform maintenance\n - Capacity planning\n - Incident response\n Timeline: 2-5 years typical hardware lifecycle\n\nPhase 5: Decommissioning\n Actions:\n - Migrate workloads to new servers\n - Backup all data\n - Wipe drives (secure erase)\n - Remove from monitoring\n - Update CMDB\n - Physical disposal or return\n Timeline: 1-2 weeks\n```\n\n### Operating System Management\n\n**Linux Server Setup (Ubuntu/RHEL)**:\n```bash\n#!/bin/bash\n# Server baseline configuration script\n\nset -e\n\necho \"=== Server Baseline Configuration ===\"\n\n# 1. System Updates\necho \"Updating system packages...\"\napt-get update && apt-get upgrade -y # Ubuntu/Debian\n# yum update -y # RHEL/CentOS\n\n# 2. Set hostname\nHOSTNAME=\"web-server-01.example.com\"\nhostnamectl set-hostname $HOSTNAME\necho \"Hostname set to: $HOSTNAME\"\n\n# 3. Configure NTP for time synchronization\necho \"Configuring NTP...\"\ntimedatectl set-timezone UTC\napt-get install -y chrony\nsystemctl enable chrony\nsystemctl start chrony\n\n# 4. Configure SSH hardening\necho \"Hardening SSH configuration...\"\nsed -i 's/#PermitRootLogin yes/PermitRootLogin no/' /etc/ssh/sshd_config\nsed -i 's/#PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config\nsed -i 's/#Port 22/Port 2222/' /etc/ssh/sshd_config\nsystemctl restart sshd\n\n# 5. Configure firewall\necho \"Configuring firewall...\"\nufw default deny incoming\nufw default allow outgoing\nufw allow 2222/tcp # SSH\nufw allow 80/tcp # HTTP\nufw allow 443/tcp # HTTPS\nufw --force enable\n\n# 6. 
Install monitoring agent\necho \"Installing monitoring agent...\"\nwget -O /tmp/node_exporter.tar.gz https://github.com/prometheus/node_exporter/releases/download/v1.6.1/node_exporter-1.6.1.linux-amd64.tar.gz\ntar xvfz /tmp/node_exporter.tar.gz -C /opt/\ncat > /etc/systemd/system/node_exporter.service \u003c\u003cEOF\n[Unit]\nDescription=Node Exporter\nAfter=network.target\n\n[Service]\nType=simple\nExecStart=/opt/node_exporter-1.6.1.linux-amd64/node_exporter\nRestart=always\n\n[Install]\nWantedBy=multi-user.target\nEOF\nsystemctl enable node_exporter\nsystemctl start node_exporter\n\n# 7. Install logging agent (rsyslog to centralized server)\necho \"Configuring centralized logging...\"\ncat >> /etc/rsyslog.d/50-remote.conf \u003c\u003cEOF\n*.* @@log-server.example.com:514\nEOF\nsystemctl restart rsyslog\n\n# 8. Install essential tools\necho \"Installing essential tools...\"\napt-get install -y vim tmux htop iotop net-tools curl wget git\n\n# 9. Configure automatic security updates\necho \"Configuring automatic security updates...\"\napt-get install -y unattended-upgrades\ndpkg-reconfigure -plow unattended-upgrades\n\n# 10. Set up user accounts\necho \"Creating ops user...\"\nuseradd -m -s /bin/bash opsuser\nusermod -aG sudo opsuser\nmkdir -p /home/opsuser/.ssh\nchmod 700 /home/opsuser/.ssh\n# Add SSH public keys to /home/opsuser/.ssh/authorized_keys\n\n# 11. Install security tools\necho \"Installing security tools...\"\napt-get install -y fail2ban aide\nsystemctl enable fail2ban\nsystemctl start fail2ban\n\n# 12. 
Document in CMDB\ncurl -X POST https://cmdb.example.com/api/servers \\\n -H \"Content-Type: application/json\" \\\n -d \"{\n \\\"hostname\\\": \\\"$HOSTNAME\\\",\n \\\"ip_address\\\": \\\"$(hostname -I | awk '{print $1}')\\\",\n \\\"os\\\": \\\"$(lsb_release -d | cut -f2)\\\",\n \\\"provisioned_date\\\": \\\"$(date -I)\\\",\n \\\"owner\\\": \\\"ops-team\\\"\n }\"\n\necho \"=== Baseline configuration complete ===\"\n```\n\n**Windows Server Setup (PowerShell)**:\n```powershell\n# Windows Server Baseline Configuration\n\n# 1. Set Computer Name\n$computerName = \"APP-SERVER-01\"\nRename-Computer -NewName $computerName -Force\n\n# 2. Configure Windows Update\nInstall-Module PSWindowsUpdate -Force\nGet-WindowsUpdate\nInstall-WindowsUpdate -AcceptAll -AutoReboot\n\n# 3. Configure Windows Firewall\nSet-NetFirewallProfile -Profile Domain,Public,Private -Enabled True\nNew-NetFirewallRule -DisplayName \"Allow RDP\" -Direction Inbound -LocalPort 3389 -Protocol TCP -Action Allow\nNew-NetFirewallRule -DisplayName \"Allow HTTP\" -Direction Inbound -LocalPort 80 -Protocol TCP -Action Allow\nNew-NetFirewallRule -DisplayName \"Allow HTTPS\" -Direction Inbound -LocalPort 443 -Protocol TCP -Action Allow\n\n# 4. Disable unnecessary services\nSet-Service -Name \"Spooler\" -StartupType Disabled\nSet-Service -Name \"Fax\" -StartupType Disabled\n\n# 5. Install monitoring agent\n$nodeExporterUrl = \"https://github.com/prometheus-community/windows_exporter/releases/download/v0.23.1/windows_exporter-0.23.1-amd64.msi\"\nInvoke-WebRequest -Uri $nodeExporterUrl -OutFile \"$env:TEMP\\windows_exporter.msi\"\nStart-Process msiexec.exe -ArgumentList \"/i $env:TEMP\\windows_exporter.msi /quiet\" -Wait\n\n# 6. Configure Event Log forwarding\nwevtutil set-log ForwardedEvents /enabled:true\nwinrm quickconfig -q\n\n# 7. 
Harden RDP\nNew-ItemProperty -Path \"HKLM:\\System\\CurrentControlSet\\Control\\Terminal Server\\WinStations\\RDP-Tcp\" -Name \"UserAuthentication\" -Value 1 -PropertyType DWORD -Force\nNew-ItemProperty -Path \"HKLM:\\System\\CurrentControlSet\\Control\\Terminal Server\" -Name \"fDenyTSConnections\" -Value 0 -PropertyType DWORD -Force\n\n# 8. Enable BitLocker (if supported)\nEnable-BitLocker -MountPoint \"C:\" -EncryptionMethod Aes256 -RecoveryPasswordProtector\n\nWrite-Host \"Baseline configuration complete. Please reboot.\"\n```\n\n### Server Inventory Management\n\n**CMDB (Configuration Management Database) Schema**:\n```sql\n-- Servers table\nCREATE TABLE servers (\n server_id SERIAL PRIMARY KEY,\n hostname VARCHAR(255) UNIQUE NOT NULL,\n ip_address INET NOT NULL,\n environment VARCHAR(50) NOT NULL, -- production, staging, dev\n location VARCHAR(100) NOT NULL, -- datacenter or cloud region\n server_type VARCHAR(50) NOT NULL, -- physical, virtual, cloud\n os_type VARCHAR(50) NOT NULL, -- linux, windows\n os_version VARCHAR(100) NOT NULL,\n cpu_cores INT NOT NULL,\n ram_gb INT NOT NULL,\n disk_gb INT NOT NULL,\n manufacturer VARCHAR(100),\n model VARCHAR(100),\n serial_number VARCHAR(100),\n purchase_date DATE,\n warranty_expiry DATE,\n owner_team VARCHAR(100) NOT NULL,\n application VARCHAR(255),\n status VARCHAR(50) NOT NULL, -- active, decommissioned, maintenance\n created_at TIMESTAMP DEFAULT NOW(),\n updated_at TIMESTAMP DEFAULT NOW()\n);\n\n-- Network interfaces table\nCREATE TABLE network_interfaces (\n interface_id SERIAL PRIMARY KEY,\n server_id INT REFERENCES servers(server_id),\n interface_name VARCHAR(50) NOT NULL,\n mac_address VARCHAR(17) NOT NULL,\n ip_address INET NOT NULL,\n subnet_mask VARCHAR(18) NOT NULL,\n gateway INET,\n vlan_id INT,\n created_at TIMESTAMP DEFAULT NOW()\n);\n\n-- Installed software table\nCREATE TABLE installed_software (\n software_id SERIAL PRIMARY KEY,\n server_id INT REFERENCES servers(server_id),\n software_name 
VARCHAR(255) NOT NULL,\n version VARCHAR(100) NOT NULL,\n install_date DATE NOT NULL,\n license_key VARCHAR(255),\n license_expiry DATE\n);\n\n-- Patching history table\nCREATE TABLE patch_history (\n patch_id SERIAL PRIMARY KEY,\n server_id INT REFERENCES servers(server_id),\n patch_name VARCHAR(255) NOT NULL,\n patch_date TIMESTAMP NOT NULL,\n patch_status VARCHAR(50) NOT NULL, -- success, failed, rollback\n applied_by VARCHAR(100) NOT NULL,\n reboot_required BOOLEAN DEFAULT false\n);\n\n-- Sample queries\n\n-- Active production servers\nSELECT hostname, ip_address, cpu_cores, ram_gb, owner_team\nFROM servers\nWHERE environment = 'production' AND status = 'active'\nORDER BY hostname;\n\n-- Servers with expiring warranties (next 60 days)\nSELECT hostname, warranty_expiry, (warranty_expiry - CURRENT_DATE) as days_until_expiry\nFROM servers\nWHERE warranty_expiry BETWEEN NOW() AND NOW() + INTERVAL '60 days'\nORDER BY warranty_expiry;\n\n-- Servers by team\nSELECT owner_team, COUNT(*) as server_count, SUM(cpu_cores) as total_cores, SUM(ram_gb) as total_ram\nFROM servers\nWHERE status = 'active'\nGROUP BY owner_team\nORDER BY server_count DESC;\n```\n\n## Network Operations\n\n### Network Architecture\n\n```\nInternet\n ↓\nFirewall (Edge)\n ↓\nDMZ (VLAN 10) - 10.0.10.0/24\n ├─ Load Balancer (10.0.10.10)\n └─ Web Servers (10.0.10.20-29)\n ↓\nInternal Firewall\n ↓\nApplication Zone (VLAN 20) - 10.0.20.0/24\n ├─ App Servers (10.0.20.10-29)\n └─ Message Queue (10.0.20.30)\n ↓\nDatabase Zone (VLAN 30) - 10.0.30.0/24\n ├─ DB Primary (10.0.30.10)\n ├─ DB Replica (10.0.30.11)\n └─ DB Backup (10.0.30.12)\n ↓\nManagement Zone (VLAN 99) - 10.0.99.0/24\n ├─ Monitoring (10.0.99.10)\n ├─ Logging (10.0.99.11)\n └─ Jump Box (10.0.99.20)\n```\n\n### Network Configuration Examples\n\n**Switch VLAN Configuration (Cisco)**:\n```cisco\n! Create VLANs\nvlan 10\n name DMZ\nvlan 20\n name APPLICATION\nvlan 30\n name DATABASE\nvlan 99\n name MANAGEMENT\n\n! 
Configure trunk port (uplink to firewall)\ninterface GigabitEthernet0/1\n description Uplink to Firewall\n switchport mode trunk\n switchport trunk allowed vlan 10,20,30,99\n\n! Configure access port (web server)\ninterface GigabitEthernet0/10\n description Web-Server-01\n switchport mode access\n switchport access vlan 10\n spanning-tree portfast\n\n! Configure port-channel (link aggregation)\ninterface Port-channel1\n description Link to Core Switch\n switchport mode trunk\n switchport trunk allowed vlan 10,20,30,99\n\ninterface GigabitEthernet0/47\n description Member of Port-channel1\n channel-group 1 mode active\n\ninterface GigabitEthernet0/48\n description Member of Port-channel1\n channel-group 1 mode active\n```\n\n**Firewall Rules (iptables)**:\n```bash\n#!/bin/bash\n# Firewall configuration script\n\n# Flush existing rules\niptables -F\niptables -X\niptables -t nat -F\niptables -t nat -X\n\n# Default policies\niptables -P INPUT DROP\niptables -P FORWARD DROP\niptables -P OUTPUT ACCEPT\n\n# Allow loopback\niptables -A INPUT -i lo -j ACCEPT\n\n# Allow established connections\niptables -A INPUT -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT\n\n# Allow SSH (from management network only)\niptables -A INPUT -p tcp --dport 22 -s 10.0.99.0/24 -j ACCEPT\n\n# Allow HTTP/HTTPS\niptables -A INPUT -p tcp --dport 80 -j ACCEPT\niptables -A INPUT -p tcp --dport 443 -j ACCEPT\n\n# Allow ICMP (ping)\niptables -A INPUT -p icmp --icmp-type echo-request -j ACCEPT\n\n# Allow monitoring (Prometheus)\niptables -A INPUT -p tcp --dport 9100 -s 10.0.99.10 -j ACCEPT\n\n# Rate limiting (DDoS protection)\niptables -A INPUT -p tcp --dport 80 -m limit --limit 100/minute --limit-burst 200 -j ACCEPT\niptables -A INPUT -p tcp --dport 443 -m limit --limit 100/minute --limit-burst 200 -j ACCEPT\n\n# Log dropped packets\niptables -A INPUT -m limit --limit 5/min -j LOG --log-prefix \"iptables denied: \" --log-level 7\n\n# Save rules\niptables-save > /etc/iptables/rules.v4\n\necho 
\"Firewall rules configured.\"\n```\n\n**Load Balancer Configuration (HAProxy)**:\n```haproxy\n# /etc/haproxy/haproxy.cfg\n\nglobal\n log /dev/log local0\n log /dev/log local1 notice\n chroot /var/lib/haproxy\n stats socket /run/haproxy/admin.sock mode 660 level admin\n stats timeout 30s\n user haproxy\n group haproxy\n daemon\n\n # SSL/TLS configuration\n ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256\n ssl-default-bind-options ssl-min-ver TLSv1.2\n\ndefaults\n log global\n mode http\n option httplog\n option dontlognull\n timeout connect 5000\n timeout client 50000\n timeout server 50000\n errorfile 400 /etc/haproxy/errors/400.http\n errorfile 403 /etc/haproxy/errors/403.http\n errorfile 408 /etc/haproxy/errors/408.http\n errorfile 500 /etc/haproxy/errors/500.http\n errorfile 502 /etc/haproxy/errors/502.http\n errorfile 503 /etc/haproxy/errors/503.http\n errorfile 504 /etc/haproxy/errors/504.http\n\n# Frontend configuration (HTTPS)\nfrontend https_front\n bind *:443 ssl crt /etc/haproxy/certs/example.com.pem\n default_backend web_servers\n\n # Rate limiting\n stick-table type ip size 100k expire 30s store http_req_rate(10s)\n http-request track-sc0 src\n http-request deny deny_status 429 if { sc_http_req_rate(0) gt 100 }\n\n # Security headers\n http-response set-header Strict-Transport-Security \"max-age=31536000; includeSubDomains\"\n http-response set-header X-Frame-Options \"SAMEORIGIN\"\n http-response set-header X-Content-Type-Options \"nosniff\"\n\n# Backend configuration\nbackend web_servers\n balance roundrobin\n option httpchk GET /health HTTP/1.1\\r\\nHost:\\ example.com\n http-check expect status 200\n\n server web01 10.0.10.20:80 check inter 5s rise 2 fall 3\n server web02 10.0.10.21:80 check inter 5s rise 2 fall 3\n server web03 10.0.10.22:80 check inter 5s rise 2 fall 3\n\n# Statistics page\nlisten stats\n bind *:8404\n stats enable\n stats uri /stats\n stats refresh 30s\n stats auth admin:password123\n```\n\n### 
Network Troubleshooting\n\n**Network Diagnostic Commands**:\n```bash\n# Test connectivity\nping -c 4 8.8.8.8 # Basic connectivity\nping -c 4 google.com # DNS resolution + connectivity\n\n# Trace route\ntraceroute google.com # Linux\ntracert google.com # Windows\nmtr google.com # Continuous traceroute (Linux)\n\n# DNS troubleshooting\nnslookup google.com # Basic DNS lookup\ndig google.com # Detailed DNS query\ndig @8.8.8.8 google.com # Query specific DNS server\n\n# Port connectivity\ntelnet example.com 80 # Test if port is open\nnc -zv example.com 80 # Netcat port scan\ncurl -v https://example.com # HTTP/HTTPS test with verbose output\n\n# Network interfaces\nip addr show # Show IP addresses (Linux)\nip link show # Show interface status\nifconfig # Legacy interface info\nethtool eth0 # Interface details and statistics\n\n# Routing\nip route show # Show routing table\nroute -n # Numeric routing table\nnetstat -rn # Routing table (legacy)\n\n# Active connections\nnetstat -tuln # List listening ports\nss -tuln # Socket statistics (modern replacement)\nlsof -i :80 # Show what's using port 80\n\n# Packet capture\ntcpdump -i eth0 port 80 # Capture HTTP traffic\ntcpdump -i eth0 -w capture.pcap # Write to file\ntcpdump -r capture.pcap # Read from file\n\n# Bandwidth testing\niperf3 -s # Server mode\niperf3 -c server-ip # Client mode\n\n# Network statistics\nnetstat -s # Protocol statistics\nss -s # Socket statistics summary\niftop # Real-time bandwidth by connection\n```\n\n## Capacity Planning\n\n### Capacity Planning Process\n\n```yaml\nStep 1: Collect Baseline Data (Ongoing)\n Metrics to Track:\n - CPU utilization (%, by core)\n - Memory utilization (GB, %)\n - Disk I/O (IOPS, throughput)\n - Network throughput (Mbps)\n - Application metrics (requests/sec, users)\n\n Time Ranges:\n - Real-time (1-minute granularity)\n - Daily averages (for trend analysis)\n - Weekly averages (for seasonality)\n - Monthly aggregates (for year-over-year)\n\nStep 2: Analyze Trends 
(Monthly)\n Questions to Answer:\n - What is the growth rate? (linear, exponential, seasonal)\n - When will current capacity be exhausted?\n - What are the peak utilization periods?\n - Are there any unusual spikes or patterns?\n\n Analysis Methods:\n - Linear regression (simple growth)\n - Time series forecasting (seasonal patterns)\n - Percentile analysis (p50, p95, p99)\n\nStep 3: Forecast Future Demand (Quarterly)\n Inputs:\n - Historical growth trends\n - Business projections (user growth, new features)\n - Upcoming marketing campaigns or events\n - Industry benchmarks\n\n Forecasting Horizons:\n - Short-term (3 months): High confidence\n - Medium-term (6-12 months): Moderate confidence\n - Long-term (12-24 months): Low confidence, scenario planning\n\nStep 4: Capacity Modeling\n Calculate Required Capacity:\n - Current capacity\n - Growth rate\n - Target headroom (20-30%)\n - Expected utilization after expansion\n\n Example:\n Current CPU utilization: 70%\n Growth rate: 10% per month\n In 6 months: 70% × (1.1)^6 = 124% (will exceed capacity)\n Action: Add capacity within 3 months\n\nStep 5: Plan and Execute (As Needed)\n Options:\n - Vertical scaling (add CPU/RAM to existing servers)\n - Horizontal scaling (add more servers)\n - Optimize application (reduce resource usage)\n\n Considerations:\n - Lead time (procurement, deployment)\n - Budget approval process\n - Maintenance windows\n - Risk mitigation (pilot, canary, rollback plan)\n```\n\n### Capacity Planning Calculations\n\n**CPU Capacity**:\n```python\n# CPU capacity planning calculator\n\ndef calculate_cpu_capacity(current_util_pct, growth_rate_monthly, months, target_headroom=0.30):\n \"\"\"\n Calculate when CPU capacity will be exhausted\n\n Args:\n current_util_pct: Current CPU utilization (0-1)\n growth_rate_monthly: Monthly growth rate (e.g., 0.10 for 10%)\n months: Forecast period in months\n target_headroom: Desired headroom (0.30 = 30%)\n\n Returns:\n dict with forecast and recommendations\n 
\"\"\"\n import math\n\n # Calculate future utilization\n future_util = current_util_pct * ((1 + growth_rate_monthly) ** months)\n\n # Calculate when capacity will be exhausted (reach 100%)\n if growth_rate_monthly > 0:\n months_to_exhaustion = math.log(1.0 / current_util_pct) / math.log(1 + growth_rate_monthly)\n else:\n months_to_exhaustion = float('inf')\n\n # Calculate when to add capacity (to maintain headroom)\n target_max_util = 1.0 - target_headroom\n months_to_action = math.log(target_max_util / current_util_pct) / math.log(1 + growth_rate_monthly)\n\n # Calculate required scaling factor\n scaling_factor = future_util / target_max_util if future_util > target_max_util else 1.0\n\n return {\n 'current_utilization_pct': current_util_pct * 100,\n 'forecasted_utilization_pct': future_util * 100,\n 'months_to_exhaustion': months_to_exhaustion,\n 'months_to_action': months_to_action,\n 'scaling_factor': scaling_factor,\n 'recommendation': 'Add capacity' if scaling_factor > 1.0 else 'No action needed'\n }\n\n# Example usage\nresult = calculate_cpu_capacity(\n current_util_pct=0.65, # 65% current utilization\n growth_rate_monthly=0.08, # 8% monthly growth\n months=6, # 6-month forecast\n target_headroom=0.30 # Maintain 30% headroom\n)\n\nprint(f\"Current Utilization: {result['current_utilization_pct']:.1f}%\")\nprint(f\"Forecasted Utilization (6 months): {result['forecasted_utilization_pct']:.1f}%\")\nprint(f\"Months Until Capacity Exhausted: {result['months_to_exhaustion']:.1f}\")\nprint(f\"Months Until Action Needed: {result['months_to_action']:.1f}\")\nprint(f\"Scaling Factor Required: {result['scaling_factor']:.2f}x\")\nprint(f\"Recommendation: {result['recommendation']}\")\n\n# Output:\n# Current Utilization: 65.0%\n# Forecasted Utilization (6 months): 103.1%\n# Months Until Capacity Exhausted: 5.6\n# Months Until Action Needed: 1.0\n# Scaling Factor Required: 1.47x\n# Recommendation: Add capacity\n```\n\n**Storage Capacity**:\n```python\n# Storage capacity 
planning\n\ndef calculate_storage_capacity(current_usage_gb, growth_rate_daily_gb, days, total_capacity_gb):\n \"\"\"Calculate storage capacity forecast\"\"\"\n\n future_usage_gb = current_usage_gb + (growth_rate_daily_gb * days)\n utilization_pct = (future_usage_gb / total_capacity_gb) * 100\n days_to_full = (total_capacity_gb - current_usage_gb) / growth_rate_daily_gb if growth_rate_daily_gb > 0 else float('inf')\n\n return {\n 'current_usage_gb': current_usage_gb,\n 'current_utilization_pct': (current_usage_gb / total_capacity_gb) * 100,\n 'forecasted_usage_gb': future_usage_gb,\n 'forecasted_utilization_pct': utilization_pct,\n 'days_to_full': days_to_full,\n 'recommendation': 'Add storage' if utilization_pct > 80 else 'No action needed'\n }\n\n# Example: Database server storage\nresult = calculate_storage_capacity(\n current_usage_gb=3500, # 3.5 TB currently used\n growth_rate_daily_gb=15, # 15 GB per day growth\n days=90, # 90-day forecast\n total_capacity_gb=5000 # 5 TB total capacity\n)\n\nprint(f\"Current Usage: {result['current_usage_gb']} GB ({result['current_utilization_pct']:.1f}%)\")\nprint(f\"Forecasted Usage (90 days): {result['forecasted_usage_gb']} GB ({result['forecasted_utilization_pct']:.1f}%)\")\nprint(f\"Days Until Full: {result['days_to_full']:.0f}\")\nprint(f\"Recommendation: {result['recommendation']}\")\n\n# Output:\n# Current Usage: 3500 GB (70.0%)\n# Forecasted Usage (90 days): 4850 GB (97.0%)\n# Days Until Full: 100\n# Recommendation: Add storage\n```\n\n### Capacity Planning Dashboard Metrics\n\n```yaml\nCPU Capacity Dashboard:\n - Current Utilization: Gauge (0-100%)\n - 30-Day Trend: Line graph\n - Growth Rate: Percentage per month\n - Months Until 80% Capacity: Number\n - Peak Utilization: Histogram (by hour of day)\n\nMemory Capacity Dashboard:\n - Current Utilization: Gauge (0-100%)\n - Available Memory: GB\n - Memory Pressure Events: Count per day\n - Top Memory Consumers: Table (process, usage)\n\nStorage Capacity Dashboard:\n - 
Disk Usage by Volume: Bar chart\n - Growth Rate: GB per day\n - Days Until Full: Number (by volume)\n - Largest Files/Directories: Table\n\nNetwork Capacity Dashboard:\n - Bandwidth Utilization: Gauge (% of total)\n - Peak Throughput: Mbps\n - Connection Count: Time series\n - Network Errors: Count per minute\n```\n\n## Storage Management\n\n### Storage Types and Use Cases\n\n```yaml\nDirect Attached Storage (DAS):\n Description: Storage directly connected to server (internal drives)\n Use Cases:\n - Operating system\n - Local caching\n - Temporary files\n Pros: Fast, simple, low cost\n Cons: Not shared, limited capacity, no redundancy\n\nNetwork Attached Storage (NAS):\n Description: File-level storage over network (NFS, SMB/CIFS)\n Use Cases:\n - File shares\n - Home directories\n - Backup target\n Pros: Easy to share, centralized management\n Cons: Network dependent, file-level only\n\nStorage Area Network (SAN):\n Description: Block-level storage over dedicated network (FC, iSCSI)\n Use Cases:\n - Databases\n - Virtual machine storage\n - High-performance applications\n Pros: High performance, flexible, scalable\n Cons: Expensive, complex\n\nObject Storage:\n Description: Object/blob storage with metadata (S3, Azure Blob)\n Use Cases:\n - Backups\n - Archives\n - Media files\n - Static website content\n Pros: Unlimited scale, durable, cost-effective\n Cons: Higher latency, no POSIX filesystem\n```\n\n### RAID Configurations\n\n```yaml\nRAID 0 (Striping):\n Configuration: Data split across drives\n Minimum Drives: 2\n Usable Capacity: 100%\n Performance: Excellent (read & write)\n Redundancy: None (any drive failure = data loss)\n Use Case: Non-critical, high-performance (caching)\n\nRAID 1 (Mirroring):\n Configuration: Identical copies on each drive\n Minimum Drives: 2\n Usable Capacity: 50%\n Performance: Good reads, moderate writes\n Redundancy: Can lose 1 drive\n Use Case: OS drives, critical data, small arrays\n\nRAID 5 (Striping with Parity):\n 
Configuration: Data + parity distributed across drives\n Minimum Drives: 3\n Usable Capacity: (N-1)/N (e.g., 3 drives = 67%)\n Performance: Good reads, moderate writes\n Redundancy: Can lose 1 drive\n Use Case: File servers, general purpose\n\nRAID 6 (Striping with Double Parity):\n Configuration: Data + 2 parity blocks distributed\n Minimum Drives: 4\n Usable Capacity: (N-2)/N (e.g., 4 drives = 50%)\n Performance: Good reads, slower writes\n Redundancy: Can lose 2 drives\n Use Case: Large arrays, critical data\n\nRAID 10 (1+0, Mirrored Stripes):\n Configuration: Striped set of mirrors\n Minimum Drives: 4\n Usable Capacity: 50%\n Performance: Excellent (read & write)\n Redundancy: Can lose 1 drive per mirror\n Use Case: Databases, high-performance applications\n\nRecommendation:\n - OS drives: RAID 1 (or RAID 10 for performance)\n - Database: RAID 10 (best performance + redundancy)\n - File servers: RAID 5 or RAID 6 (capacity + redundancy)\n - Backup: RAID 6 (large capacity, double redundancy)\n```\n\n### LVM (Logical Volume Management)\n\n```bash\n# LVM Setup (Linux)\n\n# 1. Initialize physical volumes\npvcreate /dev/sdb\npvcreate /dev/sdc\npvdisplay\n\n# 2. Create volume group\nvgcreate data_vg /dev/sdb /dev/sdc\nvgdisplay data_vg\n\n# 3. Create logical volumes\nlvcreate -L 500G -n database_lv data_vg\nlvcreate -L 1T -n backups_lv data_vg\nlvdisplay\n\n# 4. Create filesystems\nmkfs.ext4 /dev/data_vg/database_lv\nmkfs.xfs /dev/data_vg/backups_lv\n\n# 5. Mount filesystems\nmkdir -p /data/database /data/backups\nmount /dev/data_vg/database_lv /data/database\nmount /dev/data_vg/backups_lv /data/backups\n\n# 6. 
Add to /etc/fstab for persistence\necho \"/dev/data_vg/database_lv /data/database ext4 defaults 0 2\" >> /etc/fstab\necho \"/dev/data_vg/backups_lv /data/backups xfs defaults 0 2\" >> /etc/fstab\n\n# Expand logical volume (online resize)\nlvextend -L +200G /dev/data_vg/database_lv\nresize2fs /dev/data_vg/database_lv # ext4\nxfs_growfs /data/backups # xfs\n\n# Create snapshot (for backups)\nlvcreate -L 50G -s -n database_snap /dev/data_vg/database_lv\nmount /dev/data_vg/database_snap /mnt/snapshot\n# ... perform backup from /mnt/snapshot ...\numount /mnt/snapshot\nlvremove /dev/data_vg/database_snap\n```\n\n## Virtualization\n\n### Virtualization Platforms\n\n```yaml\nVMware vSphere/ESXi:\n Type: Type-1 Hypervisor (bare metal)\n Pros: Mature, feature-rich, excellent management (vCenter)\n Cons: Expensive licensing\n Use Case: Enterprise environments, large deployments\n\nKVM (Kernel-based Virtual Machine):\n Type: Type-1 Hypervisor (Linux kernel module)\n Pros: Open source, high performance, flexible\n Cons: Management tools less mature than VMware\n Use Case: Linux-heavy environments, cost-conscious\n\nMicrosoft Hyper-V:\n Type: Type-1 Hypervisor\n Pros: Tight Windows integration, free with Windows Server\n Cons: Linux guest support limited\n Use Case: Windows-heavy environments\n\nProxmox VE:\n Type: Type-1 Hypervisor (KVM + LXC)\n Pros: Open source, web UI, container support\n Cons: Smaller ecosystem than VMware\n Use Case: Small to medium deployments, mixed VM/container\n```\n\n### VM Management with KVM/QEMU\n\n```bash\n# Install KVM on Ubuntu\napt-get install -y qemu-kvm libvirt-daemon-system libvirt-clients bridge-utils virt-manager\n\n# Start libvirt service\nsystemctl enable libvirtd\nsystemctl start libvirtd\n\n# Create VM from command line\nvirt-install \\\n --name web-server-vm \\\n --ram 4096 \\\n --vcpus 2 \\\n --disk path=/var/lib/libvirt/images/web-server.qcow2,size=50 \\\n --os-type linux \\\n --os-variant ubuntu20.04 \\\n --network bridge=br0 \\\n 
--graphics vnc,listen=0.0.0.0 \\\n --console pty,target_type=serial \\\n --cdrom /var/lib/libvirt/images/ubuntu-20.04-server.iso\n\n# List VMs\nvirsh list --all\n\n# Start/stop VM\nvirsh start web-server-vm\nvirsh shutdown web-server-vm\nvirsh destroy web-server-vm # force stop\n\n# Connect to VM console\nvirsh console web-server-vm\n\n# Clone VM\nvirt-clone \\\n --original web-server-vm \\\n --name web-server-vm-clone \\\n --file /var/lib/libvirt/images/web-server-clone.qcow2\n\n# Take snapshot\nvirsh snapshot-create-as web-server-vm snapshot1 \"Before upgrade\"\n\n# List snapshots\nvirsh snapshot-list web-server-vm\n\n# Revert to snapshot\nvirsh snapshot-revert web-server-vm snapshot1\n\n# Export VM (backup)\nvirsh dumpxml web-server-vm > web-server-vm.xml\ncp /var/lib/libvirt/images/web-server.qcow2 /backups/\n\n# Import VM (restore)\nvirsh define web-server-vm.xml\ncp /backups/web-server.qcow2 /var/lib/libvirt/images/\n```\n\n## Cloud Infrastructure\n\n### Cloud Provider Comparison\n\n| Feature | AWS | Azure | GCP |\n|---------|-----|-------|-----|\n| **Market Share** | ~32% | ~23% | ~10% |\n| **Compute** | EC2 | Virtual Machines | Compute Engine |\n| **Containers** | ECS, EKS | AKS | GKE |\n| **Serverless** | Lambda | Functions | Cloud Functions |\n| **Storage (Object)** | S3 | Blob Storage | Cloud Storage |\n| **Storage (Block)** | EBS | Managed Disks | Persistent Disks |\n| **Database (SQL)** | RDS | SQL Database | Cloud SQL |\n| **Database (NoSQL)** | DynamoDB | Cosmos DB | Firestore/Bigtable |\n| **Networking** | VPC | Virtual Network | VPC |\n| **Load Balancer** | ELB/ALB | Load Balancer | Cloud Load Balancing |\n| **DNS** | Route 53 | DNS | Cloud DNS |\n| **CDN** | CloudFront | CDN | Cloud CDN |\n| **Pricing** | $$ | $$ | $$ |\n\n### AWS EC2 Management\n\n```bash\n# AWS CLI - EC2 Management\n\n# List instances\naws ec2 describe-instances \\\n --filters \"Name=tag:Environment,Values=production\" \\\n --query 
'Reservations[*].Instances[*].[InstanceId,InstanceType,State.Name,PrivateIpAddress]' \\\n --output table\n\n# Start instance\naws ec2 start-instances --instance-ids i-1234567890abcdef0\n\n# Stop instance\naws ec2 stop-instances --instance-ids i-1234567890abcdef0\n\n# Create AMI (backup/template)\naws ec2 create-image \\\n --instance-id i-1234567890abcdef0 \\\n --name \"web-server-backup-$(date +%Y%m%d)\" \\\n --description \"Backup before upgrade\"\n\n# Launch new instance from AMI\naws ec2 run-instances \\\n --image-id ami-0abcdef1234567890 \\\n --count 1 \\\n --instance-type t3.medium \\\n --key-name my-key-pair \\\n --security-group-ids sg-0123456789abcdef0 \\\n --subnet-id subnet-0123456789abcdef0 \\\n --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value=web-server-03}]'\n\n# Create snapshot of EBS volume\naws ec2 create-snapshot \\\n --volume-id vol-1234567890abcdef0 \\\n --description \"Daily backup\"\n\n# Modify instance type (requires stop)\naws ec2 stop-instances --instance-ids i-1234567890abcdef0\naws ec2 modify-instance-attribute \\\n --instance-id i-1234567890abcdef0 \\\n --instance-type \"{\\\"Value\\\": \\\"t3.large\\\"}\"\naws ec2 start-instances --instance-ids i-1234567890abcdef0\n```\n\n## Infrastructure as Code\n\n### Terraform Example\n\n```hcl\n# main.tf - Web server infrastructure\n\nterraform {\n required_version = \">= 1.0\"\n\n required_providers {\n aws = {\n source = \"hashicorp/aws\"\n version = \"~> 5.0\"\n }\n }\n\n backend \"s3\" {\n bucket = \"my-terraform-state\"\n key = \"web-servers/terraform.tfstate\"\n region = \"us-east-1\"\n }\n}\n\nprovider \"aws\" {\n region = var.aws_region\n}\n\n# Variables\nvariable \"aws_region\" {\n default = \"us-east-1\"\n}\n\nvariable \"instance_count\" {\n default = 3\n}\n\nvariable \"instance_type\" {\n default = \"t3.medium\"\n}\n\n# Data source - Latest Ubuntu AMI\ndata \"aws_ami\" \"ubuntu\" {\n most_recent = true\n owners = [\"099720109477\"] # Canonical\n\n filter {\n name = 
\"name\"\n values = [\"ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*\"]\n }\n}\n\n# Security Group\nresource \"aws_security_group\" \"web\" {\n name = \"web-servers-sg\"\n description = \"Security group for web servers\"\n\n ingress {\n from_port = 22\n to_port = 22\n protocol = \"tcp\"\n cidr_blocks = [\"10.0.0.0/8\"] # Internal only\n }\n\n ingress {\n from_port = 80\n to_port = 80\n protocol = \"tcp\"\n cidr_blocks = [\"0.0.0.0/0\"]\n }\n\n ingress {\n from_port = 443\n to_port = 443\n protocol = \"tcp\"\n cidr_blocks = [\"0.0.0.0/0\"]\n }\n\n egress {\n from_port = 0\n to_port = 0\n protocol = \"-1\"\n cidr_blocks = [\"0.0.0.0/0\"]\n }\n\n tags = {\n Name = \"web-servers-sg\"\n Environment = \"production\"\n }\n}\n\n# EC2 Instances\nresource \"aws_instance\" \"web\" {\n count = var.instance_count\n ami = data.aws_ami.ubuntu.id\n instance_type = var.instance_type\n\n vpc_security_group_ids = [aws_security_group.web.id]\n\n user_data = file(\"${path.module}/user-data.sh\")\n\n root_block_device {\n volume_size = 50\n volume_type = \"gp3\"\n }\n\n tags = {\n Name = \"web-server-${count.index + 1}\"\n Environment = \"production\"\n Role = \"web\"\n }\n}\n\n# Load Balancer\nresource \"aws_lb\" \"web\" {\n name = \"web-lb\"\n internal = false\n load_balancer_type = \"application\"\n security_groups = [aws_security_group.web.id]\n subnets = data.aws_subnets.default.ids\n}\n\nresource \"aws_lb_target_group\" \"web\" {\n name = \"web-tg\"\n port = 80\n protocol = \"HTTP\"\n vpc_id = data.aws_vpc.default.id\n\n health_check {\n path = \"/health\"\n healthy_threshold = 2\n unhealthy_threshold = 10\n }\n}\n\nresource \"aws_lb_target_group_attachment\" \"web\" {\n count = var.instance_count\n target_group_arn = aws_lb_target_group.web.arn\n target_id = aws_instance.web[count.index].id\n port = 80\n}\n\nresource \"aws_lb_listener\" \"web\" {\n load_balancer_arn = aws_lb.web.arn\n port = \"80\"\n protocol = \"HTTP\"\n\n default_action {\n type = \"forward\"\n 
target_group_arn = aws_lb_target_group.web.arn\n }\n}\n\n# Outputs\noutput \"instance_ips\" {\n value = aws_instance.web[*].private_ip\n}\n\noutput \"load_balancer_dns\" {\n value = aws_lb.web.dns_name\n}\n```\n\n## Patching and Updates\n\n### Patch Management Process\n\n```yaml\nPhase 1: Planning (Monthly)\n Actions:\n - Review vendor security bulletins\n - Identify critical and high-priority patches\n - Test patches in dev/staging environment\n - Schedule maintenance window\n - Get change approval\n\n Prioritization:\n Critical: Security vulnerabilities (CVSS 9-10) - Apply within 7 days\n High: Security vulnerabilities (CVSS 7-8) - Apply within 30 days\n Medium: Bugs, moderate vulnerabilities - Apply within 90 days\n Low: Feature updates, minor fixes - Apply on regular schedule\n\nPhase 2: Testing (1-2 weeks before production)\n Actions:\n - Deploy patches to non-production environment\n - Run automated tests\n - Perform manual smoke tests\n - Monitor for unexpected issues\n - Document any compatibility issues\n\n Test Criteria:\n - Application starts successfully\n - All critical functionality works\n - No performance degradation\n - No new errors in logs\n\nPhase 3: Deployment (Maintenance Window)\n Actions:\n - Communicate to stakeholders\n - Take pre-patch snapshot/backup\n - Deploy patches in stages (canary approach)\n - Monitor system health\n - Validate functionality\n - Document results\n\n Rollout Strategy:\n - Non-production: 100% at once\n - Production: 10% → 50% → 100% with monitoring\n\nPhase 4: Validation (Post-deployment)\n Actions:\n - Run post-patch tests\n - Monitor for 24-48 hours\n - Check error rates, performance metrics\n - Rollback if issues detected\n - Document lessons learned\n```\n\n### Automated Patching Scripts\n\n**Linux (Ubuntu/Debian)**:\n```bash\n#!/bin/bash\n# Automated patch management script\n\nset -e\n\nLOG_FILE=\"/var/log/patch-management.log\"\nEMAIL_TO=\"[email protected]\"\n\nlog() {\n echo \"[$(date +'%Y-%m-%d %H:%M:%S')] 
$1\" | tee -a $LOG_FILE\n}\n\n# Pre-patch checks\nlog \"Starting pre-patch checks...\"\ndf -h > /tmp/disk-before.txt\nfree -h > /tmp/memory-before.txt\nsystemctl list-units --state=failed > /tmp/failed-services-before.txt\n\n# Create snapshot (if using LVM)\nlog \"Creating LVM snapshot...\"\nlvcreate -L 10G -s -n root_snap /dev/vg0/root\n\n# Update package list\nlog \"Updating package list...\"\napt-get update\n\n# List available updates\nlog \"Available updates:\"\napt list --upgradable | tee -a $LOG_FILE\n\n# Install security updates only\nlog \"Installing security updates...\"\nunattended-upgrade -d\n\n# Or install all updates:\n# apt-get upgrade -y\n\n# Check if reboot required\nif [ -f /var/run/reboot-required ]; then\n log \"Reboot required after patching\"\n cat /var/run/reboot-required.pkgs >> $LOG_FILE\n\n # Schedule reboot (or reboot immediately)\n log \"Scheduling reboot in 5 minutes...\"\n shutdown -r +5 \"System reboot for patches\"\nfi\n\n# Post-patch validation\nlog \"Running post-patch validation...\"\nsystemctl list-units --state=failed > /tmp/failed-services-after.txt\n\n# Compare before/after\nif diff /tmp/failed-services-before.txt /tmp/failed-services-after.txt > /dev/null; then\n log \"No new failed services after patching\"\nelse\n log \"WARNING: New failed services detected!\"\n diff /tmp/failed-services-before.txt /tmp/failed-services-after.txt | tee -a $LOG_FILE\nfi\n\n# Email report\nmail -s \"Patch Report: $(hostname)\" $EMAIL_TO \u003c $LOG_FILE\n\nlog \"Patching complete\"\n```\n\n**Windows (PowerShell)**:\n```powershell\n# Automated Windows patching script\n\n$LogFile = \"C:\\Logs\\patch-management.log\"\n$EmailTo = \"[email protected]\"\n\nfunction Write-Log {\n param([string]$Message)\n $timestamp = Get-Date -Format \"yyyy-MM-dd HH:mm:ss\"\n $logMessage = \"[$timestamp] $Message\"\n Write-Host $logMessage\n Add-Content -Path $LogFile -Value $logMessage\n}\n\n# Install PSWindowsUpdate module if not present\nif (-not (Get-Module 
-ListAvailable -Name PSWindowsUpdate)) {\n Write-Log \"Installing PSWindowsUpdate module...\"\n Install-Module PSWindowsUpdate -Force\n}\n\nImport-Module PSWindowsUpdate\n\n# Pre-patch checks\nWrite-Log \"Starting pre-patch checks...\"\nGet-Service | Where-Object {$_.Status -eq \"Stopped\"} | Out-File C:\\Temp\\stopped-services-before.txt\n\n# Create system restore point\nWrite-Log \"Creating system restore point...\"\nCheckpoint-Computer -Description \"Before Windows Updates\" -RestorePointType MODIFY_SETTINGS\n\n# Get available updates\nWrite-Log \"Checking for updates...\"\n$updates = Get-WindowsUpdate\n\nWrite-Log \"Available updates: $($updates.Count)\"\n$updates | Format-Table Title, KB, Size | Out-String | Write-Log\n\n# Install updates (excluding driver updates)\nWrite-Log \"Installing updates...\"\nInstall-WindowsUpdate -AcceptAll -IgnoreReboot -NotCategory \"Drivers\" | Out-String | Write-Log\n\n# Check if reboot required\nif (Get-WURebootStatus -Silent) {\n Write-Log \"Reboot required after updates\"\n\n # Schedule reboot (or reboot immediately)\n Write-Log \"Scheduling reboot in 5 minutes...\"\n shutdown /r /t 300 /c \"System reboot for Windows Updates\"\n}\n\n# Post-patch validation\nWrite-Log \"Running post-patch validation...\"\nGet-Service | Where-Object {$_.Status -eq \"Stopped\"} | Out-File C:\\Temp\\stopped-services-after.txt\n\n# Email report\nSend-MailMessage `\n -From \"[email protected]\" `\n -To $EmailTo `\n -Subject \"Patch Report: $env:COMPUTERNAME\" `\n -Body (Get-Content $LogFile | Out-String) `\n -SmtpServer \"smtp.example.com\"\n\nWrite-Log \"Patching complete\"\n```\n\n## Performance Optimization\n\n### System Performance Tuning\n\n**Linux Kernel Tuning**:\n```bash\n# /etc/sysctl.conf - Kernel parameter tuning\n\n# Network tuning\nnet.core.somaxconn = 4096 # Max socket connections\nnet.core.netdev_max_backlog = 5000 # Network device queue\nnet.ipv4.tcp_max_syn_backlog = 8192 # SYN backlog queue\nnet.ipv4.tcp_fin_timeout = 15 # FIN 
timeout (default 60)\nnet.ipv4.tcp_keepalive_time = 300 # Keep-alive time\nnet.ipv4.tcp_tw_reuse = 1 # Reuse TIME_WAIT sockets\nnet.ipv4.ip_local_port_range = 10240 65535 # Ephemeral port range\n\n# Memory tuning\nvm.swappiness = 10 # Reduce swap usage (default 60)\nvm.dirty_ratio = 15 # Max dirty pages before write\nvm.dirty_background_ratio = 5 # Background write threshold\n\n# File system tuning\nfs.file-max = 500000 # Max open files system-wide\nfs.inotify.max_user_watches = 524288 # Max inotify watches\n\n# Apply changes\nsysctl -p\n```\n\n**Application Tuning (Nginx Example)**:\n```nginx\n# /etc/nginx/nginx.conf - Performance tuning\n\nuser www-data;\nworker_processes auto; # One per CPU core\nworker_rlimit_nofile 65535;\n\nevents {\n worker_connections 4096;\n use epoll; # Efficient event model on Linux\n multi_accept on;\n}\n\nhttp {\n # Basic settings\n sendfile on;\n tcp_nopush on;\n tcp_nodelay on;\n keepalive_timeout 65;\n types_hash_max_size 2048;\n server_tokens off; # Security: hide version\n\n # Buffer sizes\n client_body_buffer_size 128k;\n client_max_body_size 50m;\n client_header_buffer_size 1k;\n large_client_header_buffers 4 16k;\n output_buffers 1 32k;\n postpone_output 1460;\n\n # Timeouts\n client_body_timeout 12;\n client_header_timeout 12;\n send_timeout 10;\n\n # Gzip compression\n gzip on;\n gzip_vary on;\n gzip_proxied any;\n gzip_comp_level 6;\n gzip_types text/plain text/css application/json application/javascript text/xml application/xml;\n\n # Caching\n open_file_cache max=200000 inactive=20s;\n open_file_cache_valid 30s;\n open_file_cache_min_uses 2;\n open_file_cache_errors on;\n\n # Rate limiting\n limit_req_zone $binary_remote_addr zone=one:10m rate=10r/s;\n limit_conn_zone $binary_remote_addr zone=addr:10m;\n\n server {\n listen 80;\n\n location / {\n limit_req zone=one burst=20 nodelay;\n limit_conn addr 10;\n\n proxy_pass http://backend;\n proxy_http_version 1.1;\n proxy_set_header Connection \"\";\n proxy_buffering on;\n 
proxy_buffer_size 4k;\n proxy_buffers 24 4k;\n proxy_busy_buffers_size 8k;\n }\n }\n}\n```\n\n## Cost Management\n\n### Cloud Cost Optimization Strategies\n\n```yaml\n1. Right-Sizing:\n - Analyze resource utilization (CPU, memory)\n - Downsize over-provisioned instances\n - Upsize under-provisioned instances (to avoid performance issues)\n\n Tools:\n - AWS: AWS Compute Optimizer\n - Azure: Azure Advisor\n - GCP: Recommender\n\n Expected Savings: 20-40%\n\n2. Reserved Instances / Savings Plans:\n - Commit to 1-year or 3-year usage\n - Save up to 72% vs on-demand\n - Analyze usage patterns first\n\n Best For:\n - Steady-state workloads (production databases, web servers)\n - Don't use for: Dev/test, variable workloads\n\n Expected Savings: 30-70%\n\n3. Spot Instances:\n - Use spare cloud capacity at discounted rates (up to 90% off)\n - Can be interrupted with 2-minute notice\n\n Best For:\n - Batch processing, big data, CI/CD\n - Stateless, fault-tolerant workloads\n\n Expected Savings: 50-90%\n\n4. Auto-Scaling:\n - Scale down during off-hours\n - Scale up during peak demand\n\n Example Schedule:\n - Business hours (8am-6pm): 10 instances\n - Off-hours (6pm-8am): 3 instances\n - Weekends: 2 instances\n\n Expected Savings: 30-50%\n\n5. Storage Optimization:\n - Delete unused EBS volumes and snapshots\n - Move infrequently accessed data to cheaper tiers\n - S3 Standard → S3 Infrequent Access → S3 Glacier\n - Enable S3 lifecycle policies\n\n Expected Savings: 20-60% on storage\n\n6. 
Serverless:\n - Replace idle servers with Lambda/Functions\n - Pay only for execution time\n\n Best For:\n - APIs with variable load\n - Event-driven processing\n - Scheduled tasks\n\n Expected Savings: 50-80% for low-to-moderate traffic\n```\n\n### Cost Monitoring Dashboard\n\n```yaml\nCloud Cost Dashboard (Monthly):\n Top Spenders:\n - Service breakdown (EC2, RDS, S3, etc.)\n - Top 10 resources by cost\n - Cost by team/project (using tags)\n\n Trend Analysis:\n - Month-over-month cost change\n - Year-over-year comparison\n - Forecast for next 3 months\n\n Waste Identification:\n - Unused resources (stopped instances, unattached volumes)\n - Over-provisioned resources (\u003c 30% utilization)\n - Untagged resources\n\n Savings Opportunities:\n - RI/Savings Plan recommendations\n - Right-sizing recommendations\n - Storage tier recommendations\n\n Budget Alerts:\n - Warning at 80% of budget\n - Critical at 100% of budget\n - Forecast to exceed budget\n```\n\nThis comprehensive infrastructure management guide provides all the necessary knowledge and tools for effective IT operations.\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":45301,"content_sha256":"aa1ed351f5eb238b83c4eb26f28b4e84b90876f7840de34ac76e302ec1f8dd58"},{"filename":"reference/monitoring.md","content":"# Monitoring and Observability\n\nComprehensive guide to implementing observability, metrics collection, alerting strategies, and dashboard design for IT operations.\n\n## Table of Contents\n- [Observability Principles](#observability-principles)\n- [The Three Pillars](#the-three-pillars)\n- [Metrics Strategy](#metrics-strategy)\n- [Alerting Best Practices](#alerting-best-practices)\n- [Dashboard Design](#dashboard-design)\n- [SLI/SLO/SLA Framework](#slislosla-framework)\n- [Monitoring Tools](#monitoring-tools)\n- [Implementation Examples](#implementation-examples)\n\n## Observability Principles\n\n### Definition\n**Observability**: The ability to understand the internal 
state of a system by examining its external outputs (metrics, logs, traces).\n\n**Monitoring vs Observability**:\n| Monitoring | Observability |\n|------------|---------------|\n| Known unknowns | Unknown unknowns |\n| Predefined dashboards | Exploratory analysis |\n| Threshold-based alerts | Context-aware investigation |\n| \"Is the system up?\" | \"Why is the system behaving this way?\" |\n\n### Key Principles\n\n```yaml\n1. Instrument Everything:\n - Application code (business metrics, errors, latency)\n - Infrastructure (CPU, memory, disk, network)\n - Dependencies (databases, APIs, queues)\n - User experience (frontend performance, transactions)\n\n2. High Cardinality Data:\n - Enable filtering by user_id, region, version, etc.\n - Support arbitrary dimensional queries\n - Example: \"Show me errors for user_id=123 in us-west-2 for version 2.3.1\"\n\n3. Context and Correlation:\n - Link metrics, logs, and traces together\n - Use consistent labels and tags across telemetry\n - Include trace IDs in logs and metrics\n\n4. Real-Time and Historical:\n - Real-time for incident response (\u003c 1 min delay)\n - Historical for trend analysis (retain 13+ months)\n - Different retention policies by data type\n\n5. Self-Service:\n - Empower teams to create their own dashboards\n - Provide query language training\n - Build reusable dashboard templates\n```\n\n## The Three Pillars\n\n### 1. 
Metrics (What)\n\n**Definition**: Numeric measurements over time (counters, gauges, histograms).\n\n**Types**:\n```yaml\nCounter:\n Description: Monotonically increasing value\n Examples:\n - http_requests_total\n - errors_total\n - bytes_sent_total\n Operations: Rate, increase over time\n\nGauge:\n Description: Value that can go up or down\n Examples:\n - cpu_usage_percent\n - memory_available_bytes\n - queue_depth\n Operations: Current value, average, min, max\n\nHistogram:\n Description: Distribution of values in buckets\n Examples:\n - http_request_duration_seconds\n - database_query_duration_seconds\n Operations: Percentiles (p50, p95, p99), averages\n\nSummary:\n Description: Pre-computed percentiles\n Examples:\n - request_latency_summary\n Operations: Pre-defined percentiles\n```\n\n**Metric Naming Convention**:\n```\n{namespace}_{component}_{metric}_{unit}\n\nExamples:\n- api_http_requests_total\n- db_postgres_connections_active\n- cache_redis_hits_total\n- queue_sqs_messages_received_total\n```\n\n### 2. 
Logs (Why)\n\n**Definition**: Timestamped text records of discrete events.\n\n**Log Levels**:\n```yaml\nERROR:\n When: Failures requiring immediate attention\n Example: \"Database connection failed after 3 retries\"\n\nWARN:\n When: Unexpected but handled situations\n Example: \"API rate limit approaching (85% of quota)\"\n\nINFO:\n When: Important business events\n Example: \"User 12345 completed checkout for $150.00\"\n\nDEBUG:\n When: Detailed diagnostic information\n Example: \"Loaded configuration from /etc/app/config.yaml\"\n```\n\n**Structured Logging Format**:\n```json\n{\n \"timestamp\": \"2025-01-15T14:32:10.123Z\",\n \"level\": \"ERROR\",\n \"service\": \"payment-api\",\n \"version\": \"2.3.1\",\n \"environment\": \"production\",\n \"trace_id\": \"a1b2c3d4e5f6\",\n \"span_id\": \"1234567890\",\n \"user_id\": \"user-789\",\n \"message\": \"Payment processing failed\",\n \"error\": {\n \"type\": \"StripeAPIException\",\n \"message\": \"Card declined: insufficient funds\",\n \"stack_trace\": \"...\"\n },\n \"context\": {\n \"amount\": 150.00,\n \"currency\": \"USD\",\n \"payment_method\": \"card_****1234\"\n }\n}\n```\n\n**Log Aggregation Best Practices**:\n```yaml\nCollection:\n - Use lightweight agents (Fluentd, Filebeat, Vector)\n - Buffer locally to handle backend outages\n - Compress during transmission\n - Sample debug logs in high-volume scenarios\n\nStorage:\n - Hot tier (last 7 days): Fast SSD for queries\n - Warm tier (8-90 days): Standard storage\n - Cold tier (90+ days): Archive storage (S3, Glacier)\n\nIndexing:\n - Index critical fields: timestamp, level, service, trace_id, user_id\n - Full-text search on message field\n - Use field extraction for structured logs\n```\n\n### 3. 
Traces (Where)\n\n**Definition**: End-to-end request flow across distributed systems.\n\n**Trace Anatomy**:\n```\nTrace (entire request)\n├─ Span 1: API Gateway (50ms)\n│ ├─ Span 2: Auth Service (10ms)\n│ └─ Span 3: User Service (35ms)\n│ ├─ Span 4: Database Query (20ms)\n│ └─ Span 5: Cache Lookup (5ms)\n└─ Span 6: Response Serialization (5ms)\n\nTotal Trace Duration: 50ms\nCritical Path: Span 1 → Span 3 → Span 4\n```\n\n**Trace Context Propagation**:\n```python\n# OpenTelemetry Python Example\nfrom opentelemetry import trace\nfrom opentelemetry.propagate import inject, extract\n\ntracer = trace.get_tracer(__name__)\n\n# Starting a trace\nwith tracer.start_as_current_span(\"process_order\") as span:\n span.set_attribute(\"order.id\", order_id)\n span.set_attribute(\"order.amount\", amount)\n\n # Propagate context to downstream service\n headers = {}\n inject(headers) # Adds traceparent header\n\n response = requests.post(\n \"https://payment-service/charge\",\n headers=headers,\n json={\"amount\": amount}\n )\n\n if response.status_code != 200:\n span.set_status(Status(StatusCode.ERROR))\n span.record_exception(Exception(\"Payment failed\"))\n```\n\n**Sampling Strategies**:\n```yaml\nAlways Sample:\n - Errors and exceptions (100%)\n - Slow requests (p95+, 100%)\n - Specific user_ids (for debugging, 100%)\n\nHead Sampling (at trace start):\n - Random sampling (1% of all traces)\n - Rate limiting (max 1000 traces/second)\n\nTail Sampling (after trace completion):\n - Sample interesting traces (errors, slow, specific attributes)\n - Requires buffering and additional processing\n - More accurate but higher resource cost\n```\n\n## Metrics Strategy\n\n### The Four Golden Signals (Google SRE)\n\n```yaml\n1. Latency:\n Definition: Time to service a request\n Metrics:\n - http_request_duration_seconds (histogram)\n - Percentiles: p50, p90, p95, p99\n Thresholds:\n - p50 \u003c 100ms\n - p95 \u003c 500ms\n - p99 \u003c 1000ms\n\n2. 
Traffic:\n Definition: Demand on your system\n Metrics:\n - http_requests_per_second (counter rate)\n - active_connections (gauge)\n Analysis:\n - Daily patterns\n - Growth trends\n - Capacity planning\n\n3. Errors:\n Definition: Rate of failed requests\n Metrics:\n - http_requests_total{status=~\"5..\"} (counter)\n - error_rate = errors / total_requests\n Thresholds:\n - Error rate \u003c 0.1% (99.9% success)\n\n4. Saturation:\n Definition: How \"full\" your service is\n Metrics:\n - cpu_usage_percent (gauge)\n - memory_usage_percent (gauge)\n - disk_usage_percent (gauge)\n - connection_pool_utilization (gauge)\n Thresholds:\n - Warning at 70%\n - Critical at 85%\n```\n\n### RED Method (for request-driven services)\n\n```yaml\nRate: Number of requests per second\n PromQL: rate(http_requests_total[5m])\n\nErrors: Number of failed requests per second\n PromQL: rate(http_requests_total{status=~\"5..\"}[5m])\n\nDuration: Time taken per request\n PromQL: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))\n```\n\n### USE Method (for resources)\n\n```yaml\nUtilization: % time resource was busy\n Examples:\n - CPU: 100% - idle%\n - Disk: % time with I/O pending\n - Network: bandwidth used / max bandwidth\n\nSaturation: Amount of queued work\n Examples:\n - CPU: Load average\n - Disk: I/O queue depth\n - Network: retransmit rate\n\nErrors: Count of error events\n Examples:\n - Network: CRC errors, packet loss\n - Disk: I/O errors\n - Memory: OOM kills\n```\n\n### Metric Collection Patterns\n\n**Push vs Pull**:\n```yaml\nPush Model (StatsD, CloudWatch):\n Pros:\n - Application controls when to send\n - Works with ephemeral jobs (batch, Lambda)\n - NAT/firewall friendly\n Cons:\n - Needs aggregation server\n - Can overwhelm receiver\n - Hard to detect silent failures\n\n Use When:\n - Short-lived processes\n - Restricted network (can't open ports)\n - Cloud-native serverless\n\nPull Model (Prometheus):\n Pros:\n - Service discovery integration\n - 
Centralized control of scrape interval\n - Easy to detect down targets\n Cons:\n - Requires open network path\n - Doesn't work well with NAT\n - Challenges with ephemeral jobs\n\n Use When:\n - Long-running services\n - Kubernetes/container environments\n - Need service discovery\n```\n\n**Prometheus Metrics Exposition**:\n```python\n# Python Flask Example\nfrom prometheus_client import Counter, Histogram, Gauge, generate_latest\nfrom flask import Flask, Response\nimport time\n\napp = Flask(__name__)\n\n# Define metrics\nhttp_requests_total = Counter(\n 'http_requests_total',\n 'Total HTTP requests',\n ['method', 'endpoint', 'status']\n)\n\nhttp_request_duration_seconds = Histogram(\n 'http_request_duration_seconds',\n 'HTTP request latency',\n ['method', 'endpoint']\n)\n\nactive_requests = Gauge(\n 'active_requests',\n 'Number of active requests'\n)\n\n@app.before_request\ndef before_request():\n active_requests.inc()\n request.start_time = time.time()\n\n@app.after_request\ndef after_request(response):\n active_requests.dec()\n\n duration = time.time() - request.start_time\n http_request_duration_seconds.labels(\n method=request.method,\n endpoint=request.endpoint or 'unknown'\n ).observe(duration)\n\n http_requests_total.labels(\n method=request.method,\n endpoint=request.endpoint or 'unknown',\n status=response.status_code\n ).inc()\n\n return response\n\n@app.route('/metrics')\ndef metrics():\n return Response(generate_latest(), mimetype='text/plain')\n\n@app.route('/api/users')\ndef get_users():\n # Your application logic\n return {'users': []}\n\nif __name__ == '__main__':\n app.run(port=8080)\n```\n\n**Prometheus Scrape Configuration**:\n```yaml\n# prometheus.yml\nglobal:\n scrape_interval: 15s\n evaluation_interval: 15s\n external_labels:\n cluster: 'production'\n region: 'us-east-1'\n\nscrape_configs:\n - job_name: 'api-servers'\n static_configs:\n - targets:\n - 'api-1.example.com:8080'\n - 'api-2.example.com:8080'\n - 
'api-3.example.com:8080'\n metrics_path: '/metrics'\n scrape_interval: 10s\n\n - job_name: 'kubernetes-pods'\n kubernetes_sd_configs:\n - role: pod\n relabel_configs:\n - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]\n action: keep\n regex: true\n - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]\n action: replace\n target_label: __metrics_path__\n regex: (.+)\n - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]\n action: replace\n regex: ([^:]+)(?::\\d+)?;(\\d+)\n replacement: $1:$2\n target_label: __address__\n```\n\n## Alerting Best Practices\n\n### Alert Design Principles\n\n```yaml\n1. Alerts Must Be Actionable:\n BAD: \"CPU usage is high\"\n GOOD: \"CPU usage > 85% for 10 minutes on app-server-3\"\n\n Every alert should answer:\n - What is wrong?\n - Which component is affected?\n - What should I do about it?\n\n2. Reduce False Positives:\n - Use sustained thresholds (not instantaneous spikes)\n - Example: Alert after 5 minutes > threshold, not first breach\n - Avoid alerting on symptoms if root cause is already alerting\n\n3. Alert on Symptoms, Not Causes:\n BETTER: \"API error rate > 1%\" (user-facing symptom)\n WORSE: \"Redis connection count \u003c 10\" (internal cause)\n\n Exception: Alert on causes that lead to immediate failures\n Example: \"Disk will be full in 4 hours\"\n\n4. Context in Alerts:\n - Include current value and threshold\n - Link to runbook\n - Link to relevant dashboard\n - Include recent changes\n\n5. Appropriate Severity:\n - Page only for urgent, user-impacting issues\n - Ticket for important but not urgent issues\n - Dashboard/log for informational data\n```\n\n### Alert Fatigue Prevention\n\n```yaml\nSymptoms of Alert Fatigue:\n - Acknowledgments without investigation\n - Growing MTTA (Mean Time to Acknowledge)\n - Team frustration and burnout\n - Important alerts getting missed\n\nSolutions:\n 1. 
Alert Hygiene Reviews:\n - Weekly review of all fired alerts\n - Tune or remove alerts with >20% false positive rate\n - Track alert effectiveness metrics\n\n 2. Alert Grouping:\n - Group related alerts (same root cause)\n - Example: Don't alert on every pod failure if deployment is alerting\n\n 3. Dynamic Thresholds:\n - Use anomaly detection instead of static thresholds\n - Adjust thresholds based on time of day/week\n\n 4. Escalation Policies:\n - Primary on-call: 5 min\n - Secondary on-call: 15 min\n - Team lead: 30 min\n - Engineering manager: 60 min\n\n 5. Maintenance Windows:\n - Silence alerts during planned maintenance\n - Auto-create maintenance windows from change tickets\n```\n\n### Prometheus Alerting Rules\n\n```yaml\n# alerts.yml\ngroups:\n - name: api_alerts\n interval: 30s\n rules:\n # High error rate\n - alert: HighErrorRate\n expr: |\n sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (service)\n /\n sum(rate(http_requests_total[5m])) by (service)\n > 0.05\n for: 5m\n labels:\n severity: critical\n team: backend\n annotations:\n summary: \"High error rate on {{ $labels.service }}\"\n description: \"Error rate is {{ $value | humanizePercentage }} on {{ $labels.service }}\"\n runbook: \"https://wiki.example.com/runbooks/high-error-rate\"\n dashboard: \"https://grafana.example.com/d/api-dashboard\"\n\n # High latency (p95)\n - alert: HighLatency\n expr: |\n histogram_quantile(0.95,\n sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service)\n ) > 1.0\n for: 10m\n labels:\n severity: warning\n team: backend\n annotations:\n summary: \"High p95 latency on {{ $labels.service }}\"\n description: \"p95 latency is {{ $value }}s on {{ $labels.service }}\"\n\n # Saturation (CPU)\n - alert: HighCPUUsage\n expr: |\n 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) > 85\n for: 10m\n labels:\n severity: warning\n team: infrastructure\n annotations:\n summary: \"High CPU usage on {{ $labels.instance }}\"\n 
description: \"CPU usage is {{ $value | humanize }}% on {{ $labels.instance }}\"\n\n # Disk space prediction\n - alert: DiskWillFillIn4Hours\n expr: |\n predict_linear(node_filesystem_free_bytes{fstype!~\"tmpfs|fuse.lxcfs\"}[1h], 4*3600) \u003c 0\n for: 5m\n labels:\n severity: critical\n team: infrastructure\n annotations:\n summary: \"Disk will fill on {{ $labels.instance }}\"\n description: \"Filesystem {{ $labels.mountpoint }} will fill in approximately 4 hours\"\n\n # Service down\n - alert: ServiceDown\n expr: up == 0\n for: 5m\n labels:\n severity: critical\n team: infrastructure\n annotations:\n summary: \"Service {{ $labels.job }} is down\"\n description: \"{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes\"\n\n # Certificate expiration\n - alert: CertificateExpiringSoon\n expr: |\n (probe_ssl_earliest_cert_expiry - time()) / 86400 \u003c 14\n for: 1h\n labels:\n severity: warning\n team: infrastructure\n annotations:\n summary: \"SSL certificate expiring soon\"\n description: \"Certificate for {{ $labels.instance }} expires in {{ $value | humanize }} days\"\n```\n\n### PagerDuty Integration\n\n```yaml\n# alertmanager.yml\nglobal:\n resolve_timeout: 5m\n pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'\n\nroute:\n receiver: 'default-receiver'\n group_by: ['alertname', 'cluster', 'service']\n group_wait: 30s\n group_interval: 5m\n repeat_interval: 4h\n\n routes:\n # Critical alerts go to PagerDuty\n - match:\n severity: critical\n receiver: pagerduty-critical\n continue: true\n\n # Warnings go to Slack\n - match:\n severity: warning\n receiver: slack-warnings\n\n # Infrastructure team alerts\n - match:\n team: infrastructure\n receiver: slack-infrastructure\n routes:\n - match:\n severity: critical\n receiver: pagerduty-infrastructure\n\nreceivers:\n - name: 'default-receiver'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/XXX'\n channel: '#alerts'\n title: '{{ .GroupLabels.alertname }}'\n text: 
'{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'\n\n - name: 'pagerduty-critical'\n pagerduty_configs:\n - service_key: 'YOUR_PAGERDUTY_KEY'\n description: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}'\n details:\n firing: '{{ .Alerts.Firing | len }}'\n resolved: '{{ .Alerts.Resolved | len }}'\n num_alerts: '{{ .Alerts | len }}'\n links:\n - href: '{{ .CommonAnnotations.runbook }}'\n text: 'Runbook'\n - href: '{{ .CommonAnnotations.dashboard }}'\n text: 'Dashboard'\n\n - name: 'slack-warnings'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/YYY'\n channel: '#alerts-warnings'\n color: 'warning'\n\ninhibit_rules:\n # Inhibit warning if critical is firing\n - source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n equal: ['alertname', 'service', 'instance']\n```\n\n## Dashboard Design\n\n### Dashboard Principles\n\n```yaml\n1. Audience-Specific Dashboards:\n - Executive Dashboard: Business metrics, SLAs, revenue impact\n - Operations Dashboard: System health, alerts, capacity\n - Development Dashboard: Deployment status, error rates, traces\n - Service Dashboard: Detailed metrics for specific service\n\n2. Information Hierarchy:\n Top: Most critical information (current status)\n Middle: Supporting metrics and trends\n Bottom: Detailed breakdowns and diagnostics\n\n3. Visual Best Practices:\n - Use color purposefully (red=bad, green=good, yellow=warning)\n - Avoid more than 6-8 panels per row\n - Consistent time ranges across panels\n - Include units in axis labels\n - Use logarithmic scale for wide-ranging data\n\n4. Dashboard Variables:\n - Environment (production, staging, dev)\n - Service/Component\n - Time range\n - Region/Datacenter\n\n5. 
Actionable Context:\n - Link panels to detailed views\n - Include threshold lines on graphs\n - Add annotations for deployments/incidents\n```\n\n### Grafana Dashboard Structure\n\n```yaml\nExecutive Dashboard (Business Metrics):\n Row 1: Key Business Metrics\n - Revenue (last hour, today, this month)\n - Active Users (gauge)\n - Transaction Volume (time series)\n - Conversion Rate (percentage)\n\n Row 2: System Health Overview\n - Overall Availability (SLA compliance)\n - P95 Latency Across All Services\n - Error Budget Remaining\n - Active Incidents (count)\n\n Row 3: Trends\n - Revenue Trend (7 days)\n - User Growth (30 days)\n - Error Rate Trend (7 days)\n\nOperations Dashboard (System Health):\n Row 1: Traffic Light Status\n - All Services Status (red/yellow/green stat panels)\n - Active Alerts Count\n - On-Call Engineer\n\n Row 2: Golden Signals\n - Request Rate (requests/sec across all services)\n - Error Rate (% errors)\n - P50/P95/P99 Latency\n - Saturation (CPU, Memory, Disk across fleet)\n\n Row 3: Infrastructure Health\n - CPU Usage by Host (heatmap)\n - Memory Usage by Host\n - Disk Usage by Host\n - Network Traffic\n\n Row 4: Recent Changes\n - Deployments (annotations)\n - Configuration Changes\n - Infrastructure Changes\n\nService-Specific Dashboard:\n Row 1: Service Overview\n - Request Rate\n - Error Rate\n - Latency (p50, p95, p99)\n - Active Instances\n\n Row 2: RED Metrics Breakdown\n - Requests by Endpoint\n - Errors by Type\n - Latency Distribution (histogram)\n\n Row 3: Dependencies\n - Database Query Performance\n - External API Call Performance\n - Cache Hit Rate\n - Queue Depth\n\n Row 4: Resource Usage\n - CPU per Instance\n - Memory per Instance\n - JVM/Runtime Metrics (if applicable)\n```\n\n### Grafana JSON Dashboard Example\n\n```json\n{\n \"dashboard\": {\n \"title\": \"API Service Dashboard\",\n \"tags\": [\"api\", \"production\"],\n \"timezone\": \"browser\",\n \"templating\": {\n \"list\": [\n {\n \"name\": \"environment\",\n 
\"type\": \"query\",\n \"datasource\": \"Prometheus\",\n \"query\": \"label_values(http_requests_total, environment)\",\n \"current\": {\n \"text\": \"production\",\n \"value\": \"production\"\n }\n },\n {\n \"name\": \"service\",\n \"type\": \"query\",\n \"datasource\": \"Prometheus\",\n \"query\": \"label_values(http_requests_total{environment=\\\"$environment\\\"}, service)\",\n \"multi\": true\n }\n ]\n },\n \"annotations\": {\n \"list\": [\n {\n \"name\": \"Deployments\",\n \"datasource\": \"Prometheus\",\n \"expr\": \"deployment_events{service=\\\"$service\\\"}\",\n \"iconColor\": \"green\"\n }\n ]\n },\n \"panels\": [\n {\n \"id\": 1,\n \"title\": \"Request Rate\",\n \"type\": \"graph\",\n \"gridPos\": {\"x\": 0, \"y\": 0, \"w\": 12, \"h\": 8},\n \"targets\": [\n {\n \"expr\": \"sum(rate(http_requests_total{service=\\\"$service\\\", environment=\\\"$environment\\\"}[5m])) by (service)\",\n \"legendFormat\": \"{{service}}\"\n }\n ],\n \"yaxes\": [\n {\"format\": \"reqps\", \"label\": \"Requests/sec\"}\n ]\n },\n {\n \"id\": 2,\n \"title\": \"Error Rate\",\n \"type\": \"graph\",\n \"gridPos\": {\"x\": 12, \"y\": 0, \"w\": 12, \"h\": 8},\n \"targets\": [\n {\n \"expr\": \"sum(rate(http_requests_total{service=\\\"$service\\\", status=~\\\"5..\\\"}[5m])) / sum(rate(http_requests_total{service=\\\"$service\\\"}[5m]))\",\n \"legendFormat\": \"Error Rate\"\n }\n ],\n \"thresholds\": [\n {\n \"value\": 0.01,\n \"colorMode\": \"critical\",\n \"op\": \"gt\",\n \"line\": true,\n \"fill\": true\n }\n ],\n \"yaxes\": [\n {\"format\": \"percentunit\", \"max\": 0.05}\n ]\n },\n {\n \"id\": 3,\n \"title\": \"Latency (p95)\",\n \"type\": \"graph\",\n \"gridPos\": {\"x\": 0, \"y\": 8, \"w\": 12, \"h\": 8},\n \"targets\": [\n {\n \"expr\": \"histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\\\"$service\\\"}[5m])) by (le, endpoint))\",\n \"legendFormat\": \"{{endpoint}}\"\n }\n ],\n \"yaxes\": [\n {\"format\": \"s\", \"label\": \"Duration\"}\n ]\n 
}\n ]\n }\n}\n```\n\n## SLI/SLO/SLA Framework\n\n### Definitions\n\n```yaml\nSLI (Service Level Indicator):\n Definition: Quantitative measure of service level\n Examples:\n - Request latency (95th percentile \u003c 500ms)\n - Availability (% of successful requests)\n - Throughput (requests per second)\n - Data freshness (lag in minutes)\n\nSLO (Service Level Objective):\n Definition: Target value or range for an SLI\n Examples:\n - 99.9% of requests complete in \u003c 500ms\n - 99.95% availability over 30 days\n - Data lag \u003c 5 minutes for 99% of data\n\nSLA (Service Level Agreement):\n Definition: Contractual commitment with consequences\n Examples:\n - 99.9% uptime or customer gets credit\n - \u003c500ms p95 latency or penalty payment\n```\n\n### SLO Design\n\n```yaml\n1. Choose Meaningful SLIs:\n User-Facing:\n - Availability: Can users access the service?\n - Latency: How fast do requests complete?\n - Quality: Are results correct/fresh?\n\n Behind-the-Scenes:\n - Throughput: Can system handle load?\n - Durability: Is data safe?\n - Correctness: Are computations accurate?\n\n2. Set Realistic SLOs:\n - Start with current performance baseline\n - Add buffer for improvement (don't set SLO = current performance)\n - Consider user expectations and business requirements\n - Remember: 100% is the wrong SLO (no room for changes)\n\n Example:\n Current p95 latency: 300ms\n User expectation: \u003c 1 second\n Set SLO: 500ms (between current and user max tolerance)\n\n3. Error Budget:\n Formula: Error Budget = 100% - SLO\n\n Example:\n SLO: 99.9% availability\n Error Budget: 0.1% = 43.2 minutes/month\n\n Use:\n - Budget consumed = Actual downtime / Error budget\n - If budget exhausted: Freeze deployments, focus on reliability\n - If budget remaining: Safe to take risks (new features, refactors)\n\n4. 
Multi-Window SLOs:\n - Short window (7 days): Detect immediate issues\n - Long window (30 days): Track trends\n - Rolling window: Continuous monitoring\n\n Example:\n 7-day SLO: 99.5% (allows 50 minutes downtime)\n 30-day SLO: 99.9% (allows 43 minutes downtime)\n```\n\n### SLO Monitoring with Prometheus\n\n```yaml\n# SLO recording rules\ngroups:\n - name: slo_recording_rules\n interval: 30s\n rules:\n # Total requests\n - record: slo:http_requests:total\n expr: sum(rate(http_requests_total[5m]))\n\n # Successful requests (not 5xx)\n - record: slo:http_requests:success\n expr: sum(rate(http_requests_total{status!~\"5..\"}[5m]))\n\n # Availability SLI (success rate)\n - record: slo:availability:ratio\n expr: slo:http_requests:success / slo:http_requests:total\n\n # Latency SLI (% of requests under threshold)\n - record: slo:latency:good_requests\n expr: |\n sum(rate(http_request_duration_seconds_bucket{le=\"0.5\"}[5m]))\n\n - record: slo:latency:ratio\n expr: slo:latency:good_requests / slo:http_requests:total\n\n # Error budget calculation (30-day window)\n - record: slo:error_budget:availability:30d\n expr: |\n 1 - (\n (1 - 0.999) / # SLO target\n (1 - avg_over_time(slo:availability:ratio[30d]))\n )\n\n# SLO alerting rules\n - name: slo_alerts\n rules:\n # Availability SLO burn rate alerts\n - alert: AvailabilitySLOBurnRateCritical\n expr: |\n (\n slo:availability:ratio \u003c 0.999 # Below SLO\n and\n (1 - slo:availability:ratio) > 14.4 * (1 - 0.999) # Burn rate > 14.4x (will exhaust budget in 2 days)\n )\n for: 5m\n labels:\n severity: critical\n annotations:\n summary: \"Critical SLO burn rate\"\n description: \"At current rate, 30-day error budget will be exhausted in 2 days\"\n\n - alert: AvailabilitySLOBurnRateWarning\n expr: |\n (\n slo:availability:ratio \u003c 0.999\n and\n (1 - slo:availability:ratio) > 6 * (1 - 0.999) # Burn rate > 6x\n )\n for: 30m\n labels:\n severity: warning\n annotations:\n summary: \"Elevated SLO burn rate\"\n description: \"Error 
budget consumption is higher than expected\"\n\n # Error budget exhausted\n - alert: ErrorBudgetExhausted\n expr: slo:error_budget:availability:30d \u003c= 0\n for: 5m\n labels:\n severity: critical\n annotations:\n summary: \"Error budget exhausted\"\n description: \"30-day error budget is exhausted. Freeze non-critical changes.\"\n```\n\n### SLO Dashboard Example\n\n```json\n{\n \"dashboard\": {\n \"title\": \"SLO Dashboard\",\n \"panels\": [\n {\n \"title\": \"Availability SLO (30 days)\",\n \"type\": \"gauge\",\n \"targets\": [{\n \"expr\": \"avg_over_time(slo:availability:ratio[30d])\"\n }],\n \"options\": {\n \"min\": 0.99,\n \"max\": 1.0,\n \"thresholds\": [\n {\"value\": 0.999, \"color\": \"green\"},\n {\"value\": 0.995, \"color\": \"yellow\"},\n {\"value\": 0.99, \"color\": \"red\"}\n ]\n }\n },\n {\n \"title\": \"Error Budget Remaining\",\n \"type\": \"gauge\",\n \"targets\": [{\n \"expr\": \"slo:error_budget:availability:30d\"\n }],\n \"options\": {\n \"min\": 0,\n \"max\": 1,\n \"thresholds\": [\n {\"value\": 0.5, \"color\": \"green\"},\n {\"value\": 0.25, \"color\": \"yellow\"},\n {\"value\": 0, \"color\": \"red\"}\n ]\n }\n },\n {\n \"title\": \"Error Budget Burn Rate\",\n \"type\": \"graph\",\n \"targets\": [{\n \"expr\": \"(1 - slo:availability:ratio) / (1 - 0.999)\",\n \"legendFormat\": \"Burn Rate (1x = normal consumption)\"\n }],\n \"yaxes\": [{\n \"label\": \"Burn Rate Multiplier\"\n }],\n \"alert\": {\n \"threshold\": 1,\n \"message\": \"Burn rate above normal\"\n }\n },\n {\n \"title\": \"SLO Compliance History\",\n \"type\": \"table\",\n \"targets\": [{\n \"expr\": \"avg_over_time(slo:availability:ratio[7d])\",\n \"format\": \"table\",\n \"legendFormat\": \"7 days\"\n }]\n }\n ]\n }\n}\n```\n\n## Monitoring Tools\n\n### Tool Comparison Matrix\n\n| Tool | Best For | Strengths | Weaknesses | Cost |\n|------|----------|-----------|------------|------|\n| **Prometheus + Grafana** | Kubernetes, metrics | Open source, powerful querying, service 
discovery | Logs/traces need separate tools, scale challenges | Free (self-hosted) |\n| **Datadog** | Full-stack observability | All-in-one, easy setup, great UX | Expensive at scale, vendor lock-in | $$ |\n| **New Relic** | APM, application performance | Deep code insights, distributed tracing | Can be complex, pricing | $$ |\n| **ELK Stack** | Log aggregation, search | Powerful search, flexible, open source | Complex to operate, resource-intensive | Free-$ |\n| **Splunk** | Enterprise logs, security | Mature, powerful, compliance features | Very expensive, steep learning curve | $$$ |\n| **Cloudwatch** | AWS-native monitoring | Native AWS integration, no setup | Limited outside AWS, basic features | $ |\n| **Azure Monitor** | Azure-native monitoring | Native Azure integration | Limited outside Azure | $ |\n| **Google Cloud Monitoring** | GCP-native monitoring | Native GCP integration, free tier | Limited outside GCP | $ - $ |\n\n### Prometheus Architecture\n\n```\n┌─────────────────────────────────────────────────────────────┐\n│ Prometheus Server │\n│ ┌────────────┐ ┌──────────────┐ ┌─────────────────────┐ │\n│ │ Retrieval │→ │ Time Series │→ │ HTTP Server (API) │ │\n│ │ (Scrape) │ │ Database │ │ │ │\n│ └─────┬──────┘ └──────────────┘ └──────────┬──────────┘ │\n│ │ │ │\n└────────┼───────────────────────────────────────┼─────────────┘\n │ │\n │ Pull metrics │ PromQL queries\n ↓ ↓\n┌──────────────────┐ ┌─────────────────┐\n│ Service Targets │ │ Grafana │\n│ ┌────────────┐ │ │ Dashboards │\n│ │ /metrics │ │ └─────────────────┘\n│ └────────────┘ │\n└──────────────────┘ ┌─────────────────┐\n │ Alertmanager │\n┌──────────────────┐ │ ┌───────────┐ │\n│ Service Discovery│ │ │ PagerDuty │ │\n│ - Kubernetes │ │ │ Slack │ │\n│ - Consul │ │ └───────────┘ │\n│ - DNS │ └─────────────────┘\n└──────────────────┘\n```\n\n### ELK Stack Architecture\n\n```\nApplication Servers\n├─ App 1 → Filebeat →\n├─ App 2 → Filebeat → ┌──────────────┐\n└─ App 3 → Filebeat → →→→ │ Logstash │\n 
│ (Aggregation │\nDocker Containers │ & Transform)│\n└─ Fluentd → → → → → → → →└──────┬───────┘\n │\nNetwork Devices ↓\n└─ Syslog → → → → → → → → ┌─────────────────┐\n │ Elasticsearch │\nCloud Services │ (Storage & │\n└─ CloudWatch Logs → → → → │ Indexing) │\n └────────┬────────┘\n ↓\n ┌─────────────────┐\n │ Kibana │\n │ (Visualization)│\n └─────────────────┘\n```\n\n## Implementation Examples\n\n### Complete Monitoring Stack (Docker Compose)\n\n```yaml\nversion: '3.8'\n\nservices:\n prometheus:\n image: prom/prometheus:latest\n ports:\n - \"9090:9090\"\n volumes:\n - ./prometheus.yml:/etc/prometheus/prometheus.yml\n - ./alerts.yml:/etc/prometheus/alerts.yml\n - prometheus-data:/prometheus\n command:\n - '--config.file=/etc/prometheus/prometheus.yml'\n - '--storage.tsdb.path=/prometheus'\n - '--storage.tsdb.retention.time=30d'\n restart: unless-stopped\n\n alertmanager:\n image: prom/alertmanager:latest\n ports:\n - \"9093:9093\"\n volumes:\n - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml\n command:\n - '--config.file=/etc/alertmanager/alertmanager.yml'\n restart: unless-stopped\n\n grafana:\n image: grafana/grafana:latest\n ports:\n - \"3000:3000\"\n environment:\n - GF_SECURITY_ADMIN_PASSWORD=admin\n - GF_USERS_ALLOW_SIGN_UP=false\n volumes:\n - grafana-data:/var/lib/grafana\n - ./grafana/provisioning:/etc/grafana/provisioning\n depends_on:\n - prometheus\n restart: unless-stopped\n\n node-exporter:\n image: prom/node-exporter:latest\n ports:\n - \"9100:9100\"\n command:\n - '--path.procfs=/host/proc'\n - '--path.sysfs=/host/sys'\n - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($|/)'\n volumes:\n - /proc:/host/proc:ro\n - /sys:/host/sys:ro\n - /:/rootfs:ro\n restart: unless-stopped\n\n cadvisor:\n image: gcr.io/cadvisor/cadvisor:latest\n ports:\n - \"8080:8080\"\n volumes:\n - /:/rootfs:ro\n - /var/run:/var/run:ro\n - /sys:/sys:ro\n - /var/lib/docker:/var/lib/docker:ro\n restart: unless-stopped\n\n loki:\n image: 
grafana/loki:latest\n ports:\n - \"3100:3100\"\n command: -config.file=/etc/loki/local-config.yaml\n restart: unless-stopped\n\n promtail:\n image: grafana/promtail:latest\n volumes:\n - /var/log:/var/log:ro\n - ./promtail-config.yml:/etc/promtail/config.yml\n command: -config.file=/etc/promtail/config.yml\n restart: unless-stopped\n\nvolumes:\n prometheus-data:\n grafana-data:\n```\n\n### Kubernetes Monitoring with Prometheus Operator\n\n```yaml\n# Install kube-prometheus-stack (Helm)\nhelm repo add prometheus-community https://prometheus-community.github.io/helm-charts\nhelm repo update\n\nhelm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \\\n --namespace monitoring \\\n --create-namespace \\\n --set prometheus.prometheusSpec.retention=30d \\\n --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi \\\n --set grafana.adminPassword=admin\n\n# ServiceMonitor for custom application\napiVersion: monitoring.coreos.com/v1\nkind: ServiceMonitor\nmetadata:\n name: api-service-monitor\n namespace: monitoring\n labels:\n release: kube-prometheus-stack\nspec:\n selector:\n matchLabels:\n app: api\n endpoints:\n - port: metrics\n interval: 30s\n path: /metrics\n\n# PrometheusRule for custom alerts\napiVersion: monitoring.coreos.com/v1\nkind: PrometheusRule\nmetadata:\n name: api-alerts\n namespace: monitoring\n labels:\n release: kube-prometheus-stack\nspec:\n groups:\n - name: api\n interval: 30s\n rules:\n - alert: ApiHighErrorRate\n expr: |\n sum(rate(http_requests_total{app=\"api\",status=~\"5..\"}[5m]))\n /\n sum(rate(http_requests_total{app=\"api\"}[5m]))\n > 0.05\n for: 5m\n labels:\n severity: critical\n annotations:\n summary: \"High error rate on API\"\n```\n\nThis comprehensive monitoring guide provides everything needed to implement robust observability for IT operations.\n","content_type":"text/markdown; 
charset=utf-8","language":"markdown","size":38260,"content_sha256":"445a7b0ef74871e53c43f62a1910ee7a04f245ff2a4d097aecbee522b24da6df"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"IT Operations Expert","type":"text"}]},{"type":"paragraph","content":[{"text":"A comprehensive skill for managing IT infrastructure operations, ensuring service reliability, implementing monitoring and alerting strategies, managing incidents, and maintaining operational excellence through automation and best practices.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Core Principles","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"1. Service Reliability First","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Proactive Monitoring","type":"text","marks":[{"type":"strong"}]},{"text":": Implement comprehensive observability before incidents occur","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Incident Management","type":"text","marks":[{"type":"strong"}]},{"text":": Structured response processes with clear escalation paths","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"SLA/SLO Management","type":"text","marks":[{"type":"strong"}]},{"text":": Define and maintain service level objectives aligned with business needs","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Continuous Improvement","type":"text","marks":[{"type":"strong"}]},{"text":": Learn from incidents through blameless post-mortems","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"2. 
Automation Over Manual Processes","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Infrastructure as Code","type":"text","marks":[{"type":"strong"}]},{"text":": Manage infrastructure configuration through version-controlled code","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Runbook Automation","type":"text","marks":[{"type":"strong"}]},{"text":": Convert manual procedures into automated workflows","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Self-Healing Systems","type":"text","marks":[{"type":"strong"}]},{"text":": Implement automated remediation for common issues","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Configuration Management","type":"text","marks":[{"type":"strong"}]},{"text":": Maintain consistency across environments","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"3. 
ITIL Service Management","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Service Strategy","type":"text","marks":[{"type":"strong"}]},{"text":": Align IT services with business objectives","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Service Design","type":"text","marks":[{"type":"strong"}]},{"text":": Design resilient, scalable services","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Service Transition","type":"text","marks":[{"type":"strong"}]},{"text":": Manage changes with minimal disruption","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Service Operation","type":"text","marks":[{"type":"strong"}]},{"text":": Deliver and support services effectively","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Continual Service Improvement","type":"text","marks":[{"type":"strong"}]},{"text":": Iteratively enhance service quality","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"4. 
Operational Excellence","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Documentation","type":"text","marks":[{"type":"strong"}]},{"text":": Maintain current runbooks, procedures, and architecture diagrams","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Knowledge Management","type":"text","marks":[{"type":"strong"}]},{"text":": Build searchable knowledge bases from incident resolutions","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Capacity Planning","type":"text","marks":[{"type":"strong"}]},{"text":": Forecast and provision resources proactively","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Cost Optimization","type":"text","marks":[{"type":"strong"}]},{"text":": Balance performance requirements with infrastructure costs","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Core Workflow","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Infrastructure Operations Workflow","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"1. MONITORING & OBSERVABILITY\n ├─ Define SLIs/SLOs/SLAs for critical services\n ├─ Implement metrics collection (infrastructure, application, business)\n ├─ Configure alerting with proper thresholds and escalation\n ├─ Build dashboards for different audiences (ops, devs, executives)\n └─ Establish on-call rotation and escalation procedures\n\n2. INCIDENT MANAGEMENT\n ├─ Receive alert or user report\n ├─ Assess severity and impact (P1/P2/P3/P4)\n ├─ Engage appropriate responders\n ├─ Investigate and diagnose root cause\n ├─ Implement fix or workaround\n ├─ Communicate status to stakeholders\n ├─ Document resolution in knowledge base\n └─ Conduct post-incident review\n\n3. 
CHANGE MANAGEMENT\n ├─ Submit change request with impact assessment\n ├─ Review and approve through CAB (Change Advisory Board)\n ├─ Schedule change window\n ├─ Execute change with rollback plan ready\n ├─ Validate success criteria\n ├─ Document actual vs planned results\n └─ Close change ticket\n\n4. CAPACITY PLANNING\n ├─ Collect resource utilization trends\n ├─ Analyze growth patterns\n ├─ Forecast future requirements\n ├─ Plan procurement or provisioning\n ├─ Execute capacity additions\n └─ Monitor effectiveness\n\n5. AUTOMATION & OPTIMIZATION\n ├─ Identify repetitive manual tasks\n ├─ Document current process\n ├─ Design automated solution\n ├─ Implement and test automation\n ├─ Deploy to production\n ├─ Measure time/cost savings\n └─ Iterate and improve","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Decision Frameworks","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Alert Configuration Decision Matrix","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Scenario","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Alert Type","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Threshold","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Response 
Time","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Escalation","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Service completely down","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Page","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Immediate","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\u003c 5 min","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Immediate to on-call","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Service degraded","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Page","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"2-3 failures","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\u003c 15 min","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"After 15 min to 
on-call","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"High resource usage","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Warning","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"> 80% sustained","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\u003c 1 hour","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"After 2 hours to team lead","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Approaching capacity","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Info","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"> 70% trend","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\u003c 24 hours","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Weekly capacity review","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Configuration 
drift","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Ticket","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Any deviation","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\u003c 7 days","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Monthly review","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Incident Severity Classification","type":"text"}]},{"type":"paragraph","content":[{"text":"Priority 1 (Critical)","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Complete service outage affecting all users","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Data loss or security breach","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Financial impact > $10K/hour","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Response: Immediate, 24/7, all hands on deck","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Priority 2 (High)","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Partial service outage affecting many users","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Significant performance degradation","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Financial impact 
$1K-$10K/hour","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Response: \u003c 30 minutes during business hours","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Priority 3 (Medium)","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Service degradation affecting some users","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Non-critical functionality impaired","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Workaround available","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Response: \u003c 4 hours during business hours","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Priority 4 (Low)","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Minor issues with minimal impact","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Cosmetic problems","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Enhancement requests","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Response: Next business day","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Change Management Risk Assessment","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"Risk Level = Impact × Likelihood × Complexity\n\nImpact (1-5):\n1 = Single user\n2 = Team\n3 = Department\n4 = Company-wide\n5 = Customer-facing\n\nLikelihood of Issues (1-5):\n1 = Routine, tested\n2 = Familiar, documented\n3 = Some uncertainty\n4 = New territory\n5 = Never done before\n\nComplexity (1-5):\n1 = Single component\n2 = Few components\n3 = Multiple systems\n4 = 
Cross-platform\n5 = Enterprise-wide\n\nRisk Score Interpretation:\n1-20: Standard change (pre-approved)\n21-50: Normal change (CAB review)\n51-75: High-risk change (extensive testing, senior approval)\n76-125: Emergency change only (executive approval)","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Monitoring Tool Selection","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Requirement","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Prometheus + Grafana","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Datadog","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"New Relic","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"ELK Stack","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Splunk","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Cost","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Free 
(self-hosted)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"$$","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"$$","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Free-$","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"$$$","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Metrics","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Good","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Good","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Logs","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Via 
Loki","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Traces","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Via Tempo","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Limited","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Good","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Learning 
Curve","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Steep","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Moderate","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Moderate","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Steep","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Steep","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Cloud-Native","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Good","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Good","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"On-Premises","type":"text"}]}]},{"type":"td","attrs":{"colspan":1
,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Good","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Good","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"APM","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Via exporters","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Limited","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Good","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Common Operational Challenges","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Challenge 1: Alert 
Fatigue","type":"text"}]},{"type":"paragraph","content":[{"text":"Problem","type":"text","marks":[{"type":"strong"}]},{"text":": Too many false positive alerts causing team burnout","type":"text"}]},{"type":"paragraph","content":[{"text":"Solution","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"yaml"},"content":[{"text":"Alert Tuning Process:\n1. Measure baseline alert volume and false positive rate\n2. Categorize alerts by actionability:\n - Actionable + Urgent = Keep as page\n - Actionable + Not Urgent = Ticket\n - Not Actionable = Remove or convert to dashboard metric\n3. Implement alert aggregation (group similar alerts)\n4. Add context to alerts (runbook links, relevant metrics)\n5. Regular review meetings (weekly) to tune thresholds\n6. Track metrics:\n - MTTA (Mean Time to Acknowledge): \u003c 5 min target\n - False Positive Rate: \u003c 20% target\n - Alert Volume per Week: Trending down","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Challenge 2: Incident Documentation During Crisis","type":"text"}]},{"type":"paragraph","content":[{"text":"Problem","type":"text","marks":[{"type":"strong"}]},{"text":": Teams skip documentation during high-pressure incidents","type":"text"}]},{"type":"paragraph","content":[{"text":"Solution","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Assign dedicated scribe role (not the incident commander)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Use incident management tools (PagerDuty, Opsgenie) with automatic timeline","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Template-based incident reports with required 
fields","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Post-incident review scheduled automatically (within 48 hours)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Gamify documentation (track and recognize thorough documentation)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Challenge 3: Knowledge Silos","type":"text"}]},{"type":"paragraph","content":[{"text":"Problem","type":"text","marks":[{"type":"strong"}]},{"text":": Critical knowledge trapped in individual team members' heads","type":"text"}]},{"type":"paragraph","content":[{"text":"Solution","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"yaml"},"content":[{"text":"Knowledge Transfer Strategy:\n- Pair Programming/Shadowing: 20% of sprint capacity\n- Runbook Requirements: Every system must have runbook\n- Lunch & Learn Sessions: Weekly 30-min knowledge sharing\n- Cross-Training Matrix: Track who knows what, identify gaps\n- On-Call Rotation: Everyone rotates to spread knowledge\n- Post-Incident Reviews: Mandatory team sharing\n- Documentation Sprints: Quarterly focus on doc completion","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Challenge 4: Balancing Stability vs Innovation","type":"text"}]},{"type":"paragraph","content":[{"text":"Problem","type":"text","marks":[{"type":"strong"}]},{"text":": Operations team resists change to maintain stability","type":"text"}]},{"type":"paragraph","content":[{"text":"Solution","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Implement change windows (planned maintenance periods)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Use blue-green or canary deployments for lower 
risk","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Establish \"innovation time\" (Google 20% time model)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Create sandbox environments for experimentation","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Measure and reward both stability AND improvement metrics","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Include \"toil reduction\" as OKR target","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Key Metrics & KPIs","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Service Reliability Metrics","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"yaml"},"content":[{"text":"Availability:\n Formula: (Total Time - Downtime) / Total Time × 100\n Target: 99.9% (43.8 min/month downtime)\n Measurement: Per service, monthly\n\nMTTR (Mean Time to Recovery):\n Formula: Sum of recovery times / Number of incidents\n Target: \u003c 30 minutes for P1, \u003c 4 hours for P2\n Measurement: Per severity level, monthly\n\nMTBF (Mean Time Between Failures):\n Formula: Total operational time / Number of failures\n Target: > 720 hours (30 days)\n Measurement: Per service, quarterly\n\nMTTA (Mean Time to Acknowledge):\n Formula: Sum of acknowledgment times / Number of alerts\n Target: \u003c 5 minutes for pages\n Measurement: Per on-call engineer, weekly\n\nChange Success Rate:\n Formula: Successful changes / Total changes × 100\n Target: > 95%\n Measurement: Monthly\n\nIncident Recurrence Rate:\n Formula: Repeat incidents / Total incidents × 100\n Target: \u003c 10%\n Measurement: Quarterly (same root cause within 90 days)","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Operational Efficiency 
Metrics","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"yaml"},"content":[{"text":"Toil Percentage:\n Definition: Time spent on manual, repetitive tasks\n Target: \u003c 30% of team capacity\n Measurement: Weekly time tracking\n\nAutomation Coverage:\n Formula: Automated tasks / Total repetitive tasks × 100\n Target: > 70%\n Measurement: Quarterly audit\n\nOn-Call Load:\n Formula: Alerts per on-call shift\n Target: \u003c 5 actionable alerts per shift\n Measurement: Per engineer, weekly\n\nRunbook Coverage:\n Formula: Services with runbooks / Total services × 100\n Target: 100%\n Measurement: Monthly audit\n\nKnowledge Base Utilization:\n Formula: Incidents resolved via KB / Total incidents × 100\n Target: > 40%\n Measurement: Monthly","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Integration Points","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"With Development Teams","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Participate in design reviews for operational requirements","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Provide deployment automation and CI/CD pipeline support","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Share monitoring and logging requirements","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Collaborate on incident response and post-mortems","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Joint ownership of SLOs and error budgets","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"With Security Teams","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Implement security monitoring and 
alerting","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Manage access controls and authentication systems","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Coordinate vulnerability patching and remediation","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Conduct security incident response","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Maintain compliance with security policies","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"With Business Stakeholders","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Report on service availability and performance","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Communicate planned maintenance windows","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Provide capacity planning forecasts","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Translate technical metrics to business impact","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Participate in business continuity planning","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Best Practices","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"1. 
Blameless Post-Mortems","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"markdown"},"content":[{"text":"Post-Incident Review Template:\n- Incident Summary (what happened, when, impact)\n- Timeline of Events (detailed chronology)\n- Root Cause Analysis (5 Whys or Fishbone)\n- What Went Well (strengths during response)\n- What Could Be Improved (opportunities)\n- Action Items (with owners and due dates)\n- Lessons Learned (shareable insights)\n\nRules:\n- No blame or punishment\n- Focus on systems and processes, not people\n- Everyone can speak freely\n- Action items must be tracked to completion","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"2. Runbook Standards","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"yaml"},"content":[{"text":"Runbook Contents:\n - Service Overview: Purpose, dependencies, architecture\n - SLIs/SLOs/SLAs: Defined thresholds and targets\n - Common Issues: Symptoms, causes, solutions\n - Troubleshooting Steps: Step-by-step procedures\n - Escalation Paths: Who to contact and when\n - Useful Commands: Copy-paste ready commands\n - Dashboard Links: Direct links to relevant dashboards\n - Recent Changes: Link to change log\n - Contact Information: Team, product owner, SMEs\n\nMaintenance:\n - Review quarterly or after major incidents\n - Test procedures during low-traffic periods\n - Update after every significant change\n - Track usage metrics (page views, helpfulness ratings)","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"3. 
On-Call Best Practices","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"yaml"},"content":[{"text":"On-Call Preparation:\n - Laptop with VPN access\n - Mobile device with notification apps\n - Contact list (escalation paths)\n - Access to all critical systems\n - Runbooks bookmarked\n - Backup on-call identified\n\nDuring On-Call:\n - Acknowledge alerts within 5 minutes\n - Update incident status regularly\n - Follow escalation procedures\n - Document all actions in incident ticket\n - Handoff clearly to next on-call\n\nPost On-Call:\n - Complete incident reports\n - Submit toil reduction tickets\n - Provide feedback on runbooks\n - Update on-call documentation","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"4. Change Management Discipline","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"yaml"},"content":[{"text":"Standard Change Process:\n 1. Create change request (RFC)\n 2. Document:\n - What: Specific changes being made\n - Why: Business justification\n - When: Proposed date/time\n - Who: Change implementer and approver\n - How: Step-by-step procedure\n - Risk: Assessment and mitigation\n - Rollback: Detailed rollback plan\n - Testing: Validation steps\n 3. Submit for CAB review (7 days advance notice)\n 4. Implement during approved window\n 5. Validate success criteria\n 6. Close change with actual results\n 7. 
Post-implementation review if issues occurred\n\nEmergency Change Process:\n - Executive approval required\n - Implement with heightened monitoring\n - Full team notification\n - Complete documentation within 24 hours\n - Mandatory post-change review","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Reference Files","type":"text"}]},{"type":"paragraph","content":[{"text":"For detailed technical guidance, see:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"reference/monitoring.md","type":"text","marks":[{"type":"link","attrs":{"href":"reference/monitoring.md","title":null}}]},{"text":" - Observability, metrics, alerting, and dashboard design","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"reference/incident-management.md","type":"text","marks":[{"type":"link","attrs":{"href":"reference/incident-management.md","title":null}}]},{"text":" - Incident response, root cause analysis, post-mortems","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"reference/infrastructure.md","type":"text","marks":[{"type":"link","attrs":{"href":"reference/infrastructure.md","title":null}}]},{"text":" - Server management, network operations, capacity planning","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"reference/automation.md","type":"text","marks":[{"type":"link","attrs":{"href":"reference/automation.md","title":null}}]},{"text":" - Scripting, configuration management, orchestration tools","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"reference/backup-recovery.md","type":"text","marks":[{"type":"link","attrs":{"href":"reference/backup-recovery.md","title":null}}]},{"text":" - Backup strategies, disaster recovery, business continuity","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Getting 
Started","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"For New Infrastructure","type":"text","marks":[{"type":"strong"}]},{"text":": Start with ","type":"text"},{"text":"reference/infrastructure.md","type":"text","marks":[{"type":"link","attrs":{"href":"reference/infrastructure.md","title":null}}]},{"text":" for setup guidance","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"For Monitoring Setup","type":"text","marks":[{"type":"strong"}]},{"text":": Review ","type":"text"},{"text":"reference/monitoring.md","type":"text","marks":[{"type":"link","attrs":{"href":"reference/monitoring.md","title":null}}]},{"text":" for observability strategy","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"For Incident Response","type":"text","marks":[{"type":"strong"}]},{"text":": See ","type":"text"},{"text":"reference/incident-management.md","type":"text","marks":[{"type":"link","attrs":{"href":"reference/incident-management.md","title":null}}]},{"text":" for procedures","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"For Automation Projects","type":"text","marks":[{"type":"strong"}]},{"text":": Check ","type":"text"},{"text":"reference/automation.md","type":"text","marks":[{"type":"link","attrs":{"href":"reference/automation.md","title":null}}]},{"text":" for tooling recommendations","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"For DR Planning","type":"text","marks":[{"type":"strong"}]},{"text":": Consult ","type":"text"},{"text":"reference/backup-recovery.md","type":"text","marks":[{"type":"link","attrs":{"href":"reference/backup-recovery.md","title":null}}]},{"text":" for recovery 
strategies","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}}]},"metadata":{"date":"2026-06-05","name":"it-operations","author":"@skillopedia","source":{"stars":27714,"repo_name":"claude-code-templates","origin_url":"https://github.com/davila7/claude-code-templates/blob/HEAD/cli-tool/components/skills/development/it-operations/SKILL.md","repo_owner":"davila7","body_sha256":"46b7a2efd38ff2192314ea86dfcfabd8554a56ea5e4edbe381c7dd4f823c35f9","cluster_key":"339154de4207d9c21f60f23f7779097280753aa75603c1171de9fc1a176681dd","clean_bundle":{"format":"clean-skill-bundle-v1","source":"davila7/claude-code-templates/cli-tool/components/skills/development/it-operations/SKILL.md","attachments":[{"id":"f6fbd0c4-b54d-50f3-a8fa-eac53e80164a","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/f6fbd0c4-b54d-50f3-a8fa-eac53e80164a/attachment.md","path":"README.md","size":9738,"sha256":"f6c6b9674e1ee9997b9a52a21c3de2273477432ee0059831f71a508c05c9b746","contentType":"text/markdown; charset=utf-8"},{"id":"658ef54f-f5e7-5132-883c-e009035e9f94","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/658ef54f-f5e7-5132-883c-e009035e9f94/attachment.md","path":"reference/automation.md","size":40343,"sha256":"4903c9d1a65ecfd96ff0708a8bda5322d49440d20683756719d5198659992def","contentType":"text/markdown; charset=utf-8"},{"id":"a84bb6b4-b030-53bf-9c42-7af44719b386","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/a84bb6b4-b030-53bf-9c42-7af44719b386/attachment.md","path":"reference/backup-recovery.md","size":26233,"sha256":"577f1644cf072b742ed35c5a46ef21feb320b1bbbfabd86075a67fa8d1db91c6","contentType":"text/markdown; charset=utf-8"},{"id":"26f146c2-1245-5208-a924-ecb74ac07212","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/26f146c2-1245-5208-a924-ecb74ac07212/attachment.md","path":"reference/incident-management.md","size":46805,"sha256":"9851e1c7159bf6634bb0c3dd264b0d18e6a6a90ebb02b82814ff3a80401885d7","contentType":"text/markdown; 
charset=utf-8"},{"id":"8dc50bc3-e2df-5d71-8874-cb242144f481","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/8dc50bc3-e2df-5d71-8874-cb242144f481/attachment.md","path":"reference/infrastructure.md","size":45301,"sha256":"aa1ed351f5eb238b83c4eb26f28b4e84b90876f7840de34ac76e302ec1f8dd58","contentType":"text/markdown; charset=utf-8"},{"id":"cc0bb19d-de06-502a-9950-680914a79b1d","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/cc0bb19d-de06-502a-9950-680914a79b1d/attachment.md","path":"reference/monitoring.md","size":38260,"sha256":"445a7b0ef74871e53c43f62a1910ee7a04f245ff2a4d097aecbee522b24da6df","contentType":"text/markdown; charset=utf-8"}],"bundle_sha256":"715f191d64e1566c345345dce24aadc59942125fe138f901e7f595bd19f954f6","attachment_count":6,"text_attachments":6,"attachment_storage":"skillopedia-attachments-v1","binary_attachments":0,"excluded_attachments":[]},"cluster_size":2,"skill_md_path":"cli-tool/components/skills/development/it-operations/SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"devops-infrastructure","category_label":"DevOps"},"exact_dupes_collapsed_into_this":1},"version":"v1","category":"devops-infrastructure","import_tag":"clean-skills-v1","description":"Manages IT infrastructure, monitoring, incident response, and service reliability. Provides frameworks for ITIL service management, observability strategies, automation, backup/recovery, capacity planning, and operational excellence practices."}},"renderedAt":1782987252567}

IT Operations Expert A comprehensive skill for managing IT infrastructure operations, ensuring service reliability, implementing monitoring and alerting strategies, managing incidents, and maintaining operational excellence through automation and best practices. Core Principles 1. Service Reliability First - Proactive Monitoring : Implement comprehensive observability before incidents occur - Incident Management : Structured response processes with clear escalation paths - SLA/SLO Management : Define and maintain service level objectives aligned with business needs - Continuous Improvement :…