Kubernetes Operator Build operators that reconcile correctly. Most operator bugs are not Kubernetes bugs — they are reconcile-loop bugs: missing finalizers, blocking calls, no requeue on transient errors, status drift, RBAC over-grants. This skill catches them deterministically before they reach a cluster. When to use - Building a new Kubernetes Operator (controller for a CRD) - Reviewing an existing operator for capability-level gaps - Auditing a CRD spec for status/conditions/finalizer correctness - Choosing a framework (controller-runtime / kubebuilder / operator-sdk / metacontroller / KOP…

\n description: Semver version of the application\n replicas:\n type: integer\n minimum: 1\n maximum: 100\n default: 3\n description: Number of replicas to run\n status:\n type: object\n properties:\n phase:\n type: string\n enum: [Pending, Running, Failed]\n observedGeneration:\n type: integer\n description: Spec generation last reconciled\n conditions:\n type: array\n items:\n type: object\n required: [type, status, lastTransitionTime]\n properties:\n type: { type: string }\n status: { type: string, enum: [\"True\", \"False\", \"Unknown\"] }\n reason: { type: string }\n message: { type: string }\n lastTransitionTime: { type: string, format: date-time }\n observedGeneration: { type: integer }\n subresources:\n status: {} # CRITICAL — enables /status subresource\n additionalPrinterColumns:\n - name: Phase\n type: string\n jsonPath: .status.phase\n - name: Ready\n type: string\n jsonPath: .status.conditions[?(@.type==\"Ready\")].status\n - name: Age\n type: date\n jsonPath: .metadata.creationTimestamp\n","content_type":"application/yaml; charset=utf-8","language":"yaml","size":2620,"content_sha256":"3afaafc885780b30210032d537629f01b0f7aeebdd734912895a52fcd6aede19"},{"filename":"assets/reconcile_skeleton.go","content":"// Reconcile skeleton — passes reconcile_lint.py.\n// Replace \u003cPLACEHOLDER> markers; rename receiver + types to match your CR.\npackage controllers\n\nimport (\n\t\"context\"\n\t\"errors\"\n\t\"time\"\n\n\tapierrors \"k8s.io/apimachinery/pkg/api/errors\"\n\t\"k8s.io/apimachinery/pkg/api/meta\"\n\tmetav1 \"k8s.io/apimachinery/pkg/apis/meta/v1\"\n\tctrl \"sigs.k8s.io/controller-runtime\"\n\t\"sigs.k8s.io/controller-runtime/pkg/client\"\n\t\"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil\"\n\t\"sigs.k8s.io/controller-runtime/pkg/log\"\n\t\"sigs.k8s.io/controller-runtime/pkg/predicate\"\n\n\tappsv1alpha1 \"\u003cMODULE>/api/v1alpha1\"\n)\n\nconst finalizerName = \"\u003cgroup>/finalizer\"\n\ntype MyAppReconciler struct 
{\n\tclient.Client\n\tScheme *runtime.Scheme\n}\n\nfunc (r *MyAppReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {\n\tlogger := log.FromContext(ctx).WithValues(\"myapp\", req.NamespacedName)\n\n\tvar cr appsv1alpha1.MyApp\n\tif err := r.Get(ctx, req.NamespacedName, &cr); err != nil {\n\t\tif apierrors.IsNotFound(err) {\n\t\t\treturn ctrl.Result{}, nil\n\t\t}\n\t\treturn ctrl.Result{}, err\n\t}\n\n\tif !cr.DeletionTimestamp.IsZero() {\n\t\treturn r.reconcileDelete(ctx, &cr)\n\t}\n\n\tif !controllerutil.ContainsFinalizer(&cr, finalizerName) {\n\t\tcontrollerutil.AddFinalizer(&cr, finalizerName)\n\t\treturn ctrl.Result{}, r.Update(ctx, &cr)\n\t}\n\n\tmeta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{\n\t\tType: \"Reconciling\",\n\t\tStatus: metav1.ConditionTrue,\n\t\tReason: \"InProgress\",\n\t\tMessage: \"Converging to desired state\",\n\t\tObservedGeneration: cr.Generation,\n\t})\n\n\tres, recErr := r.reconcileNormal(ctx, &cr)\n\n\tif recErr == nil {\n\t\tmeta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{\n\t\t\tType: \"Ready\", Status: metav1.ConditionTrue,\n\t\t\tReason: \"AllReady\", Message: \"all components healthy\",\n\t\t\tObservedGeneration: cr.Generation,\n\t\t})\n\t} else {\n\t\tmeta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{\n\t\t\tType: \"Ready\", Status: metav1.ConditionFalse,\n\t\t\tReason: \"ReconcileError\", Message: recErr.Error(),\n\t\t\tObservedGeneration: cr.Generation,\n\t\t})\n\t}\n\n\tcr.Status.ObservedGeneration = cr.Generation\n\n\tif statusErr := r.Status().Update(ctx, &cr); statusErr != nil {\n\t\tlogger.Error(statusErr, \"failed to update status\")\n\t\treturn res, errors.Join(recErr, statusErr)\n\t}\n\treturn res, recErr\n}\n\nfunc (r *MyAppReconciler) reconcileNormal(ctx context.Context, cr *appsv1alpha1.MyApp) (ctrl.Result, error) {\n\t// Idempotent: read desired, build child, CreateOrUpdate.\n\tdeployment := &appsv1.Deployment{ObjectMeta: 
metav1.ObjectMeta{Name: cr.Name, Namespace: cr.Namespace}}\n\top, err := controllerutil.CreateOrUpdate(ctx, r.Client, deployment, func() error {\n\t\tdeployment.Spec.Replicas = &cr.Spec.Replicas\n\t\t// Build container spec from cr.Spec — extracted helper for clarity\n\t\t// deployment.Spec.Template.Spec.Containers = buildContainers(&cr.Spec)\n\t\treturn controllerutil.SetControllerReference(cr, deployment, r.Scheme)\n\t})\n\tif err != nil {\n\t\treturn ctrl.Result{}, err\n\t}\n\tlog.FromContext(ctx).Info(\"deployment\", \"operation\", op)\n\n\t// Periodic resync — keeps status fresh even when nothing changes.\n\treturn ctrl.Result{RequeueAfter: 5 * time.Minute}, nil\n}\n\nfunc (r *MyAppReconciler) reconcileDelete(ctx context.Context, cr *appsv1alpha1.MyApp) (ctrl.Result, error) {\n\tif !controllerutil.ContainsFinalizer(cr, finalizerName) {\n\t\treturn ctrl.Result{}, nil\n\t}\n\tif err := r.deleteExternalResources(ctx, cr); err != nil {\n\t\treturn ctrl.Result{RequeueAfter: 30 * time.Second}, err\n\t}\n\tcontrollerutil.RemoveFinalizer(cr, finalizerName)\n\treturn ctrl.Result{}, r.Update(ctx, cr)\n}\n\nfunc (r *MyAppReconciler) deleteExternalResources(ctx context.Context, cr *appsv1alpha1.MyApp) error {\n\t// Implement teardown of external state (cloud DB, S3 bucket, DNS record, ...)\n\treturn nil\n}\n\nfunc (r *MyAppReconciler) SetupWithManager(mgr ctrl.Manager) error {\n\treturn ctrl.NewControllerManagedBy(mgr).\n\t\tFor(&appsv1alpha1.MyApp{}).\n\t\tOwns(&appsv1.Deployment{}).\n\t\tWithEventFilter(predicate.GenerationChangedPredicate{}).\n\t\tComplete(r)\n}\n","content_type":"text/plain; charset=utf-8","language":"go","size":4052,"content_sha256":"ff5f3d75b77d7f4053be74182acf0fe2482f5c42c4c726d28754b2e719733ea2"},{"filename":"references/crd_design.md","content":"# CRD design\n\nCustom Resource Definitions (CRDs) define the API surface of your operator. 
A bad CRD design locks you into hard-to-evolve schemas, forces wrapper APIs, and creates user-facing UX problems via `kubectl`.\n\n## Anatomy of a production CRD\n\n```yaml\napiVersion: apiextensions.k8s.io/v1\nkind: CustomResourceDefinition\nmetadata:\n name: myapps.apps.example.com # plural.group\nspec:\n group: apps.example.com\n names:\n kind: MyApp # PascalCase\n plural: myapps # lowercase\n singular: myapp # lowercase\n listKind: MyAppList # KindList\n shortNames: [ma] # optional\n scope: Namespaced # or Cluster (justify)\n versions:\n - name: v1alpha1\n served: true\n storage: true\n schema:\n openAPIV3Schema:\n type: object\n properties:\n spec:\n type: object\n required: [version]\n properties:\n version:\n type: string\n pattern: '^[0-9]+\\.[0-9]+\\.[0-9]+

Kubernetes Operator Build operators that reconcile correctly. Most operator bugs are not Kubernetes bugs — they are reconcile-loop bugs: missing finalizers, blocking calls, no requeue on transient errors, status drift, RBAC over-grants. This skill catches them deterministically before they reach a cluster. When to use - Building a new Kubernetes Operator (controller for a CRD) - Reviewing an existing operator for capability-level gaps - Auditing a CRD spec for status/conditions/finalizer correctness - Choosing a framework (controller-runtime / kubebuilder / operator-sdk / metacontroller / KOP…

\n replicas:\n type: integer\n minimum: 1\n maximum: 100\n default: 3\n status:\n type: object\n properties:\n phase:\n type: string\n enum: [Pending, Running, Failed]\n conditions:\n type: array\n items:\n type: object\n required: [type, status, lastTransitionTime]\n properties:\n type: { type: string }\n status: { type: string, enum: [\"True\", \"False\", \"Unknown\"] }\n reason: { type: string }\n message: { type: string }\n lastTransitionTime: { type: string, format: date-time }\n observedGeneration: { type: integer }\n subresources:\n status: {} # CRITICAL — see below\n scale: # if scaling is meaningful\n specReplicasPath: .spec.replicas\n statusReplicasPath: .status.readyReplicas\n additionalPrinterColumns:\n - name: Phase\n type: string\n jsonPath: .status.phase\n - name: Ready\n type: string\n jsonPath: .status.conditions[?(@.type==\"Ready\")].status\n - name: Age\n type: date\n jsonPath: .metadata.creationTimestamp\n```\n\n## Required structural elements\n\n### 1. Status subresource — `subresources.status: {}`\n\nWithout it:\n- `r.Status().Update(ctx, obj)` fails with a 404 (NotFound) — the `/status` endpoint does not exist, so status can only be written with `r.Update`\n- Status updates re-trigger spec reconcile → loop\n- RBAC can't be split between spec writers and status writers\n\n**Always declare it.**\n\n### 2. Conditions array\n\nUse the standard `metav1.Condition` shape. Required fields: `type`, `status`, `lastTransitionTime`. Recommended: `reason`, `message`, `observedGeneration`.\n\nConventional condition types:\n- `Ready` — overall readiness\n- `Reconciling` — controller is actively working\n- `Degraded` — operating but with reduced capability\n- `Progressing` — change in progress (mostly for Deployments-style flows)\n\nUse `meta.SetStatusCondition()` from `k8s.io/apimachinery/pkg/api/meta` — don't write to the slice directly.\n\n### 3. 
observedGeneration\n\nTrack which spec generation the controller has acted on:\n\n```go\nstatus.ObservedGeneration = obj.Generation\n```\n\nLets users tell whether status reflects the latest spec or a previous one.\n\n### 4. Printer columns\n\n`kubectl get myapp` UX is determined by `additionalPrinterColumns`. Always include:\n- `Phase` or `Ready` (status)\n- `Age` (so users know when it was created)\n\nOptionally: replicas, version, key spec field.\n\n### 5. Validation in the schema, not the controller\n\nExpress constraints declaratively:\n\n| Constraint | OpenAPI |\n|---|---|\n| Range | `minimum`/`maximum` |\n| String pattern | `pattern: '^...

Kubernetes Operator Build operators that reconcile correctly. Most operator bugs are not Kubernetes bugs — they are reconcile-loop bugs: missing finalizers, blocking calls, no requeue on transient errors, status drift, RBAC over-grants. This skill catches them deterministically before they reach a cluster. When to use - Building a new Kubernetes Operator (controller for a CRD) - Reviewing an existing operator for capability-level gaps - Auditing a CRD spec for status/conditions/finalizer correctness - Choosing a framework (controller-runtime / kubebuilder / operator-sdk / metacontroller / KOP…

` |\n| Enum | `enum: [Pending, Running]` |\n| Required field | `required: [...]` |\n| Default value | `default: 3` |\n| Min/max length | `minLength`/`maxLength` |\n\nReserve controller validation for cross-field rules and external dependencies (e.g., \"this name is taken in our DB\").\n\n### 6. Avoid `x-kubernetes-preserve-unknown-fields: true`\n\nIt disables structural validation. Sometimes needed (e.g., raw `kubectl apply` patches), but never at the spec root. Use it sparingly on a single sub-tree.\n\n## Versioning strategy\n\nCRDs evolve. Plan from day 1:\n\n| Stage | Version | Stability | Allowed changes |\n|---|---|---|---|\n| Internal preview | `v1alpha1` | None | Anything; document breaking changes |\n| Beta | `v1beta1` | Some | Additive only; deprecate fields |\n| GA | `v1` | Strong | Additive only; never remove fields |\n\nConversion webhook required when:\n- Multiple versions are served simultaneously\n- A field's shape changed between versions\n\nWhen every served version has an identical schema, `spec.conversion.strategy: None` is enough — it only rewrites `apiVersion`, so even a simple field rename needs a webhook.\n\n## Scope: Namespaced vs Cluster\n\nDefault to **Namespaced**. Cluster-scoped CRDs:\n- Can't be RBAC-restricted by namespace\n- Can't have `OwnerReferences` from namespaced parents\n- Are appropriate only for cluster-wide resources (`StorageClass`-like things)\n\nIf your operator manages namespace-bound things (apps, databases, queues), use Namespaced.\n\n## Naming\n\n- **Group**: `\u003csubdomain>.\u003cdomain>` — e.g., `apps.example.com`. Don't use generic groups (`com`, `io`).\n- **Kind**: PascalCase, singular, descriptive — `MyApp`, `Database`, `Cache`. 
Avoid `MyAppResource` (the `Resource` suffix is implicit).\n- **Plural**: lowercase, plural — `myapps`, `databases`, `caches`.\n- **Short name**: 2-3 letters; check for conflicts with built-in resources.\n\n## Validation tooling\n\n- `kubectl apply --dry-run=server` — validates against your CRD\n- `kubectl explain \u003ckind>.\u003cfield>` — shows what your schema documents\n- `crd_validator.py` — this skill's tool, structural rules\n\n## Documentation in the schema\n\nUse the `description` field on every property. `kubectl explain` reads it:\n\n```yaml\nproperties:\n replicas:\n type: integer\n minimum: 1\n description: |\n Number of replicas to run. Production deployments should use ≥3.\n Increases above 100 require quota approval.\n```\n\n## Anti-patterns\n\n- **Top-level `x-kubernetes-preserve-unknown-fields: true`** — defeats validation\n- **No `scope:` declared** — `apiextensions.k8s.io/v1` requires it (only the removed `v1beta1` API defaulted to Namespaced); always state it explicitly\n- **No printer columns** — `kubectl get` shows only `NAME AGE`\n- **Conditions written by hand** (not via `SetStatusCondition`) — easy to lose `lastTransitionTime`\n- **Status fields that duplicate spec** — keep them separate\n- **Using `metadata.annotations` to encode operator state** — use status fields\n- **Single huge CRD with 50+ fields** — split into multiple CRDs (e.g., MyApp + MyAppBackup + MyAppRestore)\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":6998,"content_sha256":"970db86bfe5c27f71f673d1acb5ac973534c44c99944a8721b0a95645fb99a31"},{"filename":"references/operator_pattern.md","content":"# The operator pattern\n\nAn operator is a controller that reconciles a Custom Resource (CR) toward its declared spec. 
It encodes operational knowledge — installation, upgrades, backups, failover — that would otherwise live in tribal knowledge or runbooks.\n\n## When you need an operator\n\nBuild an operator when:\n- The application has nontrivial **lifecycle operations** (backup, restore, version upgrade, failover) that go beyond a simple Deployment\n- The application has **statefulness or topology** that Helm/Deployment can't express (leader election, peer discovery, rolling state migration)\n- Multiple teams need to provision instances of the application via **a Kubernetes API**, not a custom UI\n- The application's operational discipline is documented in runbooks but unevenly applied\n\nDon't build an operator when:\n- A **Helm chart** is enough (most stateless apps fit here)\n- A **CronJob** can run the operational task on a schedule\n- The custom logic is a **one-time migration** (use a Job)\n- Three engineers can manage it via Deployment + ConfigMap\n\n## Operator pattern shape\n\n```\n┌────────────────────────────────────────────────────────┐\n│ apiVersion: apps.example.com/v1alpha1 │\n│ kind: MyApp ← Custom Resource │\n│ spec: │\n│ replicas: 3 ← user's intent │\n│ version: 1.4.2 │\n│ status: │\n│ conditions: ← controller's view │\n│ - type: Ready │\n│ status: \"True\" │\n│ phase: Running │\n└────────────────────────────────────────────────────────┘\n ↑\n │ owns\n │\n┌────────────────────────────────────────────────────────┐\n│ controller.Reconcile(ctx, req) ⟶ ctrl.Result, error │\n│ 1. read CR (the spec) from the cache │\n│ 2. read actual state (Pods, Services, ConfigMaps) │\n│ 3. diff actual against desired │\n│ 4. act idempotently to converge │\n│ 5. update status with observed state │\n│ 6. 
return RequeueAfter or done │\n└────────────────────────────────────────────────────────┘\n```\n\nReconcile runs whenever:\n- The CR changes\n- A child resource changes\n- A periodic resync fires (default 10h, configurable)\n- An explicit requeue from a previous run\n\n## Spec vs status — the cardinal split\n\n| spec | status |\n|---|---|\n| Authored by the user | Authored by the controller |\n| Mutable through `kubectl edit` | Mutable only via the status subresource |\n| Captures *intent* | Captures *observed reality* |\n| Triggers reconcile | Does NOT trigger reconcile (when subresource is enabled) |\n\nViolating the split is the #1 cause of operator bugs:\n- Mutating spec from the controller → user changes get overwritten\n- Updating status without the subresource → status update triggers spec reconcile → loop\n\n## Reconcile must be idempotent\n\nReconcile is called repeatedly for the same state. The function must:\n\n- Produce the same outcome regardless of call count\n- Use `Create-or-Update` patterns (`controllerutil.CreateOrUpdate`)\n- Compare current state to desired before writing\n- Never assume \"this is the first time we've seen this resource\"\n\nIdempotence test: if reconcile is called 100 times in a row with the same spec and no external change, the system must converge after the first call and do nothing on the next 99.\n\n## OwnerReferences and cascading deletion\n\nEvery child resource the operator creates must have its `OwnerReferences` set to the parent CR. Then:\n- Deleting the CR deletes children automatically\n- The garbage collector handles orphan cleanup\n- The operator doesn't need explicit teardown logic for owned resources\n\nExternal resources (cloud DBs, S3 buckets, DNS records) don't have OwnerReferences. Use **finalizers** to clean them up.\n\n## Finalizers\n\nA finalizer blocks deletion until the controller has cleaned up external state.\n\n```\n1. User: kubectl delete myapp foo\n2. 
API server: sets metadata.deletionTimestamp; does NOT delete\n3. Controller: sees deletionTimestamp; does cleanup; removes finalizer\n4. API server: deletion now proceeds\n```\n\nWithout a finalizer, external resources orphan. With one, the controller has a guaranteed hook to run cleanup before the CR disappears.\n\n## Conditions\n\nThe standard pattern for status reporting:\n\n```yaml\nstatus:\n conditions:\n - type: Ready # type values are operator-defined\n status: \"True\" # True | False | Unknown\n reason: \"AllReady\" # PascalCase, programmatic\n message: \"All replicas ready\" # human-readable\n lastTransitionTime: \"2026-05-08T12:00:00Z\"\n - type: Reconciling\n status: \"False\"\n reason: \"Idle\"\n lastTransitionTime: \"2026-05-08T12:00:00Z\"\n```\n\nUse `metav1.Condition` (`k8s.io/apimachinery/pkg/apis/meta/v1`) and `meta.SetStatusCondition` (`k8s.io/apimachinery/pkg/api/meta`) — don't roll your own.\n\n## Webhooks\n\nTwo types:\n\n- **ValidatingWebhook** — reject invalid CRs at admission (better than failing in reconcile)\n- **MutatingWebhook** — fill in defaults / inject sidecars (use sparingly; surprising side effects)\n\nRun webhooks in the same controller binary or a sidecar; cert-manager rotates the certs.\n\n## Anti-patterns\n\n- **Imperative reconcile**: \"if event = create, do X; if event = update, do Y\". Wrong shape. Reconcile = make actual=desired regardless of how we got here.\n- **No status subresource**: status updates re-trigger reconcile.\n- **Status mutation in many places**: centralize in a `setStatus` helper.\n- **Reconcile depending on event order**: events can be missed; reconcile must converge from any starting state.\n- **Long reconcile (>2 min)**: blocks the work queue; split work via RequeueAfter.\n\n## Decision flow: when an operator is the right answer\n\n```\nNeed: I want to manage \u003cX> in Kubernetes.\n\nIs \u003cX> a stateless web app? → Deployment + Service. Done.\nIs \u003cX> a stateless web app with config? 
→ Deployment + ConfigMap.\nNeed version upgrade automation? → Helm. Done.\nNeed stateful behaviour (leader, peers)? → StatefulSet.\nNeed application-aware operations\n (backup, version migration, repair)? → Operator.\nNeed to expose \u003cX> as a k8s resource\n to other teams? → Operator.\n```\n\nWhen in doubt: start with Helm. Move to an operator only when Helm can't express the operational logic.\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":7157,"content_sha256":"28ad0d8c1e165f5b0a2d095ad5890dcc122885dab88da666072714b8d2cb257d"},{"filename":"references/reconcile_loop.md","content":"# The reconcile loop\n\nReconcile is the heart of an operator. Most operator bugs are reconcile-loop bugs. The patterns below are deterministic — copy them.\n\n## Skeleton — `Reconcile(ctx, req)`\n\n```go\nfunc (r *MyAppReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {\n log := log.FromContext(ctx)\n\n // 1. Fetch the CR\n var cr appsv1alpha1.MyApp\n if err := r.Get(ctx, req.NamespacedName, &cr); err != nil {\n if apierrors.IsNotFound(err) {\n return ctrl.Result{}, nil // CR is gone; nothing to do\n }\n return ctrl.Result{}, err // transient error → requeue\n }\n\n // 2. Handle deletion via finalizer\n if !cr.DeletionTimestamp.IsZero() {\n return r.reconcileDelete(ctx, &cr)\n }\n if !controllerutil.ContainsFinalizer(&cr, finalizerName) {\n controllerutil.AddFinalizer(&cr, finalizerName)\n return ctrl.Result{}, r.Update(ctx, &cr)\n }\n\n // 3. Mark Reconciling\n meta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{\n Type: \"Reconciling\", Status: metav1.ConditionTrue,\n Reason: \"InProgress\", Message: \"Converging to desired state\",\n ObservedGeneration: cr.Generation,\n })\n\n // 4. Do the work, idempotently\n res, err := r.reconcileNormal(ctx, &cr)\n\n // 5. 
Update status (always — even on error)\n if statusErr := r.Status().Update(ctx, &cr); statusErr != nil {\n log.Error(statusErr, \"failed to update status\")\n return res, errors.Join(err, statusErr)\n }\n\n return res, err\n}\n```\n\n## The 5-step shape\n\n1. **Fetch the CR.** Handle `NotFound` cleanly — the CR may have been deleted between event and reconcile.\n2. **Handle deletion.** If `DeletionTimestamp` is set, run cleanup, remove finalizer, return.\n3. **Set Reconciling condition.** Mark that the controller is working.\n4. **Do work idempotently.** Use `CreateOrUpdate`, compare desired-vs-actual, only act on differences.\n5. **Update status.** Even on error — partial progress is signal.\n\n## Idempotence patterns\n\n### Pattern: CreateOrUpdate\n\n```go\ndeployment := &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{Name: cr.Name, Namespace: cr.Namespace}}\nop, err := controllerutil.CreateOrUpdate(ctx, r.Client, deployment, func() error {\n deployment.Spec.Replicas = &cr.Spec.Replicas\n deployment.Spec.Template.Spec.Containers = buildContainers(&cr.Spec)\n return controllerutil.SetControllerReference(&cr, deployment, r.Scheme)\n})\nif err != nil { return ctrl.Result{}, err }\nlog.Info(\"deployment\", \"operation\", op) // \"created\", \"updated\", or \"unchanged\"\n```\n\nThis pattern is idempotent by construction.\n\n### Pattern: SetControllerReference\n\nAlways set the OwnerReference so cascading deletion works:\n\n```go\ncontrollerutil.SetControllerReference(&cr, child, r.Scheme)\n```\n\n### Pattern: Finalizer for external resources\n\n```go\nconst finalizerName = \"myapp.apps.example.com/finalizer\"\n\nfunc (r *MyAppReconciler) reconcileDelete(ctx context.Context, cr *appsv1alpha1.MyApp) (ctrl.Result, error) {\n if !controllerutil.ContainsFinalizer(cr, finalizerName) {\n return ctrl.Result{}, nil\n }\n if err := r.deleteExternalResources(ctx, cr); err != nil {\n return ctrl.Result{RequeueAfter: 30 * time.Second}, err\n }\n 
controllerutil.RemoveFinalizer(cr, finalizerName)\n return ctrl.Result{}, r.Update(ctx, cr)\n}\n```\n\n## Error handling and requeue\n\n| Situation | Return |\n|---|---|\n| Permanent error (bad spec) | `ctrl.Result{}, nil` + condition with reason |\n| Transient error (API timeout, throttling) | `ctrl.Result{}, err` (auto-requeue with backoff) |\n| Need a retry in N seconds | `ctrl.Result{RequeueAfter: 30*time.Second}, nil` |\n| Done; no follow-up | `ctrl.Result{}, nil` |\n\n**Don't use `time.Sleep` inside reconcile.** It blocks the work queue, starving other reconciles. Use `RequeueAfter`.\n\n## Status update patterns\n\n```go\n// Set a condition\nmeta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{\n Type: \"Ready\", Status: metav1.ConditionTrue,\n Reason: \"AllReady\", Message: \"all components healthy\",\n ObservedGeneration: cr.Generation,\n})\n\n// Track observed generation\ncr.Status.ObservedGeneration = cr.Generation\n\n// Update status — uses /status subresource\nif err := r.Status().Update(ctx, &cr); err != nil { ... }\n```\n\n**Never** call `r.Update(ctx, &cr)` to update status. It writes through the main resource endpoint, which the user owns; with the status subresource enabled the API server ignores status changes sent that way.\n\n## Read once, decide, act\n\nDon't observe the world repeatedly during reconcile. Reads are served from the informer cache, which keeps updating in the background, so two reads in one pass may disagree — take one snapshot and decide from it:\n\n```go\n// Good: read once, decide, act\nvar pods corev1.PodList\nr.List(ctx, &pods, client.InNamespace(cr.Namespace), client.MatchingLabels{\"app\": cr.Name})\ndesired := computeDesired(&cr, &pods)\napplyDesired(ctx, r.Client, desired)\n\n// Bad: observe-act-observe-act\nfor _, container := range cr.Spec.Containers {\n pod := r.Get(...) // re-reading the cache\n if needsRestart(pod) {\n r.Delete(...)\n pod = r.Get(...) 
// again\n ...\n }\n}\n```\n\n## Predicates — filter events you don't care about\n\n```go\nfunc (r *MyAppReconciler) SetupWithManager(mgr ctrl.Manager) error {\n return ctrl.NewControllerManagedBy(mgr).\n For(&appsv1alpha1.MyApp{}).\n Owns(&appsv1.Deployment{}).\n WithEventFilter(predicate.GenerationChangedPredicate{}). // ignore status-only updates\n Complete(r)\n}\n```\n\n`GenerationChangedPredicate` skips reconciles when only status changed — important to avoid loops.\n\n## Leader election\n\nAlways enable leader election when running >1 controller replica:\n\n```go\nmgr, _ := manager.New(cfg, manager.Options{\n LeaderElection: true,\n LeaderElectionID: \"myapp-operator-leader\",\n})\n```\n\nWithout it: split-brain. Two controllers both think they own the resource and fight.\n\n## Performance — bounded reconcile time\n\nA reconcile pass should complete in \u003c30s for typical work, \u003c2min for heavy work. Longer = the work queue starves other reconciles.\n\nIf work takes longer:\n- Break into phases; emit `RequeueAfter` between them\n- Move long-running work to a separate process (Job)\n- Cache expensive computations on `cr.Status`\n\n## Logging conventions\n\n```go\nlog := log.FromContext(ctx).WithValues(\"phase\", \"create-deployment\")\nlog.Info(\"creating deployment\", \"name\", cr.Name)\nlog.Error(err, \"failed to create deployment\")\n```\n\n- Use `log.FromContext(ctx)` — picks up controller-runtime's contextual logger\n- Use `Info` for normal flow, `Error` for retryable failures\n- Add structured fields, not formatted strings\n\n## Anti-patterns checklist\n\n- `time.Sleep` inside reconcile → starves queue; use `RequeueAfter`\n- `os.Exit` / `log.Fatal` → kills the controller; return an error\n- `panic` → same; return an error\n- `r.Update` to set status → use `r.Status().Update`\n- `r.Update` of the CR while the user could be editing it → use `r.Status().Update` or use Patch\n- Reading the same resource multiple times in one reconcile → read once\n- 
Reconcile body > 80 lines → extract `reconcileXxx` subroutines per phase\n- HTTP calls without `ctx` → can't cancel during shutdown\n- No requeue path for transient errors → silent failures\n- Missing `OwnerReferences` on children → cascading deletion broken\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":7275,"content_sha256":"3ece01fa9b29c88429255e6920885ded737b2d761b2b68791641682b0595e619"},{"filename":"references/tooling_landscape.md","content":"# Tooling landscape\n\nFive mainstream operator frameworks. Pick by language, complexity, and target environment.\n\n## At-a-glance\n\n| Framework | Language | Scaffolding | Webhook support | Best for | Project status |\n|---|---|---|---|---|---|\n| **controller-runtime** | Go | None (library) | Yes | Production-grade, low-level | Active (sig-api-machinery) |\n| **kubebuilder** | Go | Yes (CLI) | Yes | Standard Go operator path | Active (Kubernetes SIGs) |\n| **operator-sdk** | Go / Helm / Ansible | Yes (CLI) | Yes | OpenShift, mixed paradigms | Active (Red Hat) |\n| **metacontroller** | Any (webhook) | None | N/A (uses webhooks) | Polyglot, avoid Go | Less active |\n| **KOPF** | Python | None (library) | Yes | Python shops, async-first | Active (community) |\n| **java-operator-sdk** | Java | Yes | Yes | JVM shops | Active (Red Hat / Java SIG) |\n\n## Decision tree\n\n```\nPrimary language?\n├── Go ──┬── Need scaffolding + opinionated path → kubebuilder\n│ ├── Targeting OpenShift / OLM → operator-sdk (Go)\n│ └── Library-only, full control → controller-runtime\n├── Python ─────────────────────────────────────────→ KOPF\n├── Java ─────────────────────────────────────────→ java-operator-sdk\n└── Other (Node, Ruby, Rust)\n └── webhook-based, polyglot → metacontroller\n```\n\n## controller-runtime (Go library)\n\n**What it is:** The Go library that everyone else builds on. 
Provides `Manager`, `Reconciler`, cache, client, predicates, leader election.\n\n**Use when:**\n- You need fine-grained control over the manager and event sources\n- You're building reusable operator components\n- Your team has Go experience and prefers libraries to scaffolders\n\n**Skip when:**\n- You want bootstrap-by-CLI (use kubebuilder)\n- You don't speak Go\n\n**Example:**\n```go\nmgr, _ := ctrl.NewManager(cfg, ctrl.Options{Scheme: scheme})\nctrl.NewControllerManagedBy(mgr).\n For(&apps.MyApp{}).\n Complete(&MyAppReconciler{Client: mgr.GetClient()})\nmgr.Start(ctx)\n```\n\n## kubebuilder (Go scaffolder)\n\n**What it is:** The standard scaffolding tool. Wraps controller-runtime with project layout, code generation, and the `kubebuilder` CLI.\n\n**Use when:**\n- New Go operator\n- You want predictable project structure\n- You'll publish the operator publicly\n\n**Workflow:**\n```bash\nkubebuilder init --domain example.com --repo github.com/org/myapp-operator\nkubebuilder create api --group apps --version v1alpha1 --kind MyApp\nmake manifests\nmake generate\nmake run\n```\n\n**Strengths:** Excellent docs, mature, used by everyone from cert-manager to Crossplane.\n\n**Weaknesses:** Some teams find the layout opinionated; sometimes hard to escape from.\n\n## operator-sdk (Red Hat / OpenShift)\n\n**What it is:** Wraps kubebuilder for Go and adds Helm-based and Ansible-based operators (no Go required).\n\n**Use when:**\n- Targeting OpenShift / OLM (Operator Lifecycle Manager)\n- Building a Helm-based operator from an existing chart\n- Building an Ansible-based operator from existing playbooks\n\n**Helm-based operator:**\n```bash\noperator-sdk init --plugins=helm --domain example.com --group apps --version v1 --kind MyApp\noperator-sdk create api --group apps --version v1 --kind MyApp --helm-chart=./mychart\n```\n\nThe operator's reconcile becomes `helm upgrade --install`. 
Fast on-ramp; less power.\n\n**Ansible-based operator:**\nSimilar, but reconcile invokes a playbook. Useful for ops teams already deep in Ansible.\n\n**Skip when:**\n- Vanilla k8s target (kubebuilder is more direct)\n- You want a Go operator without OpenShift coupling\n\n## metacontroller (webhook-based, language-agnostic)\n\n**What it is:** Runs in-cluster, watches your CRDs, and POSTs webhook calls to your endpoints with desired-state computations. You implement the logic in any language behind an HTTP endpoint.\n\n**Use when:**\n- Polyglot team (Python, Node, Ruby, etc.)\n- Want to avoid Go and Java\n- Operator logic is genuinely simple (compute children from parent)\n\n**Example sync hook:**\n```python\n# Python webhook returns desired children given parent + observed\ndef sync(request):\n parent = request['parent']\n return {\n 'status': {'phase': 'Ready'},\n 'children': [{'apiVersion': 'apps/v1', 'kind': 'Deployment', ...}],\n }\n```\n\n**Strengths:** No Go required; fast iteration in any language.\n\n**Weaknesses:** Lower ecosystem activity; not great for complex multi-CRD operators; webhook-based latency.\n\n## KOPF (Python)\n\n**What it is:** A Python framework for building operators. Async-first, decorator-based, no scaffolding step.\n\n**Use when:**\n- Python shop\n- Operator logic is moderate complexity\n- Want fast iteration without recompilation\n\n**Example:**\n```python\nimport kopf\n\n@kopf.on.create('apps.example.com', 'v1alpha1', 'myapps')\nasync def create_fn(spec, name, namespace, logger, **_):\n logger.info(f\"creating MyApp {name}\")\n # ... 
create children\n return {'phase': 'Ready'}\n\n@kopf.on.delete('apps.example.com', 'v1alpha1', 'myapps')\nasync def delete_fn(spec, name, namespace, **_):\n # cleanup external resources\n pass\n```\n\n**Strengths:**\n- Async/await native (good for many concurrent reconciles)\n- No code generation\n- Good for ML/data teams already in Python\n\n**Weaknesses:**\n- Smaller ecosystem than Go\n- Some features lag controller-runtime (e.g., complex caching)\n- Python startup cost in the controller pod\n\n## java-operator-sdk\n\n**What it is:** Java framework, Quarkus integration, modeled after controller-runtime.\n\n**Use when:** JVM shop with strong Spring/Quarkus skills.\n\n**Skip when:** You don't already have a JVM ops setup.\n\n## Comparison: complexity vs control\n\n```\ncontrol ↑\n │ controller-runtime (full control, library)\n │ │\n │ kubebuilder (scaffolded controller-runtime)\n │ │\n │ operator-sdk Go (kubebuilder + OLM)\n │ │\n │ KOPF (Python decorators)\n │ │\n │ java-operator-sdk (JVM)\n │ │\n │ operator-sdk Ansible (playbooks)\n │ │\n │ operator-sdk Helm (chart-based)\n │ │\n │ metacontroller (webhook hooks)\n ↓\ncomplexity ↓\n```\n\nHigher control = more code, more flexibility. 
Lower complexity = faster start, less power.\n\n## Cross-cutting concerns\n\nRegardless of framework:\n\n- **Webhooks for validation** — reject bad CRs at admission\n- **cert-manager** — rotate webhook certs automatically\n- **Prometheus** — `/metrics` endpoint via controller-runtime's built-in metrics\n- **OLM** (Operator Lifecycle Manager) — for OperatorHub publishing\n- **OperatorHub Capability Levels** — see `operator_capability_audit.py`\n\n## Migration paths\n\n| From | To | Effort |\n|---|---|---|\n| controller-runtime | kubebuilder | Low (kubebuilder uses controller-runtime) |\n| Helm chart | Helm-based operator-sdk | Low |\n| Helm chart | Go operator (kubebuilder) | High (rewrite logic in Go) |\n| KOPF | Go operator | High (language change) |\n| Any | metacontroller | Medium (move logic behind HTTP) |\n\n## Selection checklist\n\nBefore committing:\n- [ ] Identify primary language constraint\n- [ ] Target environment (vanilla k8s vs OpenShift/OLM)\n- [ ] Operator complexity: 1 CRD vs many\n- [ ] Need webhooks?\n- [ ] Need OLM publishing?\n- [ ] Build a 1-week proof-of-concept; verify reconcile latency, status update flow, and dev-loop ergonomics\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":7642,"content_sha256":"8d561136192786f37bee2560fac0ab4dbc3594dc607da853ffd545dde721e8bc"},{"filename":"scripts/crd_validator.py","content":"#!/usr/bin/env python3\n\"\"\"Validate a Kubernetes CRD YAML against operator-pattern best practices.\n\nChecks for status subresource, structural schema, conditions support, printer\ncolumns, version policy, and other operator-grade design rules. 
Stdlib-only —\nparses YAML via a minimal embedded reader (no PyYAML dependency).\n\"\"\"\nimport argparse\nimport json\nimport os\nimport re\nimport sys\n\nCHECKS = [\n (\"status_subresource\", \"Each version must declare subresources.status (otherwise status updates loop spec reconciles)\"),\n (\"storage_version\", \"Exactly one version must be storage:true\"),\n (\"served_version\", \"At least one version must be served:true\"),\n (\"schema_present\", \"Each version must declare schema.openAPIV3Schema\"),\n (\"schema_typed\", \"Schema must declare 'type: object' at root (no x-kubernetes-preserve-unknown-fields at root)\"),\n (\"conditions_array\", \"Schema should declare a conditions array under status (for metav1.Conditions)\"),\n (\"printer_columns\", \"additionalPrinterColumns should include Age and a status indicator\"),\n (\"scope\", \"scope should be Namespaced unless cluster-scoped is justified\"),\n (\"singular_listkind\", \"names.singular and names.listKind must be declared\"),\n]\n\n\ndef _load_yaml_minimal(path):\n \"\"\"Yield top-level YAML documents from a multi-doc file as text blocks.\n\n Stdlib-only — splits on '---' separators. We grep relevant fields with\n regex rather than fully parse. 
Crude but enough for the structural\n checks below; a full YAML parser would be the upgrade path.\"\"\"\n with open(path, \"r\", encoding=\"utf-8\", errors=\"replace\") as f:\n text = f.read()\n docs = re.split(r\"^---\\s*$\", text, flags=re.MULTILINE)\n return [d for d in docs if d.strip()]\n\n\ndef _is_crd_doc(doc):\n return bool(re.search(r\"^kind:\\s*CustomResourceDefinition\\s*$\", doc, re.MULTILINE))\n\n\ndef _check_one(doc, path):\n findings = []\n has_status_sub = bool(re.search(r\"subresources:\\s*\\n\\s*status:\\s*\\{?\\s*\\}?\", doc))\n if not has_status_sub:\n findings.append((\"FAIL\", \"status_subresource\", \"no subresources.status block found\"))\n storage_count = len(re.findall(r\"storage:\\s*true\\b\", doc))\n if storage_count != 1:\n findings.append((\"FAIL\", \"storage_version\", f\"expected exactly 1 storage:true, found {storage_count}\"))\n served_count = len(re.findall(r\"served:\\s*true\\b\", doc))\n if served_count \u003c 1:\n findings.append((\"FAIL\", \"served_version\", \"no served:true version\"))\n if \"openAPIV3Schema\" not in doc:\n findings.append((\"FAIL\", \"schema_present\", \"no openAPIV3Schema declared\"))\n if re.search(r\"x-kubernetes-preserve-unknown-fields:\\s*true\", doc):\n findings.append((\"WARN\", \"schema_typed\", \"x-kubernetes-preserve-unknown-fields: true present (defeats validation)\"))\n if \"conditions\" not in doc.lower():\n findings.append((\"WARN\", \"conditions_array\", \"no conditions array referenced (Karpathy: declare an explicit shape)\"))\n if \"additionalPrinterColumns\" not in doc:\n findings.append((\"WARN\", \"printer_columns\", \"no additionalPrinterColumns (kubectl get UX is poor)\"))\n elif not re.search(r\"name:\\s*Age\\b\", doc):\n findings.append((\"WARN\", \"printer_columns\", \"additionalPrinterColumns missing Age column\"))\n if not re.search(r\"^\\s*scope:\\s*\\w+\", doc, re.MULTILINE):\n findings.append((\"WARN\", \"scope\", \"scope not explicitly set\"))\n if not 
re.search(r\"^\\s*singular:\\s*[\\w\u003c]\", doc, re.MULTILINE):\n findings.append((\"WARN\", \"singular_listkind\", \"names.singular not declared\"))\n if not re.search(r\"^\\s*listKind:\\s*[\\w\u003c]\", doc, re.MULTILINE):\n findings.append((\"WARN\", \"singular_listkind\", \"names.listKind not declared\"))\n return findings\n\n\ndef _walk_yaml_files(root):\n if os.path.isfile(root):\n yield root\n return\n for r, _, files in os.walk(root):\n for f in files:\n if f.endswith((\".yaml\", \".yml\")):\n yield os.path.join(r, f)\n\n\ndef audit(target):\n results = []\n for path in _walk_yaml_files(target):\n for doc in _load_yaml_minimal(path):\n if not _is_crd_doc(doc):\n continue\n kind_match = re.search(r\"kind:\\s*(\\w+)\\s*$\", doc, re.MULTILINE)\n crd_kind = kind_match.group(1) if kind_match else \"?\"\n name_match = re.search(r\"^\\s+name:\\s*([\\w.\\-]+)\\s*$\", doc, re.MULTILINE)\n crd_name = name_match.group(1) if name_match else os.path.basename(path)\n findings = _check_one(doc, path)\n results.append({\"path\": path, \"name\": crd_name, \"kind\": crd_kind, \"findings\": findings})\n return results\n\n\ndef render_text(results):\n if not results:\n print(\"No CRD documents found.\")\n return 0\n fails = sum(1 for r in results for f in r[\"findings\"] if f[0] == \"FAIL\")\n warns = sum(1 for r in results for f in r[\"findings\"] if f[0] == \"WARN\")\n print(f\"CRD Validator — {len(results)} CRD(s) inspected, {fails} FAIL, {warns} WARN\")\n print(\"\")\n for r in results:\n print(f\"== {r['name']} ({r['path']})\")\n if not r[\"findings\"]:\n print(\" PASS: all checks green\")\n continue\n for level, key, msg in r[\"findings\"]:\n print(f\" [{level}] {key}: {msg}\")\n print(\"\")\n return 1 if fails else 0\n\n\ndef main():\n ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)\n ap.add_argument(\"--crd\", required=True, help=\"Path to a CRD YAML file or a directory of YAMLs\")\n 
ap.add_argument(\"--format\", choices=[\"text\", \"json\"], default=\"text\")\n args = ap.parse_args()\n\n if not os.path.exists(args.crd):\n print(f\"ERROR: not found: {args.crd}\", file=sys.stderr)\n return 2\n results = audit(args.crd)\n if args.format == \"json\":\n print(json.dumps(results, indent=2))\n return 0\n return render_text(results)\n\n\nif __name__ == \"__main__\":\n sys.exit(main())\n","content_type":"text/x-python; charset=utf-8","language":"python","size":5939,"content_sha256":"e9151a593c13602108d79ca629099e0865849a49c9c5f5cbad09bfd65466d8b9"},{"filename":"scripts/operator_capability_audit.py","content":"#!/usr/bin/env python3\n\"\"\"Score an operator against OperatorHub Capability Levels (1-5).\n\nWalks an operator repo and detects evidence for each level. Level achieved =\nhighest level for which all required signals are present. Reports next-level\ngaps as concrete advancement steps.\n\nLevels:\n L1 Basic Install — CRD + controller + Deployment manifest\n L2 Seamless Upgrades — version conversion + PDB + leader election\n L3 Full Lifecycle — backup/restore + finalizers + status conditions\n L4 Deep Insights — /metrics endpoint + Prometheus rules\n L5 Auto Pilot — HPA / VPA / autotuning logic referenced\n\"\"\"\nimport argparse\nimport json\nimport os\nimport re\nimport sys\n\n\nSIGNALS = {\n \"L1\": [\n (\"crd_present\", lambda files, contents: any(\"CustomResourceDefinition\" in c for c in contents.values())),\n (\"deployment_present\", lambda files, contents: any(re.search(r\"^kind:\\s*Deployment\", c, re.MULTILINE) for c in contents.values())),\n (\"controller_code\", lambda files, contents: any(p.endswith(\".go\") and \"Reconcile\" in c for p, c in contents.items())),\n ],\n \"L2\": [\n (\"conversion_webhook\", lambda files, contents: any(\"conversion\" in c.lower() and \"webhook\" in c.lower() for c in contents.values())),\n (\"leader_election\", lambda files, contents: any(\"LeaderElection\" in c or \"leader-elect\" in c for c in 
contents.values())),\n (\"pdb_present\", lambda files, contents: any(re.search(r\"kind:\\s*PodDisruptionBudget\", c) for c in contents.values())),\n ],\n \"L3\": [\n (\"finalizers\", lambda files, contents: any(\"Finalizer\" in c or \"finalizers\" in c for c in contents.values())),\n (\"status_conditions\", lambda files, contents: any(\"metav1.Condition\" in c or \"SetStatusCondition\" in c for c in contents.values())),\n (\"backup_restore_hint\", lambda files, contents: any(re.search(r\"\\b(backup|restore|snapshot)\\b\", c, re.IGNORECASE) for c in contents.values())),\n ],\n \"L4\": [\n (\"metrics_endpoint\", lambda files, contents: any(re.search(r\"/metrics|prometheus\", c) for c in contents.values())),\n (\"prometheus_rules\", lambda files, contents: any(re.search(r\"PrometheusRule|alert:\", c) for c in contents.values())),\n ],\n \"L5\": [\n (\"autoscaling_referenced\", lambda files, contents: any(re.search(r\"\\bHorizontalPodAutoscaler|VerticalPodAutoscaler|autoscal\", c) for c in contents.values())),\n (\"autotune_logic\", lambda files, contents: any(re.search(r\"autotune|self-heal|anomaly\", c, re.IGNORECASE) for c in contents.values())),\n ],\n}\n\nLEVEL_NAMES = {\n \"L1\": \"Basic Install\",\n \"L2\": \"Seamless Upgrades\",\n \"L3\": \"Full Lifecycle\",\n \"L4\": \"Deep Insights\",\n \"L5\": \"Auto Pilot\",\n}\n\nSCAN_EXTS = {\".go\", \".yaml\", \".yml\", \".md\"}\nSKIP_DIRS = {\".git\", \"node_modules\", \"vendor\", \"bin\", \"dist\", \"__pycache__\"}\n\n\ndef _walk(root):\n files = {}\n for r, dirs, fnames in os.walk(root):\n dirs[:] = [d for d in dirs if d not in SKIP_DIRS]\n for f in fnames:\n if os.path.splitext(f)[1] in SCAN_EXTS:\n p = os.path.join(r, f)\n try:\n with open(p, \"r\", encoding=\"utf-8\", errors=\"replace\") as fh:\n files[p] = fh.read()\n except OSError:\n continue\n return files\n\n\ndef evaluate(operator_dir):\n contents = _walk(operator_dir)\n file_paths = list(contents.keys())\n results = {}\n achieved_max = None\n for level in 
[\"L1\", \"L2\", \"L3\", \"L4\", \"L5\"]:\n signals = SIGNALS[level]\n passing = []\n failing = []\n for key, check in signals:\n ok = check(file_paths, contents)\n (passing if ok else failing).append(key)\n all_pass = len(failing) == 0\n results[level] = {\n \"name\": LEVEL_NAMES[level],\n \"passing\": passing,\n \"missing\": failing,\n \"achieved\": all_pass,\n }\n if all_pass:\n achieved_max = level\n else:\n break\n return {\"current_level\": achieved_max, \"details\": results}\n\n\ndef render_text(report, operator_dir):\n print(f\"Operator Capability Audit — {operator_dir}\")\n current = report[\"current_level\"]\n if current is None:\n print(\"Current level: BELOW_L1 (no operator structure detected)\")\n else:\n print(f\"Current level: {current} — {LEVEL_NAMES[current]}\")\n print(\"\")\n for level in [\"L1\", \"L2\", \"L3\", \"L4\", \"L5\"]:\n d = report[\"details\"].get(level)\n if d is None:\n continue\n marker = \"✓\" if d[\"achieved\"] else \"✗\"\n print(f\" {marker} {level} {d['name']}: pass={len(d['passing'])} miss={len(d['missing'])}\")\n for k in d[\"missing\"]:\n print(f\" - missing: {k}\")\n print(\"\")\n next_level = None\n for lv in [\"L1\", \"L2\", \"L3\", \"L4\", \"L5\"]:\n if lv == current:\n continue\n if not report[\"details\"].get(lv, {}).get(\"achieved\"):\n next_level = lv\n break\n if next_level:\n misses = report[\"details\"][next_level][\"missing\"]\n print(f\"Next: advance to {next_level} ({LEVEL_NAMES[next_level]}) by addressing:\")\n for k in misses:\n print(f\" - {k}\")\n\n\ndef main():\n ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)\n ap.add_argument(\"--operator-dir\", required=True, help=\"Path to operator repo root\")\n ap.add_argument(\"--format\", choices=[\"text\", \"json\"], default=\"text\")\n args = ap.parse_args()\n\n if not os.path.isdir(args.operator_dir):\n print(f\"ERROR: not a directory: {args.operator_dir}\", file=sys.stderr)\n return 2\n report = 
evaluate(args.operator_dir)\n if args.format == \"json\":\n print(json.dumps(report, indent=2))\n else:\n render_text(report, args.operator_dir)\n return 0\n\n\nif __name__ == \"__main__\":\n sys.exit(main())\n","content_type":"text/x-python; charset=utf-8","language":"python","size":5916,"content_sha256":"7ce49cb8d0347f5a8c121c9370361007019a71787141316900136a145b092221"},{"filename":"scripts/reconcile_lint.py","content":"#!/usr/bin/env python3\n\"\"\"Lint a Go controller reconcile function for operator anti-patterns.\n\nDetects common operator bugs from static patterns in Go source: blocking calls\ninside reconcile, spec mutation (instead of status), missing requeue on error,\noversized reconcile functions, and missing finalizer/condition handling. Pure\nregex heuristics; not a Go AST parser, but catches the recurring mistakes.\n\"\"\"\nimport argparse\nimport json\nimport os\nimport re\nimport sys\n\nCODE_EXTS = {\".go\"}\n\n\nCHECKS = [\n (\"time_sleep\", r\"\\btime\\.Sleep\\s*\\(\", \"FAIL\", \"time.Sleep inside reconcile blocks the work queue. Use ctrl.Result{RequeueAfter: ...}.\"),\n (\"update_spec\", r\"r\\.(?:Client\\.)?Update\\(\\s*ctx\\s*,\\s*\\w+\\)\", \"WARN\", \"r.Client.Update on the reconciled object likely mutates spec. 
Use r.Status().Update for status.\"),\n (\"missing_context_in_http\", r\"http\\.(?:Get|Post|Do)\\s*\\(\", \"WARN\", \"HTTP calls without ctx-aware client; cannot cancel during shutdown.\"),\n (\"os_exit\", r\"\\bos\\.Exit\\s*\\(\", \"FAIL\", \"os.Exit inside reconcile kills the controller; return an error instead.\"),\n (\"panic_call\", r\"\\bpanic\\s*\\(\", \"WARN\", \"panic inside reconcile crashes the controller; return an error so it requeues.\"),\n (\"log_fatal\", r\"\\blog\\.Fatal\", \"FAIL\", \"log.Fatal exits the process; return an error instead.\"),\n]\n\n\ndef _read(path):\n try:\n with open(path, \"r\", encoding=\"utf-8\", errors=\"replace\") as f:\n return f.read()\n except OSError:\n return \"\"\n\n\ndef _find_reconcile_blocks(src):\n \"\"\"Return list of (start_line, end_line, body) for each Reconcile func.\"\"\"\n blocks = []\n sig = re.compile(r\"func\\s+\\([^)]*\\)\\s+Reconcile\\s*\\(\", re.MULTILINE)\n for m in sig.finditer(src):\n start = m.start()\n i = src.find(\"{\", m.end())\n if i \u003c 0:\n continue\n depth = 1\n j = i + 1\n while j \u003c len(src) and depth > 0:\n c = src[j]\n if c == \"{\":\n depth += 1\n elif c == \"}\":\n depth -= 1\n j += 1\n if depth == 0:\n body = src[i:j]\n start_line = src[:start].count(\"\\n\") + 1\n end_line = src[:j].count(\"\\n\") + 1\n blocks.append((start_line, end_line, body))\n return blocks\n\n\ndef _check_block(body, start_line):\n findings = []\n for key, pattern, level, msg in CHECKS:\n for m in re.finditer(pattern, body):\n line_offset = body[: m.start()].count(\"\\n\")\n findings.append({\n \"level\": level,\n \"key\": key,\n \"line\": start_line + line_offset,\n \"msg\": msg,\n })\n body_lines = body.count(\"\\n\")\n if body_lines > 80:\n findings.append({\n \"level\": \"WARN\",\n \"key\": \"reconcile_length\",\n \"line\": start_line,\n \"msg\": f\"Reconcile body is {body_lines} lines (>80). 
Extract reconcileXxx subroutines.\",\n })\n has_finalizer_add = re.search(r\"controllerutil\\.AddFinalizer\\b|finalizers\\s*=\", body)\n has_finalizer_remove = re.search(r\"controllerutil\\.RemoveFinalizer\\b\", body)\n if has_finalizer_add and not has_finalizer_remove:\n findings.append({\n \"level\": \"WARN\",\n \"key\": \"finalizer_unbalanced\",\n \"line\": start_line,\n \"msg\": \"AddFinalizer found but no RemoveFinalizer call — orphaned external resources on delete.\",\n })\n if not re.search(r\"ctrl\\.Result\\{\", body):\n findings.append({\n \"level\": \"WARN\",\n \"key\": \"missing_requeue\",\n \"line\": start_line,\n \"msg\": \"Reconcile body does not return ctrl.Result{...}. Confirm error returns trigger requeue.\",\n })\n return findings\n\n\ndef audit_file(path):\n src = _read(path)\n if not src or \"Reconcile\" not in src:\n return []\n blocks = _find_reconcile_blocks(src)\n out = []\n for start_line, _, body in blocks:\n out.extend(_check_block(body, start_line))\n # Cross-function check: AddFinalizer present in file → RemoveFinalizer must be too.\n has_add = \"controllerutil.AddFinalizer\" in src or re.search(r\"finalizers\\s*=\", src)\n has_remove = \"controllerutil.RemoveFinalizer\" in src\n if has_add and not has_remove:\n out = [f for f in out if f[\"key\"] != \"finalizer_unbalanced\"]\n out.append({\n \"level\": \"WARN\",\n \"key\": \"finalizer_unbalanced\",\n \"line\": 0,\n \"msg\": \"AddFinalizer is called somewhere in this file but RemoveFinalizer is not — orphaned external resources on delete.\",\n })\n elif has_remove:\n # Suppress per-block warnings if file-level pairing is balanced.\n out = [f for f in out if f[\"key\"] != \"finalizer_unbalanced\"]\n return out\n\n\ndef _walk(target):\n if os.path.isfile(target):\n yield target\n return\n for r, _, files in os.walk(target):\n for f in files:\n if os.path.splitext(f)[1] in CODE_EXTS:\n yield os.path.join(r, f)\n\n\ndef audit(target):\n results = []\n for path in _walk(target):\n findings = 
audit_file(path)\n if findings:\n results.append({\"path\": path, \"findings\": findings})\n return results\n\n\ndef render_text(results):\n fails = sum(1 for r in results for f in r[\"findings\"] if f[\"level\"] == \"FAIL\")\n warns = sum(1 for r in results for f in r[\"findings\"] if f[\"level\"] == \"WARN\")\n print(f\"Reconcile Lint — {len(results)} controller file(s), {fails} FAIL, {warns} WARN\")\n print(\"\")\n if not results:\n print(\"PASS: no anti-patterns detected.\")\n return 0\n for r in results:\n print(f\"== {r['path']}\")\n for f in r[\"findings\"]:\n print(f\" [{f['level']}] line {f['line']} {f['key']}: {f['msg']}\")\n print(\"\")\n return 1 if fails else 0\n\n\ndef main():\n ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)\n ap.add_argument(\"--controller\", required=True, help=\"Path to a Go controller file or directory\")\n ap.add_argument(\"--format\", choices=[\"text\", \"json\"], default=\"text\")\n args = ap.parse_args()\n\n if not os.path.exists(args.controller):\n print(f\"ERROR: not found: {args.controller}\", file=sys.stderr)\n return 2\n results = audit(args.controller)\n if args.format == \"json\":\n print(json.dumps(results, indent=2))\n return 0\n return render_text(results)\n\n\nif __name__ == \"__main__\":\n sys.exit(main())\n","content_type":"text/x-python; charset=utf-8","language":"python","size":6495,"content_sha256":"8f599a530d471080d160d82f48e828e27936319728e255192d19b075af761b3e"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"Kubernetes Operator","type":"text"}]},{"type":"paragraph","content":[{"text":"Build operators that reconcile correctly. Most operator bugs are not Kubernetes bugs — they are reconcile-loop bugs: missing finalizers, blocking calls, no requeue on transient errors, status drift, RBAC over-grants. 
This skill catches them deterministically before they reach a cluster.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"When to use","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Building a new Kubernetes Operator (controller for a CRD)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Reviewing an existing operator for capability-level gaps","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Auditing a CRD spec for status/conditions/finalizer correctness","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Choosing a framework (controller-runtime / kubebuilder / operator-sdk / metacontroller / KOPF)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Designing the API surface of a Custom Resource","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Hardening RBAC, leader election, or webhook validation","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"When NOT to use","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Plain Helm chart packaging → use ","type":"text"},{"text":"helm-chart-builder","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Standard kubectl operations / blue-green deploys → use ","type":"text"},{"text":"senior-devops","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"General k8s security posture → use ","type":"text"},{"text":"cloud-security","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\"I want to run a workload\" — that's a 
Deployment / Job, not an operator","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Core principle: an operator is a reconcile loop, not a script","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"observe(actual) → desired = read(spec) → diff(actual, desired) → act → update(status)\n ↓\n requeue / done","type":"text"}]},{"type":"paragraph","content":[{"text":"Operators that fail are the ones that:","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Treat reconcile as imperative (do this, then this, then this) instead of declarative (make actual=desired, idempotently)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Don't requeue transient failures","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Don't use finalizers, leaving orphan resources","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Mutate spec instead of status","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Don't use the status subresource (status updates trigger spec reconciles → loop)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Block in reconcile (long HTTP calls, locks)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Forget leader election → split-brain on multi-replica deploys","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"The 3 tools below catch each of these.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Quick start","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"SKILL=engineering/kubernetes-operator/skills/kubernetes-operator\n\n# Validate a CRD design\npython 
\"$SKILL/scripts/crd_validator.py\" --crd config/crd/myapp.yaml\n\n# Lint a Go reconcile function\npython \"$SKILL/scripts/reconcile_lint.py\" --controller controllers/myapp_controller.go\n\n# Score against OperatorHub Capability Levels (1-5)\npython \"$SKILL/scripts/operator_capability_audit.py\" --operator-dir .","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"The 3 Python tools","type":"text"}]},{"type":"paragraph","content":[{"text":"All stdlib-only. Run with ","type":"text"},{"text":"--help","type":"text","marks":[{"type":"code_inline"}]},{"text":".","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"crd_validator.py","type":"text","marks":[{"type":"code_inline"}]}]},{"type":"paragraph","content":[{"text":"Validates a CRD YAML against operator-pattern best practices.","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python scripts/crd_validator.py --crd config/crd/myapp.yaml\npython scripts/crd_validator.py --crd config/crd/ --format json","type":"text"}]},{"type":"paragraph","content":[{"text":"Checks:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"spec.versions[*].subresources.status","type":"text","marks":[{"type":"code_inline"}]},{"text":" is set (status subresource)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"spec.scope","type":"text","marks":[{"type":"code_inline"}]},{"text":" is ","type":"text"},{"text":"Namespaced","type":"text","marks":[{"type":"code_inline"}]},{"text":" (not ","type":"text"},{"text":"Cluster","type":"text","marks":[{"type":"code_inline"}]},{"text":") unless explicitly justified","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Singular and listKind 
defined","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"spec.versions[*].schema.openAPIV3Schema","type":"text","marks":[{"type":"code_inline"}]},{"text":" has type definitions (no ","type":"text"},{"text":"x-kubernetes-preserve-unknown-fields: true","type":"text","marks":[{"type":"code_inline"}]},{"text":" at top level)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"A version is marked ","type":"text"},{"text":"served: true","type":"text","marks":[{"type":"code_inline"}]},{"text":" AND ","type":"text"},{"text":"storage: true","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Conditions array is in the schema (allows ","type":"text"},{"text":"[]metav1.Condition","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Printer columns include ","type":"text"},{"text":"Age","type":"text","marks":[{"type":"code_inline"}]},{"text":" and ","type":"text"},{"text":"Status","type":"text","marks":[{"type":"code_inline"}]},{"text":"/","type":"text"},{"text":"Phase","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"reconcile_lint.py","type":"text","marks":[{"type":"code_inline"}]}]},{"type":"paragraph","content":[{"text":"Lints a Go controller reconcile function for anti-patterns.","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python scripts/reconcile_lint.py --controller controllers/myapp_controller.go","type":"text"}]},{"type":"paragraph","content":[{"text":"Checks (regex-based heuristics):","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Returns are ","type":"text"},{"text":"(ctrl.Result, 
error)","type":"text","marks":[{"type":"code_inline"}]},{"text":" shape","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Errors trigger a non-zero requeue (","type":"text"},{"text":"return ctrl.Result{Requeue: true}, err","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"client.Update()","type":"text","marks":[{"type":"code_inline"}]},{"text":" on the spec object is flagged (controllers should update only status)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"time.Sleep","type":"text","marks":[{"type":"code_inline"}]},{"text":" inside reconcile is flagged (use ","type":"text"},{"text":"RequeueAfter","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"HTTP calls without context cancellation are flagged","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Missing ","type":"text"},{"text":"defer","type":"text","marks":[{"type":"code_inline"}]},{"text":" after a finalizer add","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"No ","type":"text"},{"text":"IsConditionTrue","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":"SetCondition","type":"text","marks":[{"type":"code_inline"}]},{"text":" calls when conditions present in CRD","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Reconcile function exceeds 80 lines (extract subroutines)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"operator_capability_audit.py","type":"text","marks":[{"type":"code_inline"}]}]},{"type":"paragraph","content":[{"text":"Scores an operator against OperatorHub's 5 Capability 
Levels.","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python scripts/operator_capability_audit.py --operator-dir .","type":"text"}]},{"type":"paragraph","content":[{"text":"Levels:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"L1 — Basic Install:","type":"text","marks":[{"type":"strong"}]},{"text":" CRD defined, controller deploys it","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"L2 — Seamless Upgrades:","type":"text","marks":[{"type":"strong"}]},{"text":" PDBs, conversion webhooks, version skew strategy","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"L3 — Full Lifecycle:","type":"text","marks":[{"type":"strong"}]},{"text":" backups, restores, failure recovery","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"L4 — Deep Insights:","type":"text","marks":[{"type":"strong"}]},{"text":" metrics endpoint, Prometheus rules, alerts","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"L5 — Auto Pilot:","type":"text","marks":[{"type":"strong"}]},{"text":" auto-scaling, auto-tuning, anomaly detection","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Reports current level + concrete next steps to advance one level.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Tooling landscape","type":"text"}]},{"type":"paragraph","content":[{"text":"Pick a framework based on language and complexity. 
See ","type":"text"},{"text":"references/tooling_landscape.md","type":"text","marks":[{"type":"code_inline"}]},{"text":".","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Framework","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Language","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Best for","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Maintenance","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"controller-runtime","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Go","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Production-grade, low-level control","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Active 
(sig-api-machinery)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"kubebuilder","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Go","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Standard scaffolding, opinionated","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Active (Kubernetes SIGs)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"operator-sdk","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Go / Helm / Ansible","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"OpenShift / mixed-paradigm teams","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Active (Red Hat)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"metacontroller","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Any 
(webhook-based)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Polyglot teams, avoiding Go","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Less active","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"KOPF","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Python","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Python shops, async-first","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Active (community)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"java-operator-sdk","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Java","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"JVM shops","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Active (Red Hat / Java SIG)","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"Decision rules:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"New operator + Go shop → 
kubebuilder","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"New operator + Python shop → KOPF","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"New operator + can't pick a language → metacontroller","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"OpenShift target → operator-sdk","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"CRD design principles","type":"text"}]},{"type":"paragraph","content":[{"text":"See ","type":"text"},{"text":"references/crd_design.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" for full detail. Quick rules:","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"status is the source of truth for the controller's view of the world.","type":"text","marks":[{"type":"strong"}]},{"text":" Spec is what the user wants; status is what the controller observed.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Use the status subresource.","type":"text","marks":[{"type":"strong"}]},{"text":" Without it, status updates re-trigger reconcile (loop).","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Use Conditions.","type":"text","marks":[{"type":"strong"}]},{"text":" ","type":"text"},{"text":"Ready","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"Reconciling","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"Degraded","type":"text","marks":[{"type":"code_inline"}]},{"text":". 
Each carries a reason and message.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Add finalizers.","type":"text","marks":[{"type":"strong"}]},{"text":" Without finalizers, deletion races the controller and orphans external resources.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Version your CRD from day 1.","type":"text","marks":[{"type":"strong"}]},{"text":" ","type":"text"},{"text":"v1alpha1","type":"text","marks":[{"type":"code_inline"}]},{"text":" → ","type":"text"},{"text":"v1beta1","type":"text","marks":[{"type":"code_inline"}]},{"text":" → ","type":"text"},{"text":"v1","type":"text","marks":[{"type":"code_inline"}]},{"text":". Plan a conversion webhook.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Validate via OpenAPI v3 schema.","type":"text","marks":[{"type":"strong"}]},{"text":" Don't rely on the controller for validation that should fail at admission.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Use ","type":"text","marks":[{"type":"strong"}]},{"text":"additionalPrinterColumns","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" for ","type":"text","marks":[{"type":"strong"}]},{"text":"kubectl get","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":".","type":"text","marks":[{"type":"strong"}]},{"text":" Show ","type":"text"},{"text":"Age","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"Phase","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"Ready","type":"text","marks":[{"type":"code_inline"}]},{"text":" at minimum.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Namespace your CRDs unless they manage cluster-scoped 
resources.","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Reconcile loop principles","type":"text"}]},{"type":"paragraph","content":[{"text":"See ","type":"text"},{"text":"references/reconcile_loop.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" for full detail. Quick rules:","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Idempotent.","type":"text","marks":[{"type":"strong"}]},{"text":" Reconciling the same state twice → same result, zero side effects.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Read once, decide, act.","type":"text","marks":[{"type":"strong"}]},{"text":" Don't observe the world repeatedly during reconcile.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Update status, not spec.","type":"text","marks":[{"type":"strong"}]},{"text":" Spec belongs to the user.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Return errors that requeue.","type":"text","marks":[{"type":"strong"}]},{"text":" Use ","type":"text"},{"text":"ctrl.Result{RequeueAfter: ...}","type":"text","marks":[{"type":"code_inline"}]},{"text":" for known transient cases.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Never block.","type":"text","marks":[{"type":"strong"}]},{"text":" No ","type":"text"},{"text":"time.Sleep","type":"text","marks":[{"type":"code_inline"}]},{"text":". 
No long HTTP calls without context.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Use the cache.","type":"text","marks":[{"type":"strong"}]},{"text":" Read via the controller's cached client; only escape the cache for a specific reason.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Leader-elect when running >1 replica.","type":"text","marks":[{"type":"strong"}]},{"text":" Otherwise enable single-replica mode.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Set OwnerReferences.","type":"text","marks":[{"type":"strong"}]},{"text":" Cascading deletion is the operator pattern's free gift.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Workflows","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Workflow 1: Bootstrap a new operator (Go + kubebuilder)","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"1. Pick a Group/Version/Kind: e.g., apps.example.com/v1alpha1, kind=MyApp\n2. kubebuilder init --domain example.com --repo github.com/org/myapp-operator\n3. kubebuilder create api --group apps --version v1alpha1 --kind MyApp\n4. Run crd_validator.py on config/crd/bases/apps.example.com_myapps.yaml\n → Fix every WARN before writing controller code\n5. Implement the reconcile function (Karpathy principle 2: simplest correct version first)\n6. Run reconcile_lint.py on controllers/myapp_controller.go\n7. Run operator_capability_audit.py --operator-dir . — confirm L1\n8. Test in a kind cluster: kubectl apply -f config/samples/\n9. Add status conditions; aim for L2 in the same PR","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Workflow 2: Audit an existing operator","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"1. 
Run operator_capability_audit.py --operator-dir \u003cpath>\n2. Run crd_validator.py --crd config/crd/\n3. Run reconcile_lint.py --controller controllers/\n4. Triage findings:\n - FAIL → block release; fix before next deploy\n - WARN → file an issue; fix in next 30 days\n5. Document current capability level in README; commit\n6. Plan one capability level advancement per quarter","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Workflow 3: Choose a framework","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"1. Identify primary language constraint (team skill)\n2. Identify deployment target (vanilla k8s vs OpenShift)\n3. Identify operator complexity (single CRD vs multi-CRD vs cluster-wide)\n4. Cross-reference with references/tooling_landscape.md\n5. Build a 1-week proof-of-concept before committing","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"References","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"references/operator_pattern.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" — what an operator IS, when to use vs alternatives","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"references/crd_design.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" — CRD design principles, versioning, conversion webhooks","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"references/reconcile_loop.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" — reconcile patterns, error handling, idempotency","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"references/tooling_landscape.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" — framework comparison + decision tree","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Slash 
command","type":"text"}]},{"type":"paragraph","content":[{"text":"/operator-audit","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Run all 3 tools on an operator repo and produce a markdown report.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Asset templates","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"assets/crd_template.yaml","type":"text","marks":[{"type":"code_inline"}]},{"text":" — CRD with status subresource, conditions, finalizer hint, printer columns","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"assets/reconcile_skeleton.go","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Go controller reconcile function with idempotency, conditions, finalizers, requeue patterns","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Anti-patterns","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"time.Sleep(30 * time.Second)","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" inside reconcile","type":"text","marks":[{"type":"strong"}]},{"text":" — blocks other reconciles. 
Use ","type":"text"},{"text":"RequeueAfter","type":"text","marks":[{"type":"code_inline"}]},{"text":".","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"r.Client.Update(ctx, obj)","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" to set status","type":"text","marks":[{"type":"strong"}]},{"text":" — use ","type":"text"},{"text":"r.Status().Update(ctx, obj)","type":"text","marks":[{"type":"code_inline"}]},{"text":" instead.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"No leader election + 2+ replicas","type":"text","marks":[{"type":"strong"}]},{"text":" — split-brain.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"No finalizer","type":"text","marks":[{"type":"strong"}]},{"text":" — external resources orphan on deletion.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"CRD without status subresource","type":"text","marks":[{"type":"strong"}]},{"text":" — status updates trigger spec reconciles (infinite loop).","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Reconcile function > 200 lines","type":"text","marks":[{"type":"strong"}]},{"text":" — extract reconcileXxx subroutines per condition.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"x-kubernetes-preserve-unknown-fields: true","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" on spec root","type":"text","marks":[{"type":"strong"}]},{"text":" — defeats validation.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Imperative reconcile","type":"text","marks":[{"type":"strong"}]},{"text":" — \"if creating, do A; if updating, do B; if deleting, do C\". Wrong shape. 
Reconcile = make actual=desired, regardless of how we got here.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Verifiable success","type":"text"}]},{"type":"paragraph","content":[{"text":"A team using this skill should achieve:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"100% of new CRDs pass ","type":"text"},{"text":"crd_validator.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" before merge","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"All reconcile functions pass ","type":"text"},{"text":"reconcile_lint.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" strict mode","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Operators reach OperatorHub Capability Level 3 (Full Lifecycle) before public release","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Mean time to fix a reconcile bug: \u003c1 day (no infinite loops in 
production)","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}}]},"metadata":{"date":"2026-06-05","name":"kubernetes-operator","tags":["kubernetes","operator","crd","controller-runtime","kubebuilder","operator-sdk","metacontroller","kopf","reconcile","devops"],"author":"@skillopedia","source":{"stars":16818,"repo_name":"claude-skills","origin_url":"https://github.com/alirezarezvani/claude-skills/blob/HEAD/engineering/skills/kubernetes-operator/SKILL.md","repo_owner":"alirezarezvani","body_sha256":"ca808b0a66d9b120a813fa4582f8c80c04cde8a193cfccb7fbd4e6ab7cc834f8","cluster_key":"4279619aff91c92c3a17875df9e5294d5b9d43263f47379983118a05ca4d5810","clean_bundle":{"format":"clean-skill-bundle-v1","source":"alirezarezvani/claude-skills/engineering/skills/kubernetes-operator/SKILL.md","attachments":[{"id":"e1ac6821-ca0e-5eb5-92ff-ac98b9a6103a","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/e1ac6821-ca0e-5eb5-92ff-ac98b9a6103a/attachment.yaml","path":"assets/crd_template.yaml","size":2620,"sha256":"3afaafc885780b30210032d537629f01b0f7aeebdd734912895a52fcd6aede19","contentType":"application/yaml; charset=utf-8"},{"id":"04df9fb7-ac16-5181-a26a-de6fc0f79f9e","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/04df9fb7-ac16-5181-a26a-de6fc0f79f9e/attachment.go","path":"assets/reconcile_skeleton.go","size":4052,"sha256":"ff5f3d75b77d7f4053be74182acf0fe2482f5c42c4c726d28754b2e719733ea2","contentType":"text/plain; charset=utf-8"},{"id":"f8368f1a-bb85-58af-a87b-5d8ca125140e","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/f8368f1a-bb85-58af-a87b-5d8ca125140e/attachment.md","path":"references/crd_design.md","size":6998,"sha256":"970db86bfe5c27f71f673d1acb5ac973534c44c99944a8721b0a95645fb99a31","contentType":"text/markdown; 
charset=utf-8"},{"id":"68075974-2311-525e-a29c-b0ec4e47c220","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/68075974-2311-525e-a29c-b0ec4e47c220/attachment.md","path":"references/operator_pattern.md","size":7157,"sha256":"28ad0d8c1e165f5b0a2d095ad5890dcc122885dab88da666072714b8d2cb257d","contentType":"text/markdown; charset=utf-8"},{"id":"fb27c1c2-e5e6-5356-a471-ff5dc371b3f5","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/fb27c1c2-e5e6-5356-a471-ff5dc371b3f5/attachment.md","path":"references/reconcile_loop.md","size":7275,"sha256":"3ece01fa9b29c88429255e6920885ded737b2d761b2b68791641682b0595e619","contentType":"text/markdown; charset=utf-8"},{"id":"81fbc03d-6f6d-5696-b550-00da02a52b2e","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/81fbc03d-6f6d-5696-b550-00da02a52b2e/attachment.md","path":"references/tooling_landscape.md","size":7642,"sha256":"8d561136192786f37bee2560fac0ab4dbc3594dc607da853ffd545dde721e8bc","contentType":"text/markdown; charset=utf-8"},{"id":"bcd44aae-f648-52d8-aa09-561a8414e695","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/bcd44aae-f648-52d8-aa09-561a8414e695/attachment.py","path":"scripts/crd_validator.py","size":5939,"sha256":"e9151a593c13602108d79ca629099e0865849a49c9c5f5cbad09bfd65466d8b9","contentType":"text/x-python; charset=utf-8"},{"id":"1e7bd0ad-0fd9-574f-9aba-fcd24e6258f7","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/1e7bd0ad-0fd9-574f-9aba-fcd24e6258f7/attachment.py","path":"scripts/operator_capability_audit.py","size":5916,"sha256":"7ce49cb8d0347f5a8c121c9370361007019a71787141316900136a145b092221","contentType":"text/x-python; charset=utf-8"},{"id":"4a085d3c-4fcd-5376-b8bd-3fdbdeaf609e","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/4a085d3c-4fcd-5376-b8bd-3fdbdeaf609e/attachment.py","path":"scripts/reconcile_lint.py","size":6495,"sha256":"8f599a530d471080d160d82f48e828e27936319728e255192d19b075af761b3e","contentType":"text/x-python; 
charset=utf-8"}],"bundle_sha256":"caba260d0cb4e4055677fc6cd52abb5811c88df108e017920e7ef57e11e1f6c6","attachment_count":9,"text_attachments":9,"attachment_storage":"skillopedia-attachments-v1","binary_attachments":0,"excluded_attachments":[]},"cluster_size":4,"skill_md_path":"engineering/skills/kubernetes-operator/SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"security","category_label":"Security"},"exact_dupes_collapsed_into_this":3},"context":"fork","license":"MIT","version":"v1","category":"security","import_tag":"clean-skills-v1","description":"Use when building a Kubernetes Operator — custom controllers that reconcile CRD state. Triggers on \"build an operator\", \"CRD design\", \"reconcile loop\", \"controller-runtime\", \"kubebuilder\", \"operator-sdk\", \"metacontroller\", \"KOPF\", \"operator capability levels\", or \"custom resource\". Ships CRD validator, reconcile-loop linter, and OperatorHub capability auditor (all stdlib Python), 4 references on the operator pattern + CRD design + reconcile patterns + tooling landscape, and a /operator-audit slash command. NOT a generic k8s skill — specifically the Operator pattern.","compatible_tools":["claude-code","codex-cli","cursor","antigravity","opencode","gemini-cli"]}},"renderedAt":1782980190539}

Kubernetes Operator Build operators that reconcile correctly. Most operator bugs are not Kubernetes bugs — they are reconcile-loop bugs: missing finalizers, blocking calls, no requeue on transient errors, status drift, RBAC over-grants. This skill catches them deterministically before they reach a cluster. When to use - Building a new Kubernetes Operator (controller for a CRD) - Reviewing an existing operator for capability-level gaps - Auditing a CRD spec for status/conditions/finalizer correctness - Choosing a framework (controller-runtime / kubebuilder / operator-sdk / metacontroller / KOP…