# dualwrite.yaml — Prometheus recording + alerting rules for the
# Wave-B dual-write reconciler (cmd/reconciler).
#
# Companion to internal/orgunit/dualwrite/reconciler.go (metrics) and
# internal/orgunit/dualwrite_metrics.go (mirror-side counter).
#
# Shelfware gate: every alert references a metric emitted by real
# production code.
#
#   dualwrite_drift_total          -> internal/orgunit/dualwrite/reconciler.go (DriftTotal)
#   dualwrite_open_drift           -> internal/orgunit/dualwrite/reconciler.go (OpenDrift)
#   dualwrite_healed_total         -> internal/orgunit/dualwrite/reconciler.go (HealedTotal)
#   dualwrite_reconciler_run_seconds -> internal/orgunit/dualwrite/reconciler.go (RunDuration)
#   dualwrite_mirror_total         -> internal/orgunit/dualwrite_metrics.go (DualwriteMirrorTotal)
#
# References:
#   - Issue #583 (this task)
#   - LLD #560 addendum §Wave B Risk A1
#   - HLD #556 v1.1 §5.1
# Prometheus Operator custom resource carrying the recording and
# alerting rule groups declared under `spec.groups`.
kind: PrometheusRule
apiVersion: monitoring.coreos.com/v1
metadata:
  namespace: platform
  name: dualwrite-reconciler
  labels:
    # NOTE(review): `release` is presumably the label the Prometheus
    # instance's rule selector matches on — confirm against the
    # kube-prometheus-stack values before renaming it.
    release: kube-prometheus-stack
    app: reconciler
    role: alert-rules
spec:
  groups:
    # ──────────────────────────────────────────────────────────────
    # Recording rules — cheap, pre-aggregated series for dashboards.
    # ──────────────────────────────────────────────────────────────
    - name: dualwrite.recording
      interval: 1m
      rules:
        # Open drift per mirror table: the dualwrite_open_drift gauge
        # summed over every label except `table` (so all tenants are
        # folded together). This is the main dashboard tile — on-call
        # treats any non-zero value as the trigger to investigate.
        - record: dualwrite_open_drift_total
          expr: |
            sum(dualwrite_open_drift) by (table)

        # Per-table ratio of healed rows to detected drift rows over a
        # rolling 48h window. With drift healed inside one daily tick
        # the ratio sits at 1; falling under 0.9 means detection still
        # works but the heal path is silently failing.
        #
        # Both operands are reduced to the single `table` label before
        # dividing, so the default one-to-one matching pairs them up
        # and no vector-matching modifier is required.
        - record: dualwrite_heal_ratio_48h
          expr: |
            sum(rate(dualwrite_healed_total[48h])) by (table)
            /
            sum(rate(dualwrite_drift_total[48h])) by (table)

        # Share of mirror writes with outcome="ok" per target table,
        # computed from 5m rates (green is 99.9%). Used to tell apart
        # drift that comes from missed mirror writes and drift that
        # comes from legacy-only writes made during the Wave A
        # pre-cutover period.
        - record: dualwrite_mirror_success_ratio
          expr: |
            sum(rate(dualwrite_mirror_total{outcome="ok"}[5m])) by (table)
            /
            sum(rate(dualwrite_mirror_total[5m])) by (table)

    # ──────────────────────────────────────────────────────────────
    # Alerting rules.
    # ──────────────────────────────────────────────────────────────
    - name: dualwrite.alerts
      rules:
        # ──────────────────────────────────────────────────────────
        # A1. Open drift stays above zero for more than 1h → P2 page.
        #
        # Implements the Risk-A1 gate of LLD #560 addendum §Wave B
        # ("Zero drift on staging fixtures for 48h before Wave E
        # starts"). Un-healed drift that outlives one hour for a
        # tenant means one of two things: the reconciler could not
        # heal it, or new drift arrives faster than the daily heal
        # ticks remove it. On-call has to hear about either case
        # BEFORE Wave E begins reading from the mirror tables.
        #
        # The expression is evaluated per raw series, so `table` and
        # `tenant_id` remain available to the annotation templates.
        # ──────────────────────────────────────────────────────────
        - alert: DualwriteDriftPersisting
          for: 1h
          expr: dualwrite_open_drift > 0
          labels:
            team: platform
            component: reconciler
            severity: critical
            risk: A1
          annotations:
            runbook_url: https://github.com/upsquad-ai/upsquad-core/blob/main/docs/runbooks/dualwrite-reconciler.md
            summary: 'Dual-write drift has persisted > 1h for tenant {{ $labels.tenant_id }}'
            description: |
              The reconciler detected {{ $value }} unhealed rows in
              `{{ $labels.table }}` for tenant `{{ $labels.tenant_id }}`
              over the last hour. Either:

                1. The heal path is failing silently (check
                   `dualwrite_heal_ratio_48h` and reconciler logs
                   for "tenant errored" entries).
                2. Drift is being re-introduced faster than the
                   daily heal interval (check
                   `dualwrite_mirror_success_ratio{table="{{ $labels.table }}"}`
                   — values < 0.999 mean hooks are failing).

              Wave E MUST NOT go live until this alert clears —
              serving reads from a drifted mirror will surface
              inconsistent team membership to end-users.

        # ──────────────────────────────────────────────────────────
        # A2. Reconciler not running at all.
        #
        # No detect pass has completed in the last 2h, which is 2× the
        # default detect interval. Triggers a P2 page because the
        # DualwriteDriftPersisting alert above silently assumes the
        # reconciler is emitting metrics.
        #
        # Why increase() and not timestamp(): timestamp() yields the
        # time the sample was scraped, not the time the counter last
        # moved. While the pod is up that is always "just now", and
        # once the pod is gone the series turns stale and the
        # expression returns nothing — so an age check built on
        # timestamp() can never exceed the threshold.
        #
        # The expression therefore asks "did the detect-pass counter
        # grow in the last 2h?":
        #   - increase() == 0     → process is scraped but passes are
        #                           not completing (deadlock, retry
        #                           loop).
        #   - `or vector(0)`      → no usable samples in the window at
        #                           all (crashed, never deployed, not
        #                           scraped); substitutes 0 so the
        #                           comparison still fires.
        # sum() folds all replicas together: one healthy replica is
        # enough to keep the alert quiet.
        #
        # NOTE(review): increase() cannot see the very first increment
        # of a series that did not exist before, so if the histogram is
        # not exported at zero on startup a fresh rollout may only be
        # counted from its second pass — confirm how the reconciler
        # registers dualwrite_reconciler_run_seconds.
        # ──────────────────────────────────────────────────────────
        - alert: DualwriteReconcilerStalled
          expr: |
            (
              sum(
                increase(
                  dualwrite_reconciler_run_seconds_count{mode="detect"}[2h]
                )
              )
              or
              vector(0)
            ) == 0
          for: 5m
          labels:
            severity: critical
            component: reconciler
            team: platform
          annotations:
            summary: "Dual-write reconciler has not run a detect pass in > 2h"
            description: |
              `dualwrite_reconciler_run_seconds_count{mode="detect"}`
              has not advanced in over 2 hours, or the series is not
              being scraped at all. The reconciler is
              either crashed, deadlocked on the advisory lock, or
              losing its Postgres connection on every retry.

              Check:
                1. kubectl -n platform logs deploy/reconciler
                2. `pg_stat_activity` for hanging connections holding
                   `upsquad.orgunit_reconciler` advisory lock.
                3. Disk / connection saturation on the primary DB.
            runbook_url: https://github.com/upsquad-ai/upsquad-core/blob/main/docs/runbooks/dualwrite-reconciler.md

        # ──────────────────────────────────────────────────────────
        # A3. Heal ratio collapse.
        #
        # Drift is still being detected for a table, but healing it is
        # failing. Compared with A1 — which fires whatever the heal
        # outcome — this is the gentler signal: it surfaces a
        # heal-only regression to on-call before it grows into a
        # sustained drift alert.
        #
        # Reads the recorded series dualwrite_heal_ratio_48h, which
        # carries one sample per `table`.
        # ──────────────────────────────────────────────────────────
        - alert: DualwriteHealRatioLow
          for: 30m
          expr: dualwrite_heal_ratio_48h < 0.9
          labels:
            team: platform
            component: reconciler
            severity: warning
          annotations:
            runbook_url: https://github.com/upsquad-ai/upsquad-core/blob/main/docs/runbooks/dualwrite-reconciler.md
            summary: 'Dual-write heal ratio < 0.9 for table {{ $labels.table }}'
            description: |
              Over the last 48h the reconciler detected more drift
              rows than it was able to heal in `{{ $labels.table }}`
              (ratio = {{ $value | humanize }}). Sustained values
              below 0.9 indicate the heal SQL is hitting constraint
              violations or the RLS GUC is not set correctly for the
              tenant.

        # ──────────────────────────────────────────────────────────
        # A4. Mirror write failure rate elevated.
        #
        # The dualwrite.DualWriter hooks are failing on the write
        # path. This is upstream of the reconciler — drift will
        # accumulate until the hook is fixed.
        #
        # The err rate is summed per `table` before it is compared
        # with the threshold, matching both the recording rules in
        # this file and the per-table wording of the annotations.
        # Without the aggregation the 0.1/s threshold would apply to
        # each raw series separately, so errors spread over several
        # replicas could exceed it in total without alerting, and one
        # table could raise several duplicate alerts.
        # ──────────────────────────────────────────────────────────
        - alert: DualwriteMirrorErrorsHigh
          expr: |
            sum by (table) (
              rate(dualwrite_mirror_total{outcome="err"}[5m])
            ) > 0.1
          for: 5m
          labels:
            severity: warning
            component: tenant-service
            team: platform
          annotations:
            summary: "Dual-write mirror error rate > 0.1/s on table {{ $labels.table }}"
            description: |
              `dualwrite_mirror_total{outcome="err",table="{{ $labels.table }}"}`
              is incrementing at {{ $value | humanize }}/s over the
              last 5 minutes. Every err-outcome rolls back the caller
              transaction so legacy writes are also blocked — check
              gateway 5xx rate for correlated impact.
            runbook_url: https://github.com/upsquad-ai/upsquad-core/blob/main/docs/runbooks/dualwrite-reconciler.md
