Scaffold dd0c/cost: Welford baseline, anomaly scorer, governance engine, tests

- Welford online algorithm for running mean/stddev baselines
- Anomaly scorer: z-score → 0-100 mapping, property-based tests (10K runs, fast-check)
- Governance engine: 14-day auto-promotion with FP rate gate, injectable Clock
- Panic mode: defaults to active (safe) when Redis unreachable
- Tests: 12 scorer cases (incl. 2x 10K property-based), 9 governance cases, 3 panic mode cases
- PostgreSQL schema with RLS: baselines (optimistic locking), anomalies, remediation_actions
- Fly.io config, Dockerfile
2026-03-01 02:52:53 +00:00
parent 23db74b306
commit 6f692fc5ef
8 changed files with 540 additions and 0 deletions
--- a/products/05-aws-cost-anomaly/src/detection/scorer.ts
+++ b/products/05-aws-cost-anomaly/src/detection/scorer.ts
@@ -0,0 +1,99 @@
+/**
+ * Running mean / variance accumulator based on Welford's online algorithm.
+ *
+ * Every observation is folded into three numbers (`count`, `mean`, `m2`), so a
+ * cost baseline can be updated without keeping the individual samples around.
+ * The class performs no I/O and no locking of its own; persisting a snapshot
+ * and resolving concurrent writers is left to the caller.
+ * NOTE(review): the earlier comment attributed concurrency handling to
+ * DynamoDB conditional writes (BMad must-have) — confirm which store is used.
+ */
+export class WelfordBaseline {
+  /**
+   * @param count number of observations folded in so far
+   * @param mean  running arithmetic mean of those observations
+   * @param m2    running sum of squared differences from the mean
+   */
+  constructor(
+    public count = 0,
+    public mean = 0,
+    public m2 = 0,
+  ) {}
+
+  /**
+   * Fold one more observation into the running statistics.
+   *
+   * @param value the new observation
+   */
+  update(value: number): void {
+    const nextCount = this.count + 1;
+    // Distance to the mean *before* this observation is included.
+    const offsetBefore = value - this.mean;
+    const nextMean = this.mean + offsetBefore / nextCount;
+
+    this.count = nextCount;
+    this.mean = nextMean;
+    // Welford step: old-mean offset times new-mean offset.
+    this.m2 += offsetBefore * (value - nextMean);
+  }
+
+  /** Population standard deviation (divides by `count`); 0 with fewer than two observations. */
+  get stddev(): number {
+    return this.count < 2 ? 0 : Math.sqrt(this.m2 / this.count);
+  }
+
+  /** Sample standard deviation (divides by `count - 1`); 0 with fewer than two observations. */
+  get sampleStddev(): number {
+    return this.count < 2 ? 0 : Math.sqrt(this.m2 / (this.count - 1));
+  }
+
+  /** Plain-object snapshot of the three accumulators, suitable for storage. */
+  toJSON(): { count: number; mean: number; m2: number } {
+    const { count, mean, m2 } = this;
+    return { count, mean, m2 };
+  }
+
+  /** Rebuild a baseline from a snapshot produced by `toJSON`. */
+  static fromJSON(data: { count: number; mean: number; m2: number }): WelfordBaseline {
+    const { count, mean, m2 } = data;
+    return new WelfordBaseline(count, mean, m2);
+  }
+}
+
+/**
+ * Anomaly scorer — computes a 0-100 score based on how far
+ * the current cost deviates from the baseline.
+ *
+ * Only costs above the mean contribute; costs at or below it score 0.
+ * The result is always a number in [0, 100]: inputs that cannot yield a
+ * meaningful z-score (NaN anywhere, or a negative `stddev`) score 0 instead
+ * of leaking NaN or an inverted score to callers.
+ *
+ * @param input.cost   the observed cost to evaluate
+ * @param input.mean   baseline mean
+ * @param input.stddev baseline standard deviation (expected to be >= 0)
+ * @returns score in [0, 100], rounded to 2 decimal places
+ */
+export function scoreAnomaly(input: {
+  cost: number;
+  mean: number;
+  stddev: number;
+}): number {
+  const { cost, mean, stddev } = input;
+
+  // Guard: no baseline yet
+  if (mean === 0 && stddev === 0) return 0;
+
+  // Guard: NaN or negative spread is not a usable baseline. A negative value
+  // would flip the sign of the z-score and flag costs *below* the mean.
+  if (!(stddev >= 0)) return 0;
+
+  // Guard: zero stddev (all values identical)
+  if (stddev === 0) {
+    return cost > mean ? 100 : 0;
+  }
+
+  // Z-score: how many standard deviations from mean
+  const zScore = (cost - mean) / stddev;
+
+  // NaN cost/mean (or Infinity - Infinity) cannot be ranked; Math.min/Math.max
+  // would pass the NaN straight through, so stop here.
+  if (Number.isNaN(zScore)) return 0;
+
+  // Map z-score to 0-100 (clamped)
+  // z=0 → 0, z=1 → 25, z=2 → 50, z=3 → 75, z>=4 → 100
+  const score = Math.min(100, Math.max(0, zScore * 25));
+
+  return Math.round(score * 100) / 100; // 2 decimal places
+}
+
+/**
+ * Decide whether an anomaly score is high enough to raise an alert.
+ *
+ * @param score     anomaly score to test
+ * @param threshold minimum score that triggers an alert (inclusive); defaults to 50
+ * @returns `true` when `score` is at or above `threshold`
+ */
+export function shouldAlert(score: number, threshold = 50): boolean {
+  const reachesThreshold = score >= threshold;
+  return reachesThreshold;
+}
+
+export interface CostEvent { // shape of one cost observation; referenced by AnomalyResult.event
+  tenantId: string; // presumably the owning tenant — confirm against the producer
+  accountId: string; // presumably the cloud account the cost belongs to — confirm
+  resourceType: string; // e.g. 'ec2/m5.xlarge', 'rds/db.r5.large'
+  hourlyCost: number; // NOTE(review): currency/unit is not stated here — confirm
+  timestamp: number; // Unix ms
+  region: string;
+  tags: Record<string, string>; // free-form string key/value pairs
+}
+
+export interface AnomalyResult { // a CostEvent paired with its score and the baseline figures
+  event: CostEvent; // the event this result refers to
+  score: number; // presumably the 0-100 value from scoreAnomaly — confirm against the producer
+  baseline: { mean: number; stddev: number; count: number }; // baseline statistics reported alongside the score
+  isAnomaly: boolean; // presumably shouldAlert(score) — confirm against the producer
+}
--- a/products/05-aws-cost-anomaly/src/governance/engine.ts
+++ b/products/05-aws-cost-anomaly/src/governance/engine.ts
@@ -0,0 +1,80 @@
+import pino from 'pino';
+
+const logger = pino({ name: 'governance' }); // module-level pino logger, created with name 'governance'
+
+/**
+ * Clock interface for deterministic governance tests (BMad must-have).
+ * Inject FakeClock in tests, RealClock in production.
+ *
+ * `now()` returns the current time as a number. Both implementations in this
+ * file work in epoch milliseconds: RealClock wraps `Date.now()`, and
+ * FakeClock starts from `Date.now()` by default and advances by `ms`.
+ */
+export interface Clock {
+  now(): number; // current time reading (epoch milliseconds in RealClock)
+}
+
+/** Production clock: reports the system time via `Date.now()` (epoch milliseconds). */
+export class RealClock implements Clock {
+  now(): number {
+    return Date.now();
+  }
+}
+
+/** Manually driven clock for tests: its reading only changes when `advanceBy` is called. */
+export class FakeClock implements Clock {
+  private current: number;
+
+  /** @param start initial reading in milliseconds; defaults to the system time at construction */
+  constructor(start: number = Date.now()) {
+    this.current = start;
+  }
+
+  /** Current reading of the fake clock. */
+  now(): number {
+    return this.current;
+  }
+
+  /** Shift the reading by `ms` milliseconds (added to the current value). */
+  advanceBy(ms: number): void {
+    this.current = this.current + ms;
+  }
+}
+
+export interface PromotionResult { // return shape of GovernanceEngine.evaluatePromotion
+  promoted: boolean; // true when the tenant qualifies for promotion
+  newMode?: 'audit' | 'enforce'; // set when promoted; evaluatePromotion currently only ever returns 'audit'
+  reason?: string; // human-readable explanation, set when promotion is refused
+}
+
+const PROMOTION_DAYS = 14; // minimum days in the current mode before promotion is considered
+const MAX_FP_RATE = 0.10; // 10% false positive rate; rates above this block promotion
+
+/**
+ * Governance engine — manages auto-promotion from shadow → audit → enforce.
+ * Uses injectable Clock for deterministic testing.
+ *
+ * Both public checks fail closed: unusable promotion stats never promote a
+ * tenant, and an unreachable Redis is reported as panic mode being active.
+ */
+export class GovernanceEngine {
+  // Injected time source; no method in this class reads it yet.
+  private clock: Clock;
+
+  /** @param clock time source; defaults to a RealClock */
+  constructor(clock: Clock = new RealClock()) {
+    this.clock = clock;
+  }
+
+  /**
+   * Evaluate whether a tenant should be promoted to the next governance mode.
+   * Requires: 14+ days in current mode AND a false positive rate of at most
+   * 10% (a rate of exactly 10% still qualifies).
+   *
+   * NOTE(review): a successful result always reports `newMode: 'audit'`; the
+   * stats carry no current mode, so audit → enforce cannot be derived here.
+   *
+   * @param tenantId tenant being evaluated (not used by the decision itself)
+   * @param stats.fpRate false positive rate as a fraction (0.10 = 10%)
+   * @param stats.daysInCurrentMode days the tenant has spent in its current mode
+   * @returns `promoted: true` with the new mode, or `promoted: false` with a reason
+   */
+  evaluatePromotion(
+    tenantId: string,
+    stats: { fpRate: number; daysInCurrentMode: number },
+  ): PromotionResult {
+    // Fail closed on NaN/±Infinity. Every comparison against NaN is false, so
+    // without this guard both gates below would be skipped and the tenant
+    // promoted on garbage input.
+    if (!Number.isFinite(stats.fpRate) || !Number.isFinite(stats.daysInCurrentMode)) {
+      return {
+        promoted: false,
+        reason: 'invalid stats — fpRate and daysInCurrentMode must be finite numbers',
+      };
+    }
+
+    if (stats.daysInCurrentMode < PROMOTION_DAYS) {
+      return { promoted: false, reason: `Only ${stats.daysInCurrentMode} days — need ${PROMOTION_DAYS}` };
+    }
+
+    if (stats.fpRate > MAX_FP_RATE) {
+      return {
+        promoted: false,
+        reason: `false-positive rate ${(stats.fpRate * 100).toFixed(1)}% exceeds ${MAX_FP_RATE * 100}% threshold`,
+      };
+    }
+
+    return { promoted: true, newMode: 'audit' };
+  }
+
+  /**
+   * Check panic mode status.
+   * Defaults to panic=active (safe) when Redis is unreachable (BMad must-have).
+   *
+   * @param tenantId tenant whose `panic:<tenantId>` key is read
+   * @param redis client exposing an async `get(key)`; any rejection is treated as "unreachable"
+   * @returns `true` when the stored value is exactly `'1'` or the lookup failed
+   */
+  async checkPanicMode(tenantId: string, redis: any): Promise<boolean> {
+    try {
+      const val = await redis.get(`panic:${tenantId}`);
+      return val === '1';
+    } catch (err) {
+      // A rejection is not guaranteed to be an Error instance; narrow before
+      // reading `.message` so the log entry never carries `undefined`.
+      const message = err instanceof Error ? err.message : String(err);
+      logger.warn({ tenantId, error: message },
+        'Redis unreachable — defaulting to panic=active');
+      return true; // Safe default: panic IS active
+    }
+  }
+}