Scaffold dd0c/cost: Welford baseline, anomaly scorer, governance engine, tests

- Welford online algorithm for running mean/stddev baselines
- Anomaly scorer: z-score → 0-100 mapping, property-based tests (10K runs, fast-check)
- Governance engine: 14-day auto-promotion with FP rate gate, injectable Clock
- Panic mode: defaults to active (safe) when Redis unreachable
- Tests: 12 scorer cases (incl 2x 10K property-based), 9 governance cases, 3 panic mode cases
- PostgreSQL schema with RLS: baselines (optimistic locking), anomalies, remediation_actions
- Fly.io config, Dockerfile
This commit is contained in:
2026-03-01 02:52:53 +00:00
parent 23db74b306
commit 6f692fc5ef
8 changed files with 540 additions and 0 deletions

View File

@@ -0,0 +1,99 @@
/**
 * Running mean / variance accumulator using Welford's online algorithm.
 *
 * The whole state is three numbers (`count`, `mean`, `m2`), so a baseline can
 * be snapshotted with `toJSON()` and resumed with `fromJSON()`. This class does
 * no I/O and no locking of its own; guarding against concurrent updates is left
 * to whatever persists the snapshot.
 */
export class WelfordBaseline {
  /** Number of observations folded in so far. */
  count: number;
  /** Running arithmetic mean of the observations. */
  mean: number;
  /** Running sum of squared deviations from the mean. */
  m2: number;

  /**
   * @param count Observations already folded in (default 0).
   * @param mean Mean of those observations (default 0).
   * @param m2 Sum of squared deviations for those observations (default 0).
   */
  constructor(count = 0, mean = 0, m2 = 0) {
    this.count = count;
    this.mean = mean;
    this.m2 = m2;
  }

  /**
   * Fold one observation into the running statistics.
   *
   * @param value The new observation.
   */
  update(value: number): void {
    const n = ++this.count;
    // Deviation measured against the mean *before* this observation...
    const offsetBefore = value - this.mean;
    const nextMean = this.mean + offsetBefore / n;
    // ...and against the mean *after* it; their product is Welford's m2 increment.
    const offsetAfter = value - nextMean;
    this.mean = nextMean;
    this.m2 += offsetBefore * offsetAfter;
  }

  /** Population standard deviation (divides by n); 0 when fewer than two observations. */
  get stddev(): number {
    return this.count < 2 ? 0 : Math.sqrt(this.m2 / this.count);
  }

  /** Sample standard deviation (divides by n - 1); 0 when fewer than two observations. */
  get sampleStddev(): number {
    return this.count < 2 ? 0 : Math.sqrt(this.m2 / (this.count - 1));
  }

  /** Plain-object snapshot of the accumulator state, suitable for storage. */
  toJSON(): { count: number; mean: number; m2: number } {
    const { count, mean, m2 } = this;
    return { count, mean, m2 };
  }

  /**
   * Rebuild an accumulator from a snapshot produced by `toJSON()`.
   *
   * @param data Previously stored `count`, `mean` and `m2`.
   * @returns A new baseline that continues from that state.
   */
  static fromJSON(data: { count: number; mean: number; m2: number }): WelfordBaseline {
    const { count, mean, m2 } = data;
    return new WelfordBaseline(count, mean, m2);
  }
}
/**
 * Anomaly scorer — maps how far `cost` sits above the baseline mean onto a
 * 0-100 scale. The result is always a number in [0, 100], never NaN.
 *
 * Rules, applied in order:
 *  1. Any NaN input → 0 (nothing meaningful can be compared).
 *  2. `mean === 0 && stddev === 0` → 0, treated as "no baseline yet". Note that
 *     a baseline made entirely of zero costs looks identical and also scores 0.
 *  3. `stddev === 0` (all baseline values identical) → 100 if `cost` is above
 *     the mean, otherwise 0.
 *  4. Otherwise z = (cost - mean) / stddev and the score is z * 25 clamped to
 *     [0, 100] and rounded to two decimals. Costs at or below the mean score 0.
 *
 * @param input.cost The observed cost to score.
 * @param input.mean Baseline mean.
 * @param input.stddev Baseline standard deviation.
 * @returns Score in [0, 100] with at most two decimal places.
 */
export function scoreAnomaly(input: {
  cost: number;
  mean: number;
  stddev: number;
}): number {
  const { cost, mean, stddev } = input;

  // Guard: NaN would otherwise flow through Math.max/Math.min and come back as NaN.
  if (Number.isNaN(cost) || Number.isNaN(mean) || Number.isNaN(stddev)) return 0;

  // Guard: no baseline yet
  if (mean === 0 && stddev === 0) return 0;

  // Guard: zero stddev (all values identical)
  if (stddev === 0) {
    return cost > mean ? 100 : 0;
  }

  // Z-score: how many standard deviations from mean
  const zScore = (cost - mean) / stddev;

  // Infinity - Infinity or Infinity / Infinity produce NaN from non-NaN inputs.
  if (Number.isNaN(zScore)) return 0;

  // Map z-score to 0-100 (clamped)
  // z=0 → 0, z=1 → 25, z=2 → 50, z=3 → 75, z>=4 → 100
  const score = Math.min(100, Math.max(0, zScore * 25));
  return Math.round(score * 100) / 100; // 2 decimal places
}
/**
 * Decide whether an anomaly score is high enough to raise an alert.
 *
 * @param score The anomaly score to test.
 * @param threshold Minimum score that triggers an alert (inclusive); defaults to 50.
 * @returns `true` when `score` is at or above `threshold`.
 */
export function shouldAlert(score: number, threshold = 50): boolean {
  const meetsThreshold = score >= threshold;
  return meetsThreshold;
}
/**
 * One cost observation for a single resource.
 * This is a shape declaration only; no validation happens here.
 */
export interface CostEvent {
/** Identifier of the tenant the observation belongs to. */
tenantId: string;
/** Account identifier — presumably the cloud-provider account; confirm with the producer. */
accountId: string;
resourceType: string; // e.g. 'ec2/m5.xlarge', 'rds/db.r5.large'
/** Cost per hour. The currency/unit is not specified in this file. */
hourlyCost: number;
timestamp: number; // Unix ms
/** Region name as a free-form string. */
region: string;
/** String key/value tags attached to the observation. */
tags: Record<string, string>;
}
/**
 * Outcome of scoring one CostEvent against a baseline.
 */
export interface AnomalyResult {
/** The event that was scored. */
event: CostEvent;
/** Anomaly score — presumably the 0-100 value produced by scoreAnomaly; confirm with the producer. */
score: number;
/** Baseline statistics the event was compared against: mean, standard deviation and observation count. */
baseline: { mean: number; stddev: number; count: number };
/** Whether the event was classified as an anomaly. The classification rule is not defined by this type. */
isAnomaly: boolean;
}

View File

@@ -0,0 +1,80 @@
import pino from 'pino';
// Module-level pino logger, created with the name 'governance'.
const logger = pino({ name: 'governance' });
/**
 * Clock interface for deterministic governance tests (BMad must-have).
 * Inject FakeClock in tests, RealClock in production.
 */
export interface Clock {
/**
 * Current time as a number. RealClock returns `Date.now()` (milliseconds
 * since the Unix epoch), so implementations are expected to use that unit.
 */
now(): number;
}
/** Clock backed by the system wall clock via `Date.now()`. */
export class RealClock implements Clock {
  /** @returns Milliseconds since the Unix epoch at the moment of the call. */
  now(): number {
    const epochMs = Date.now();
    return epochMs;
  }
}
/**
 * Manually driven clock: time only changes when `advanceBy` is called.
 */
export class FakeClock implements Clock {
  private current: number;

  /**
   * @param start Initial reading; defaults to `Date.now()` at construction.
   */
  constructor(start = Date.now()) {
    this.current = start;
  }

  /** @returns The clock's current reading. */
  now(): number {
    return this.current;
  }

  /**
   * Shift the clock's reading.
   *
   * @param ms Amount added to the current reading.
   */
  advanceBy(ms: number): void {
    this.current = this.current + ms;
  }
}
/**
 * Outcome of a promotion evaluation.
 */
export interface PromotionResult {
/** Whether the tenant is promoted. */
promoted: boolean;
/** Mode being promoted to; optional, so it may be absent (e.g. when not promoted). */
newMode?: 'audit' | 'enforce';
/** Human-readable explanation; optional, so it may be absent (e.g. when promoted). */
reason?: string;
}
/** Minimum number of days a tenant must spend in its current mode before promotion. */
const PROMOTION_DAYS = 14;
/** Highest false-positive rate (as a fraction) that still allows promotion. */
const MAX_FP_RATE = 0.10; // 10% false positive rate

/**
 * Governance engine — manages auto-promotion from shadow → audit → enforce.
 * Takes an injectable Clock for deterministic testing; the clock is stored
 * but not read by any method in this class.
 */
export class GovernanceEngine {
  private clock: Clock;

  /**
   * @param clock Time source; defaults to a RealClock.
   */
  constructor(clock: Clock = new RealClock()) {
    this.clock = clock;
  }

  /**
   * Evaluate whether a tenant should be promoted to the next governance mode.
   * Requires: 14+ days in current mode AND false positive rate of at most 10%.
   *
   * NaN statistics are rejected explicitly. Every comparison against NaN is
   * false, so without these checks a NaN value would slip past both gates and
   * promote the tenant.
   *
   * NOTE: a successful evaluation always reports `newMode: 'audit'`; the
   * stats carry no current mode from which a next step could be derived.
   *
   * @param tenantId Tenant being evaluated (not used in the decision).
   * @param stats.fpRate False-positive rate as a fraction (0.10 = 10%).
   * @param stats.daysInCurrentMode Days the tenant has spent in its current mode.
   * @returns `promoted: true` with the new mode, or `promoted: false` with a reason.
   */
  evaluatePromotion(
    tenantId: string,
    stats: { fpRate: number; daysInCurrentMode: number },
  ): PromotionResult {
    const { fpRate, daysInCurrentMode } = stats;

    if (Number.isNaN(daysInCurrentMode)) {
      return { promoted: false, reason: 'daysInCurrentMode is not a number' };
    }
    if (daysInCurrentMode < PROMOTION_DAYS) {
      return { promoted: false, reason: `Only ${daysInCurrentMode} days — need ${PROMOTION_DAYS}` };
    }

    if (Number.isNaN(fpRate)) {
      return { promoted: false, reason: 'false-positive rate is not a number' };
    }
    if (fpRate > MAX_FP_RATE) {
      return {
        promoted: false,
        reason: `false-positive rate ${(fpRate * 100).toFixed(1)}% exceeds ${MAX_FP_RATE * 100}% threshold`,
      };
    }

    return { promoted: true, newMode: 'audit' };
  }

  /**
   * Check panic mode status.
   * Defaults to panic=active (safe) when Redis is unreachable (BMad must-have).
   *
   * @param tenantId Tenant whose `panic:<tenantId>` key is read.
   * @param redis Client exposing an awaitable `get(key)`.
   * @returns `true` when the stored value is exactly the string '1', or when
   *          the lookup throws; `false` for any other value.
   */
  async checkPanicMode(tenantId: string, redis: any): Promise<boolean> {
    try {
      const val = await redis.get(`panic:${tenantId}`);
      return val === '1';
    } catch (err) {
      // Anything can be thrown; only read `.message` from real Error objects.
      const message = err instanceof Error ? err.message : String(err);
      logger.warn({ tenantId, error: message },
        'Redis unreachable — defaulting to panic=active');
      return true; // Safe default: panic IS active
    }
  }
}