Scaffold dd0c/cost: Welford baseline, anomaly scorer, governance engine, tests
- Welford online algorithm for running mean/stddev baselines
- Anomaly scorer: z-score → 0-100 mapping, property-based tests (10K runs, fast-check)
- Governance engine: 14-day auto-promotion with FP rate gate, injectable Clock
- Panic mode: defaults to active (safe) when Redis unreachable
- Tests: 12 scorer cases (incl. 2x 10K property-based), 9 governance cases, 3 panic mode cases
- PostgreSQL schema with RLS: baselines (optimistic locking), anomalies, remediation_actions
- Fly.io config, Dockerfile
This commit is contained in:
99
products/05-aws-cost-anomaly/src/detection/scorer.ts
Normal file
99
products/05-aws-cost-anomaly/src/detection/scorer.ts
Normal file
@@ -0,0 +1,99 @@
|
||||
/**
 * Running mean and standard deviation via Welford's online algorithm.
 *
 * The whole baseline is three numbers (`count`, `mean`, `m2`), so it can be
 * updated one observation at a time and round-tripped through
 * `toJSON()` / `fromJSON()`. The class performs no locking of its own;
 * callers that persist the state must guard against concurrent writers.
 */
export class WelfordBaseline {
  /** Number of observations folded into the baseline so far. */
  count: number;
  /** Running arithmetic mean of the observations. */
  mean: number;
  /** Sum of squared differences from the running mean. */
  m2: number;

  /**
   * @param count Observations already folded in (default 0).
   * @param mean  Running mean of those observations (default 0).
   * @param m2    Sum of squared differences from the mean (default 0).
   */
  constructor(count = 0, mean = 0, m2 = 0) {
    this.count = count;
    this.mean = mean;
    this.m2 = m2;
  }

  /**
   * Fold one observation into the baseline.
   *
   * @param value The new observation; must be a finite number.
   * @throws RangeError if `value` is NaN or ±Infinity. The state is left
   *   untouched in that case, because a single non-finite value would
   *   otherwise turn `mean` and `m2` into NaN for every later update.
   */
  update(value: number): void {
    if (!Number.isFinite(value)) {
      throw new RangeError(
        `WelfordBaseline.update: expected a finite number, got ${value}`,
      );
    }

    const nextCount = this.count + 1;
    const deltaBefore = value - this.mean; // distance from the old mean
    const nextMean = this.mean + deltaBefore / nextCount;
    const deltaAfter = value - nextMean; // distance from the new mean

    this.count = nextCount;
    this.mean = nextMean;
    this.m2 += deltaBefore * deltaAfter;
  }

  /** Population standard deviation (divides by `count`); 0 with fewer than two observations. */
  get stddev(): number {
    if (this.count < 2) return 0;
    return Math.sqrt(this.m2 / this.count);
  }

  /** Sample standard deviation (divides by `count - 1`); 0 with fewer than two observations. */
  get sampleStddev(): number {
    if (this.count < 2) return 0;
    return Math.sqrt(this.m2 / (this.count - 1));
  }

  /** Plain-object snapshot of the state, suitable for persistence. */
  toJSON(): { count: number; mean: number; m2: number } {
    return { count: this.count, mean: this.mean, m2: this.m2 };
  }

  /**
   * Rebuild a baseline from a snapshot produced by `toJSON()`.
   * The fields are copied as-is; no validation is performed.
   */
  static fromJSON(data: { count: number; mean: number; m2: number }): WelfordBaseline {
    return new WelfordBaseline(data.count, data.mean, data.m2);
  }
}
|
||||
|
||||
/**
 * Anomaly scorer — maps how far `cost` sits above the baseline mean,
 * measured in standard deviations, onto a 0-100 score.
 *
 * Mapping (linear, clamped): z ≤ 0 → 0, z = 1 → 25, z = 2 → 50,
 * z = 3 → 75, z ≥ 4 → 100. Only costs above the mean can score above 0.
 *
 * @param input.cost   The observed cost.
 * @param input.mean   Baseline mean.
 * @param input.stddev Baseline standard deviation (expected to be ≥ 0).
 * @returns A score in [0, 100], rounded to two decimal places. Returns 0
 *   when the input cannot be scored: any NaN field, a negative `stddev`,
 *   or an indeterminate z-score (e.g. `Infinity - Infinity`).
 */
export function scoreAnomaly(input: {
  cost: number;
  mean: number;
  stddev: number;
}): number {
  const { cost, mean, stddev } = input;

  // Guard: unusable statistics. Without this, NaN would flow through
  // Math.min/Math.max and escape the documented 0-100 range, and a negative
  // stddev would flip the sign of the z-score.
  if (Number.isNaN(cost) || Number.isNaN(mean) || Number.isNaN(stddev) || stddev < 0) {
    return 0;
  }

  // Guard: no baseline yet
  if (mean === 0 && stddev === 0) return 0;

  // Guard: zero stddev (all baseline values identical) — any increase is maximal
  if (stddev === 0) {
    return cost > mean ? 100 : 0;
  }

  // Z-score: how many standard deviations above/below the mean
  const zScore = (cost - mean) / stddev;

  // Infinite operands can still produce NaN here (Infinity - Infinity,
  // Infinity / Infinity); treat that as unscoreable.
  if (Number.isNaN(zScore)) return 0;

  // Linear map, 25 points per standard deviation, clamped to [0, 100]
  const score = Math.min(100, Math.max(0, zScore * 25));

  return Math.round(score * 100) / 100; // 2 decimal places
}
|
||||
|
||||
/**
 * Decide whether an anomaly score is high enough to raise an alert.
 *
 * @param score     The score to test.
 * @param threshold Inclusive cut-off; defaults to 50.
 * @returns `true` when `score` is at or above `threshold`.
 */
export function shouldAlert(score: number, threshold = 50): boolean {
  const reachesThreshold = threshold <= score;
  return reachesThreshold;
}
|
||||
|
||||
/** A single cost observation for one resource type. */
export interface CostEvent {
  /** Identifier of the tenant the event belongs to. */
  tenantId: string;
  /** Identifier of the account the cost was incurred in. */
  accountId: string;
  /** Resource type, e.g. 'ec2/m5.xlarge', 'rds/db.r5.large'. */
  resourceType: string;
  /** Cost per hour for this observation. */
  hourlyCost: number;
  /** Event time as Unix milliseconds. */
  timestamp: number;
  /** Region the resource runs in. */
  region: string;
  /** Free-form string key/value tags attached to the event. */
  tags: Record<string, string>;
}
|
||||
|
||||
/** Outcome of scoring one cost event against its baseline. */
export interface AnomalyResult {
  /** The event that was scored. */
  event: CostEvent;
  /** Anomaly score assigned to the event. */
  score: number;
  /** Baseline statistics the event was compared against. */
  baseline: { mean: number; stddev: number; count: number };
  /** Whether the event is classified as an anomaly. */
  isAnomaly: boolean;
}
|
||||
80
products/05-aws-cost-anomaly/src/governance/engine.ts
Normal file
80
products/05-aws-cost-anomaly/src/governance/engine.ts
Normal file
@@ -0,0 +1,80 @@
|
||||
import pino from 'pino';
|
||||
|
||||
// Module-level pino logger for this file, created with the name 'governance'.
const logger = pino({ name: 'governance' });
|
||||
|
||||
/**
 * Source of the current time, abstracted so governance logic can be tested
 * deterministically: pass a FakeClock in tests and a RealClock in production.
 */
export interface Clock {
  /** Current time as a number (RealClock returns `Date.now()`, i.e. Unix milliseconds). */
  now(): number;
}
|
||||
|
||||
/** Clock backed by the system time. */
export class RealClock implements Clock {
  /** Returns `Date.now()` (milliseconds since the Unix epoch). */
  now() {
    const wallClockMs = Date.now();
    return wallClockMs;
  }
}
|
||||
|
||||
/** Manually driven clock: time only moves when `advanceBy` is called. */
export class FakeClock implements Clock {
  private current: number;

  /** @param start Initial time value; defaults to `Date.now()` at construction. */
  constructor(start = Date.now()) {
    this.current = start;
  }

  /** Returns the clock's current time value. */
  now() {
    return this.current;
  }

  /** Moves the clock by `ms` (added to the current time value). */
  advanceBy(ms: number) {
    this.current += ms;
  }
}
|
||||
|
||||
/** Outcome of a governance promotion evaluation. */
export interface PromotionResult {
  /** Whether the promotion should go ahead. */
  promoted: boolean;
  /** Mode to move to; optional. */
  newMode?: 'audit' | 'enforce';
  /** Human-readable explanation; optional. */
  reason?: string;
}
|
||||
|
||||
/** Minimum number of days a tenant must spend in its current mode before promotion. */
const PROMOTION_DAYS = 14;

/** Highest false-positive rate (as a fraction, 0.1 = 10%) that still allows promotion. */
const MAX_FP_RATE = 0.1;
|
||||
|
||||
/**
 * Governance engine — evaluates auto-promotion along shadow → audit → enforce
 * and reports per-tenant panic-mode status.
 *
 * A Clock is injected so time-dependent behaviour can be tested
 * deterministically; the methods below do not currently read it.
 */
export class GovernanceEngine {
  private clock: Clock;

  /** @param clock Time source; defaults to a RealClock. */
  constructor(clock: Clock = new RealClock()) {
    this.clock = clock;
  }

  /**
   * Evaluate whether a tenant should be promoted to the next governance mode.
   *
   * Promotion requires at least PROMOTION_DAYS days in the current mode AND
   * a false-positive rate no greater than MAX_FP_RATE (a rate of exactly
   * MAX_FP_RATE still qualifies).
   *
   * @param tenantId Tenant being evaluated (not used in the decision itself).
   * @param stats.fpRate False-positive rate as a fraction (0.1 = 10%).
   * @param stats.daysInCurrentMode Days spent in the current mode.
   * @param stats.currentMode Mode the tenant is in now; defaults to 'shadow',
   *   which reproduces the behaviour of callers that do not pass it.
   * @returns `promoted: true` with the next mode, or `promoted: false` with
   *   a reason. NaN stats never promote: plain `<` / `>` comparisons against
   *   NaN are always false, so without an explicit check a tenant with
   *   missing metrics would pass both gates.
   */
  evaluatePromotion(
    tenantId: string,
    stats: {
      fpRate: number;
      daysInCurrentMode: number;
      currentMode?: 'shadow' | 'audit' | 'enforce';
    },
  ): PromotionResult {
    const currentMode = stats.currentMode ?? 'shadow';

    // Nothing above 'enforce' to promote to.
    if (currentMode === 'enforce') {
      return { promoted: false, reason: 'Already in enforce mode — no further promotion' };
    }

    // Fail closed on missing metrics instead of letting NaN slip past the gates.
    if (Number.isNaN(stats.daysInCurrentMode) || Number.isNaN(stats.fpRate)) {
      return { promoted: false, reason: 'Promotion stats unavailable (NaN) — not promoting' };
    }

    if (stats.daysInCurrentMode < PROMOTION_DAYS) {
      return { promoted: false, reason: `Only ${stats.daysInCurrentMode} days — need ${PROMOTION_DAYS}` };
    }

    if (stats.fpRate > MAX_FP_RATE) {
      return {
        promoted: false,
        reason: `false-positive rate ${(stats.fpRate * 100).toFixed(1)}% exceeds ${MAX_FP_RATE * 100}% threshold`,
      };
    }

    return { promoted: true, newMode: currentMode === 'shadow' ? 'audit' : 'enforce' };
  }

  /**
   * Check panic mode status for a tenant.
   *
   * Reads the key `panic:<tenantId>` via `redis.get`; panic mode is active
   * when the stored value is the string '1'. If the lookup throws for any
   * reason, a warning is logged and panic mode is reported as ACTIVE (the
   * safe default).
   *
   * @param tenantId Tenant whose panic flag is read.
   * @param redis Client exposing an awaitable `get(key)`; left untyped so
   *   any Redis client or test double can be passed.
   * @returns `true` when panic mode is active or could not be determined.
   */
  async checkPanicMode(tenantId: string, redis: any): Promise<boolean> {
    try {
      const val = await redis.get(`panic:${tenantId}`);
      return val === '1';
    } catch (err) {
      // Thrown values are not guaranteed to be Error instances.
      const message = err instanceof Error ? err.message : String(err);
      logger.warn({ tenantId, error: message },
        'Redis unreachable — defaulting to panic=active');
      return true; // Safe default: panic IS active
    }
  }
}
|
||||
Reference in New Issue
Block a user