Flesh out dd0c/cost: ingestion with Welford optimistic locking, anomaly API, governance, baselines
- Ingestion API: batch cost events, Welford baseline update with optimistic locking (version column), anomaly detection inline - Anomaly API: list (filtered), acknowledge, snooze (1-168h), mark expected, dashboard summary with hourly trend - Governance API: mode status, promotion eligibility check with FP rate calculation - Baseline API: list with computed stddev, reset per resource - Data layer: withTenant() RLS wrapper, Zod config with ANOMALY_THRESHOLD - Fastify server entry point
This commit is contained in:
100
products/05-aws-cost-anomaly/src/api/anomalies.ts
Normal file
100
products/05-aws-cost-anomaly/src/api/anomalies.ts
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
import type { FastifyInstance } from 'fastify';
|
||||||
|
import { z } from 'zod';
|
||||||
|
import { withTenant } from '../data/db.js';
|
||||||
|
|
||||||
|
const listQuerySchema = z.object({
|
||||||
|
page: z.coerce.number().min(1).default(1),
|
||||||
|
limit: z.coerce.number().min(1).max(100).default(20),
|
||||||
|
status: z.enum(['open', 'acknowledged', 'snoozed', 'expected', 'resolved']).optional(),
|
||||||
|
account_id: z.string().optional(),
|
||||||
|
min_score: z.coerce.number().min(0).max(100).optional(),
|
||||||
|
});
|
||||||
|
|
||||||
|
export function registerAnomalyRoutes(app: FastifyInstance) {
|
||||||
|
// List anomalies
|
||||||
|
app.get('/api/v1/anomalies', async (req, reply) => {
|
||||||
|
const query = listQuerySchema.parse(req.query);
|
||||||
|
const tenantId = (req as any).tenantId;
|
||||||
|
const offset = (query.page - 1) * query.limit;
|
||||||
|
|
||||||
|
const result = await withTenant(tenantId, async (client) => {
|
||||||
|
let sql = 'SELECT * FROM anomalies WHERE 1=1';
|
||||||
|
const params: any[] = [];
|
||||||
|
let idx = 1;
|
||||||
|
|
||||||
|
if (query.status) { sql += ` AND status = $${idx++}`; params.push(query.status); }
|
||||||
|
if (query.account_id) { sql += ` AND account_id = $${idx++}`; params.push(query.account_id); }
|
||||||
|
if (query.min_score) { sql += ` AND score >= $${idx++}`; params.push(query.min_score); }
|
||||||
|
|
||||||
|
sql += ` ORDER BY detected_at DESC LIMIT $${idx++} OFFSET $${idx++}`;
|
||||||
|
params.push(query.limit, offset);
|
||||||
|
|
||||||
|
return client.query(sql, params);
|
||||||
|
});
|
||||||
|
|
||||||
|
return { anomalies: result.rows, page: query.page, limit: query.limit };
|
||||||
|
});
|
||||||
|
|
||||||
|
// Acknowledge anomaly
|
||||||
|
app.post('/api/v1/anomalies/:id/acknowledge', async (req, reply) => {
|
||||||
|
const { id } = req.params as { id: string };
|
||||||
|
const tenantId = (req as any).tenantId;
|
||||||
|
|
||||||
|
await withTenant(tenantId, async (client) => {
|
||||||
|
await client.query("UPDATE anomalies SET status = 'acknowledged' WHERE id = $1 AND status = 'open'", [id]);
|
||||||
|
});
|
||||||
|
return { status: 'acknowledged' };
|
||||||
|
});
|
||||||
|
|
||||||
|
// Snooze anomaly
|
||||||
|
app.post('/api/v1/anomalies/:id/snooze', async (req, reply) => {
|
||||||
|
const { id } = req.params as { id: string };
|
||||||
|
const { hours } = z.object({ hours: z.coerce.number().min(1).max(168).default(24) }).parse(req.body ?? {});
|
||||||
|
const tenantId = (req as any).tenantId;
|
||||||
|
|
||||||
|
await withTenant(tenantId, async (client) => {
|
||||||
|
await client.query(
|
||||||
|
"UPDATE anomalies SET status = 'snoozed', snoozed_until = now() + $1 * interval '1 hour' WHERE id = $2",
|
||||||
|
[hours, id],
|
||||||
|
);
|
||||||
|
});
|
||||||
|
return { status: 'snoozed', until_hours: hours };
|
||||||
|
});
|
||||||
|
|
||||||
|
// Mark as expected (recurring cost)
|
||||||
|
app.post('/api/v1/anomalies/:id/expected', async (req, reply) => {
|
||||||
|
const { id } = req.params as { id: string };
|
||||||
|
const tenantId = (req as any).tenantId;
|
||||||
|
|
||||||
|
await withTenant(tenantId, async (client) => {
|
||||||
|
await client.query("UPDATE anomalies SET status = 'expected' WHERE id = $1", [id]);
|
||||||
|
});
|
||||||
|
return { status: 'expected' };
|
||||||
|
});
|
||||||
|
|
||||||
|
// Dashboard summary
|
||||||
|
app.get('/api/v1/dashboard', async (req, reply) => {
|
||||||
|
const tenantId = (req as any).tenantId;
|
||||||
|
|
||||||
|
const result = await withTenant(tenantId, async (client) => {
|
||||||
|
const openCount = await client.query("SELECT COUNT(*)::int as count FROM anomalies WHERE status = 'open'");
|
||||||
|
const topResources = await client.query(`
|
||||||
|
SELECT resource_type, COUNT(*)::int as anomaly_count, AVG(score)::numeric(5,2) as avg_score
|
||||||
|
FROM anomalies WHERE status = 'open'
|
||||||
|
GROUP BY resource_type ORDER BY anomaly_count DESC LIMIT 10
|
||||||
|
`);
|
||||||
|
const recentTrend = await client.query(`
|
||||||
|
SELECT date_trunc('hour', detected_at) as hour, COUNT(*)::int as count
|
||||||
|
FROM anomalies WHERE detected_at > now() - interval '24 hours'
|
||||||
|
GROUP BY hour ORDER BY hour
|
||||||
|
`);
|
||||||
|
return {
|
||||||
|
open_anomalies: openCount.rows[0]?.count ?? 0,
|
||||||
|
top_resources: topResources.rows,
|
||||||
|
hourly_trend: recentTrend.rows,
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
return result;
|
||||||
|
});
|
||||||
|
}
|
||||||
38
products/05-aws-cost-anomaly/src/api/baselines.ts
Normal file
38
products/05-aws-cost-anomaly/src/api/baselines.ts
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
import type { FastifyInstance } from 'fastify';
|
||||||
|
import { withTenant } from '../data/db.js';
|
||||||
|
|
||||||
|
export function registerBaselineRoutes(app: FastifyInstance) {
|
||||||
|
// List baselines
|
||||||
|
app.get('/api/v1/baselines', async (req, reply) => {
|
||||||
|
const tenantId = (req as any).tenantId;
|
||||||
|
|
||||||
|
const result = await withTenant(tenantId, async (client) => {
|
||||||
|
return client.query(`
|
||||||
|
SELECT account_id, resource_type, welford_count as sample_count,
|
||||||
|
welford_mean::numeric(12,4) as mean,
|
||||||
|
CASE WHEN welford_count > 1
|
||||||
|
THEN sqrt(welford_m2 / welford_count)::numeric(12,4)
|
||||||
|
ELSE 0 END as stddev,
|
||||||
|
updated_at
|
||||||
|
FROM baselines ORDER BY account_id, resource_type
|
||||||
|
`);
|
||||||
|
});
|
||||||
|
|
||||||
|
return { baselines: result.rows };
|
||||||
|
});
|
||||||
|
|
||||||
|
// Reset baseline for a specific resource
|
||||||
|
app.delete('/api/v1/baselines/:accountId/:resourceType', async (req, reply) => {
|
||||||
|
const { accountId, resourceType } = req.params as { accountId: string; resourceType: string };
|
||||||
|
const tenantId = (req as any).tenantId;
|
||||||
|
|
||||||
|
await withTenant(tenantId, async (client) => {
|
||||||
|
await client.query(
|
||||||
|
'DELETE FROM baselines WHERE account_id = $1 AND resource_type = $2',
|
||||||
|
[accountId, resourceType],
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
return { status: 'reset', accountId, resourceType };
|
||||||
|
});
|
||||||
|
}
|
||||||
59
products/05-aws-cost-anomaly/src/api/governance.ts
Normal file
59
products/05-aws-cost-anomaly/src/api/governance.ts
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
import type { FastifyInstance } from 'fastify';
|
||||||
|
import { withTenant } from '../data/db.js';
|
||||||
|
import { GovernanceEngine } from '../governance/engine.js';
|
||||||
|
|
||||||
|
const engine = new GovernanceEngine();
|
||||||
|
|
||||||
|
export function registerGovernanceRoutes(app: FastifyInstance) {
|
||||||
|
// Get governance status
|
||||||
|
app.get('/api/v1/governance', async (req, reply) => {
|
||||||
|
const tenantId = (req as any).tenantId;
|
||||||
|
|
||||||
|
const result = await withTenant(tenantId, async (client) => {
|
||||||
|
return client.query('SELECT governance_mode, governance_started_at FROM tenants WHERE id = $1', [tenantId]);
|
||||||
|
});
|
||||||
|
|
||||||
|
const tenant = result.rows[0];
|
||||||
|
if (!tenant) return reply.status(404).send({ error: 'Tenant not found' });
|
||||||
|
|
||||||
|
const daysInMode = Math.floor((Date.now() - new Date(tenant.governance_started_at).getTime()) / (86400 * 1000));
|
||||||
|
|
||||||
|
return {
|
||||||
|
mode: tenant.governance_mode,
|
||||||
|
days_in_mode: daysInMode,
|
||||||
|
started_at: tenant.governance_started_at,
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
// Check promotion eligibility
|
||||||
|
app.get('/api/v1/governance/promotion', async (req, reply) => {
|
||||||
|
const tenantId = (req as any).tenantId;
|
||||||
|
|
||||||
|
const result = await withTenant(tenantId, async (client) => {
|
||||||
|
const tenant = await client.query('SELECT governance_mode, governance_started_at FROM tenants WHERE id = $1', [tenantId]);
|
||||||
|
const t = tenant.rows[0];
|
||||||
|
if (!t) return null;
|
||||||
|
|
||||||
|
// Calculate FP rate from last 30 days
|
||||||
|
const fpStats = await client.query(`
|
||||||
|
SELECT
|
||||||
|
COUNT(*) FILTER (WHERE status = 'expected')::float /
|
||||||
|
NULLIF(COUNT(*)::float, 0) as fp_rate
|
||||||
|
FROM anomalies
|
||||||
|
WHERE detected_at > now() - interval '30 days'
|
||||||
|
`);
|
||||||
|
|
||||||
|
return { tenant: t, fpRate: fpStats.rows[0]?.fp_rate ?? 0 };
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!result) return reply.status(404).send({ error: 'Tenant not found' });
|
||||||
|
|
||||||
|
const daysInMode = Math.floor((Date.now() - new Date(result.tenant.governance_started_at).getTime()) / (86400 * 1000));
|
||||||
|
const evaluation = engine.evaluatePromotion(tenantId, {
|
||||||
|
fpRate: result.fpRate,
|
||||||
|
daysInCurrentMode: daysInMode,
|
||||||
|
});
|
||||||
|
|
||||||
|
return { ...evaluation, current_mode: result.tenant.governance_mode, fp_rate: result.fpRate, days_in_mode: daysInMode };
|
||||||
|
});
|
||||||
|
}
|
||||||
103
products/05-aws-cost-anomaly/src/api/ingestion.ts
Normal file
103
products/05-aws-cost-anomaly/src/api/ingestion.ts
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
import type { FastifyInstance } from 'fastify';
|
||||||
|
import { z } from 'zod';
|
||||||
|
import pino from 'pino';
|
||||||
|
import { withTenant } from '../data/db.js';
|
||||||
|
import { WelfordBaseline, scoreAnomaly, shouldAlert, type CostEvent } from '../detection/scorer.js';
|
||||||
|
import { config } from '../config/index.js';
|
||||||
|
|
||||||
|
const logger = pino({ name: 'api-ingestion' });
|
||||||
|
|
||||||
|
const costEventSchema = z.object({
|
||||||
|
account_id: z.string(),
|
||||||
|
resource_type: z.string(),
|
||||||
|
hourly_cost: z.number().min(0),
|
||||||
|
region: z.string().default('us-east-1'),
|
||||||
|
tags: z.record(z.string()).default({}),
|
||||||
|
});
|
||||||
|
|
||||||
|
const batchSchema = z.object({
|
||||||
|
events: z.array(costEventSchema).min(1).max(100),
|
||||||
|
});
|
||||||
|
|
||||||
|
export function registerIngestionRoutes(app: FastifyInstance) {
|
||||||
|
// Ingest cost events (from agent or Cost Explorer poller)
|
||||||
|
app.post('/api/v1/ingest', async (req, reply) => {
|
||||||
|
const tenantId = (req as any).tenantId;
|
||||||
|
const { events } = batchSchema.parse(req.body);
|
||||||
|
|
||||||
|
const results = await withTenant(tenantId, async (client) => {
|
||||||
|
const anomalies = [];
|
||||||
|
|
||||||
|
for (const event of events) {
|
||||||
|
// Fetch or create baseline
|
||||||
|
const baselineRow = await client.query(
|
||||||
|
`SELECT welford_count, welford_mean, welford_m2, version
|
||||||
|
FROM baselines WHERE account_id = $1 AND resource_type = $2`,
|
||||||
|
[event.account_id, event.resource_type],
|
||||||
|
);
|
||||||
|
|
||||||
|
let baseline: WelfordBaseline;
|
||||||
|
let version: number;
|
||||||
|
|
||||||
|
if (baselineRow.rows[0]) {
|
||||||
|
baseline = WelfordBaseline.fromJSON({
|
||||||
|
count: baselineRow.rows[0].welford_count,
|
||||||
|
mean: parseFloat(baselineRow.rows[0].welford_mean),
|
||||||
|
m2: parseFloat(baselineRow.rows[0].welford_m2),
|
||||||
|
});
|
||||||
|
version = baselineRow.rows[0].version;
|
||||||
|
} else {
|
||||||
|
baseline = new WelfordBaseline();
|
||||||
|
version = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Score before updating baseline
|
||||||
|
const score = scoreAnomaly({
|
||||||
|
cost: event.hourly_cost,
|
||||||
|
mean: baseline.mean,
|
||||||
|
stddev: baseline.stddev,
|
||||||
|
});
|
||||||
|
|
||||||
|
const isAnomaly = shouldAlert(score, config.ANOMALY_THRESHOLD);
|
||||||
|
|
||||||
|
// Update baseline with optimistic locking (BMad must-have: concurrent Welford corruption)
|
||||||
|
baseline.update(event.hourly_cost);
|
||||||
|
const bj = baseline.toJSON();
|
||||||
|
|
||||||
|
if (version === 0) {
|
||||||
|
await client.query(
|
||||||
|
`INSERT INTO baselines (tenant_id, account_id, resource_type, welford_count, welford_mean, welford_m2, version)
|
||||||
|
VALUES ($1, $2, $3, $4, $5, $6, 1)
|
||||||
|
ON CONFLICT (tenant_id, account_id, resource_type) DO NOTHING`,
|
||||||
|
[tenantId, event.account_id, event.resource_type, bj.count, bj.mean, bj.m2],
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
const updated = await client.query(
|
||||||
|
`UPDATE baselines SET welford_count = $1, welford_mean = $2, welford_m2 = $3,
|
||||||
|
version = version + 1, updated_at = now()
|
||||||
|
WHERE account_id = $4 AND resource_type = $5 AND version = $6`,
|
||||||
|
[bj.count, bj.mean, bj.m2, event.account_id, event.resource_type, version],
|
||||||
|
);
|
||||||
|
if (updated.rowCount === 0) {
|
||||||
|
logger.warn({ accountId: event.account_id, resourceType: event.resource_type },
|
||||||
|
'Optimistic lock conflict — baseline update skipped');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Record anomaly if threshold exceeded
|
||||||
|
if (isAnomaly) {
|
||||||
|
await client.query(
|
||||||
|
`INSERT INTO anomalies (tenant_id, account_id, resource_type, region, hourly_cost, score, baseline_mean, baseline_stddev, tags)
|
||||||
|
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`,
|
||||||
|
[tenantId, event.account_id, event.resource_type, event.region, event.hourly_cost, score, baseline.mean, baseline.stddev, JSON.stringify(event.tags)],
|
||||||
|
);
|
||||||
|
anomalies.push({ ...event, score, isAnomaly: true });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return anomalies;
|
||||||
|
});
|
||||||
|
|
||||||
|
return { ingested: events.length, anomalies: results.length, details: results };
|
||||||
|
});
|
||||||
|
}
|
||||||
14
products/05-aws-cost-anomaly/src/config/index.ts
Normal file
14
products/05-aws-cost-anomaly/src/config/index.ts
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
import { z } from 'zod';
|
||||||
|
|
||||||
|
const envSchema = z.object({
|
||||||
|
PORT: z.coerce.number().default(3000),
|
||||||
|
DATABASE_URL: z.string().default('postgresql://localhost:5432/dd0c_cost'),
|
||||||
|
REDIS_URL: z.string().default('redis://localhost:6379'),
|
||||||
|
JWT_SECRET: z.string().min(32).default('dev-secret-change-me-in-production!!'),
|
||||||
|
CORS_ORIGIN: z.string().default('*'),
|
||||||
|
LOG_LEVEL: z.string().default('info'),
|
||||||
|
ANOMALY_THRESHOLD: z.coerce.number().default(50),
|
||||||
|
});
|
||||||
|
|
||||||
|
export const config = envSchema.parse(process.env);
|
||||||
|
export type Config = z.infer<typeof envSchema>;
|
||||||
24
products/05-aws-cost-anomaly/src/data/db.ts
Normal file
24
products/05-aws-cost-anomaly/src/data/db.ts
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
import pg from 'pg';
|
||||||
|
import pino from 'pino';
|
||||||
|
import { config } from '../config/index.js';
|
||||||
|
|
||||||
|
const logger = pino({ name: 'data' });
|
||||||
|
|
||||||
|
export const pool = new pg.Pool({ connectionString: config.DATABASE_URL });
|
||||||
|
|
||||||
|
export async function withTenant<T>(tenantId: string, fn: (client: pg.PoolClient) => Promise<T>): Promise<T> {
|
||||||
|
const client = await pool.connect();
|
||||||
|
try {
|
||||||
|
await client.query('BEGIN');
|
||||||
|
await client.query(`SET LOCAL app.tenant_id = '${tenantId}'`);
|
||||||
|
const result = await fn(client);
|
||||||
|
await client.query('COMMIT');
|
||||||
|
return result;
|
||||||
|
} catch (err) {
|
||||||
|
await client.query('ROLLBACK');
|
||||||
|
throw err;
|
||||||
|
} finally {
|
||||||
|
await client.query('RESET app.tenant_id');
|
||||||
|
client.release();
|
||||||
|
}
|
||||||
|
}
|
||||||
29
products/05-aws-cost-anomaly/src/index.ts
Normal file
29
products/05-aws-cost-anomaly/src/index.ts
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
import Fastify from 'fastify';
|
||||||
|
import cors from '@fastify/cors';
|
||||||
|
import pino from 'pino';
|
||||||
|
import { config } from './config/index.js';
|
||||||
|
import { registerAnomalyRoutes } from './api/anomalies.js';
|
||||||
|
import { registerBaselineRoutes } from './api/baselines.js';
|
||||||
|
import { registerGovernanceRoutes } from './api/governance.js';
|
||||||
|
import { registerIngestionRoutes } from './api/ingestion.js';
|
||||||
|
|
||||||
|
const logger = pino({ name: 'dd0c-cost', level: config.LOG_LEVEL });
|
||||||
|
|
||||||
|
const app = Fastify({ logger: true });
|
||||||
|
|
||||||
|
await app.register(cors, { origin: config.CORS_ORIGIN });
|
||||||
|
|
||||||
|
app.get('/health', async () => ({ status: 'ok', service: 'dd0c-cost' }));
|
||||||
|
|
||||||
|
registerIngestionRoutes(app);
|
||||||
|
registerAnomalyRoutes(app);
|
||||||
|
registerBaselineRoutes(app);
|
||||||
|
registerGovernanceRoutes(app);
|
||||||
|
|
||||||
|
try {
|
||||||
|
await app.listen({ port: config.PORT, host: '0.0.0.0' });
|
||||||
|
logger.info({ port: config.PORT }, 'dd0c/cost started');
|
||||||
|
} catch (err) {
|
||||||
|
logger.fatal(err, 'Failed to start');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user