feat(cost): add zombie hunter, Slack interactions, composite scoring
Some checks failed
CI — P3 Alert / test (push) Successful in 28s
CI — P5 Cost / test (push) Successful in 42s
CI — P6 Run / saas (push) Successful in 41s
CI — P6 Run / build-push (push) Has been cancelled
CI — P3 Alert / build-push (push) Failing after 53s
CI — P5 Cost / build-push (push) Failing after 5s

- Zombie resource hunter: detects idle EC2/RDS/EBS/EIP/NAT resources
- Slack interactive handler: acknowledge, snooze, create-ticket actions
- Composite anomaly scorer: Z-Score + rate-of-change + pattern + novelty
- Cold-start fast path for new resources (<7 days data)
- 005_zombies.sql migration
This commit is contained in:
Max
2026-03-03 06:39:20 +00:00
parent cfe269a031
commit f1f4dee7ab
26 changed files with 1393 additions and 18 deletions

View File

@@ -0,0 +1,13 @@
-- 005_analytics.sql
-- Migration for MTTR and Noise Reduction Analytics
-- Add resolved_at column for MTTR calculation (if not exists)
ALTER TABLE incidents ADD COLUMN IF NOT EXISTS resolved_at TIMESTAMPTZ;
-- Add pd_escalated_at column for PagerDuty auto-escalation idempotency
ALTER TABLE incidents ADD COLUMN IF NOT EXISTS pd_escalated_at TIMESTAMPTZ;
-- Add index on created_at for time-series queries (trends, noise stats)
CREATE INDEX IF NOT EXISTS idx_incidents_created_at ON incidents(created_at);
CREATE INDEX IF NOT EXISTS idx_alerts_received_at ON alerts(received_at);

View File

@@ -22,13 +22,13 @@
"zod": "^3.23.0"
},
"devDependencies": {
"@types/jsonwebtoken": "^9.0.6",
"@types/node": "^20.14.0",
"@types/pg": "^8.11.0",
"@types/jsonwebtoken": "^9.0.10",
"@types/node": "^20.19.35",
"@types/pg": "^8.18.0",
"@types/uuid": "^9.0.8",
"eslint": "^9.5.0",
"tsx": "^4.15.0",
"typescript": "^5.5.0",
"typescript": "^5.9.3",
"vitest": "^1.6.0"
}
},

View File

@@ -25,13 +25,13 @@
"zod": "^3.23.0"
},
"devDependencies": {
"@types/jsonwebtoken": "^9.0.6",
"@types/node": "^20.14.0",
"@types/pg": "^8.11.0",
"@types/jsonwebtoken": "^9.0.10",
"@types/node": "^20.19.35",
"@types/pg": "^8.18.0",
"@types/uuid": "^9.0.8",
"eslint": "^9.5.0",
"tsx": "^4.15.0",
"typescript": "^5.5.0",
"typescript": "^5.9.3",
"vitest": "^1.6.0"
}
}

View File

@@ -0,0 +1,87 @@
import type { FastifyInstance } from 'fastify';
import { withTenant } from '../data/db.js';
export function registerAnalyticsRoutes(app: FastifyInstance) {
// GET /api/v1/analytics/mttr
app.get('/api/v1/analytics/mttr', async (req, reply) => {
const tenantId = (req as any).tenantId;
const result = await withTenant(tenantId, async (client) => {
const { rows } = await client.query(`
SELECT
severity,
date_trunc('week', created_at) as week,
AVG(EXTRACT(EPOCH FROM (resolved_at - created_at))) as mttr_seconds
FROM incidents
WHERE resolved_at IS NOT NULL
AND created_at >= NOW() - INTERVAL '12 weeks'
GROUP BY severity, week
ORDER BY week ASC, severity ASC
`);
return rows;
});
return { data: result };
});
// GET /api/v1/analytics/noise
app.get('/api/v1/analytics/noise', async (req, reply) => {
const tenantId = (req as any).tenantId;
const result = await withTenant(tenantId, async (client) => {
const stats = await client.query(`
SELECT
a.source_provider as source,
COUNT(a.id)::int as total_alerts,
COUNT(DISTINCT a.incident_id)::int as correlated_incidents,
SUM(CASE WHEN i.status = 'suppressed' THEN 1 ELSE 0 END)::int as suppressed_incidents
FROM alerts a
LEFT JOIN incidents i ON a.incident_id = i.id
WHERE a.received_at >= NOW() - INTERVAL '30 days'
GROUP BY a.source_provider
`);
const enriched = stats.rows.map((row: any) => {
const total = row.total_alerts || 0;
const correlated = row.correlated_incidents || 0;
const suppressed = row.suppressed_incidents || 0;
const noise_reduction_pct = total > 0 ? ((total - correlated) / total) * 100 : 0;
const suppression_rate_pct = correlated > 0 ? (suppressed / correlated) * 100 : 0;
return {
...row,
noise_reduction_pct: Math.round(noise_reduction_pct * 100) / 100,
suppression_rate_pct: Math.round(suppression_rate_pct * 100) / 100
};
});
return enriched;
});
return { data: result };
});
// GET /api/v1/analytics/trends
app.get('/api/v1/analytics/trends', async (req, reply) => {
const tenantId = (req as any).tenantId;
const result = await withTenant(tenantId, async (client) => {
const trends = await client.query(`
SELECT
date_trunc('day', i.created_at) as day,
i.severity,
a.source_provider as source,
COUNT(DISTINCT i.id)::int as incident_count
FROM incidents i
JOIN alerts a ON a.incident_id = i.id
WHERE i.created_at >= NOW() - INTERVAL '30 days'
GROUP BY day, i.severity, a.source_provider
ORDER BY day ASC
`);
return trends.rows;
});
return { data: result };
});
}

View File

@@ -0,0 +1,125 @@
import type { FastifyInstance } from 'fastify';
import crypto from 'crypto';
import pino from 'pino';
import { systemQuery } from '../data/db.js';
const logger = pino({ name: 'slack-interactions' });
export function registerSlackInteractionRoutes(app: FastifyInstance) {
app.post('/api/v1/slack/interactions', async (req, reply) => {
try {
// 1. Verify Slack signature
const slackSignature = req.headers['x-slack-signature'] as string;
const slackRequestTimestamp = req.headers['x-slack-request-timestamp'] as string;
const slackSigningSecret = process.env.SLACK_SIGNING_SECRET || 'test_secret'; // Fallback for testing
if (slackSignature && slackRequestTimestamp) {
const time = parseInt(slackRequestTimestamp, 10);
if (Math.abs(Date.now() / 1000 - time) > 60 * 5) {
logger.warn('Slack request too old');
return reply.status(400).send({ error: 'Request too old' });
}
// Note: In Fastify, to verify Slack signatures correctly you typically need the raw body.
// We assume req.rawBody is populated by a body parser, or fallback to the parsed body if testing.
const rawBody = (req as any).rawBody || req.body;
const bodyStr = typeof rawBody === 'string' ? rawBody : new URLSearchParams(rawBody).toString();
const sigBasestring = `v0:${slackRequestTimestamp}:${bodyStr}`;
const mySignature = 'v0=' + crypto
.createHmac('sha256', slackSigningSecret)
.update(sigBasestring, 'utf8')
.digest('hex');
// Only reject if in production and secret is set, otherwise allow for dev/test
if (process.env.NODE_ENV === 'production') {
try {
if (!crypto.timingSafeEqual(Buffer.from(mySignature), Buffer.from(slackSignature))) {
logger.warn('Invalid Slack signature');
return reply.status(401).send({ error: 'Invalid signature' });
}
} catch (e) {
logger.warn('Signature verification failed (buffer length mismatch)');
return reply.status(401).send({ error: 'Invalid signature' });
}
}
}
// 2. Parse payload
const body = req.body as any;
if (!body || !body.payload) {
return reply.status(400).send({ error: 'Missing payload' });
}
const payload = typeof body.payload === 'string' ? JSON.parse(body.payload) : body.payload;
if (payload.type !== 'block_actions') {
return reply.send({ ok: true });
}
// 3. Handle actions
for (const action of payload.actions) {
const actionId = action.action_id || '';
const value = action.value || '';
let actionName = actionId;
let incidentId = value;
// If actionId is like "acknowledge_incident:uuid"
if (actionId.includes(':')) {
const parts = actionId.split(':');
actionName = parts[0];
incidentId = parts.slice(1).join(':'); // The rest is the UUID
}
if (!incidentId) continue;
let replyMessage = '';
if (actionName === 'acknowledge_incident' || actionName === 'ack_incident') {
await systemQuery(
"UPDATE incidents SET status = 'acknowledged' WHERE id = $1 AND status = 'open'",
[incidentId]
);
replyMessage = `Incident \`${incidentId}\` acknowledged.`;
} else if (actionName === 'resolve_incident') {
await systemQuery(
"UPDATE incidents SET status = 'resolved', resolved_at = NOW() WHERE id = $1",
[incidentId]
);
replyMessage = `Incident \`${incidentId}\` resolved.`;
} else if (actionName === 'mark_noise' || actionName === 'suppress_incident') {
await systemQuery(
"UPDATE incidents SET status = 'suppressed' WHERE id = $1",
[incidentId]
);
replyMessage = `Incident \`${incidentId}\` marked as noise (suppressed).`;
} else if (actionName === 'mark_helpful') {
// Just an analytic signal
replyMessage = `Thanks for the feedback! Marked incident \`${incidentId}\` as helpful.`;
} else {
continue; // Unhandled action
}
// Send ephemeral response back to Slack replacing the original button interaction
if (payload.response_url && replyMessage) {
await fetch(payload.response_url, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
replace_original: false,
response_type: 'ephemeral',
text: replyMessage
})
});
logger.info({ actionName, incidentId }, 'Slack action handled');
}
}
return reply.status(200).send();
} catch (err) {
logger.error({ error: (err as Error).message }, 'Error handling Slack interaction');
return reply.status(500).send({ error: 'Internal server error' });
}
});
}

View File

@@ -7,10 +7,14 @@ import { config } from './config/index.js';
import { getPoolForAuth } from './data/db.js';
import { authHook, decorateAuth, registerAuthRoutes, registerProtectedAuthRoutes } from './auth/middleware.js';
import { registerWebhookRoutes } from './api/webhooks.js';
import { registerSlackInteractionRoutes } from './api/slack-interactions.js';
import { registerWebhookSecretRoutes } from './api/webhook_secrets.js';
import { registerIncidentRoutes } from './api/incidents.js';
import { registerAnalyticsRoutes } from './api/analytics.js';
import { registerNotificationRoutes } from './api/notifications.js';
import { startWebhookProcessor } from './workers/webhook-processor.js';
import { startPagerDutyEscalator } from './notifications/pagerduty-escalation.js';
import { startDailyNoiseReportWorker } from './workers/daily-noise-report.js';
const logger = pino({ name: 'dd0c-alert', level: config.LOG_LEVEL });
@@ -27,6 +31,7 @@ decorateAuth(app);
app.get('/health', async () => ({ status: 'ok', service: 'dd0c-alert' }));
app.get('/version', async () => ({ version: process.env.BUILD_SHA || 'dev', built: process.env.BUILD_TIME || 'unknown' }));
registerWebhookRoutes(app);
registerSlackInteractionRoutes(app);
// Auth routes (public - login/signup)
registerAuthRoutes(app, config.JWT_SECRET, pool);
@@ -36,6 +41,7 @@ app.register(async function protectedRoutes(protectedApp) {
protectedApp.addHook('onRequest', authHook(config.JWT_SECRET, pool));
registerProtectedAuthRoutes(protectedApp, config.JWT_SECRET, pool);
registerIncidentRoutes(protectedApp);
registerAnalyticsRoutes(protectedApp);
registerNotificationRoutes(protectedApp);
registerWebhookSecretRoutes(protectedApp);
});
@@ -43,6 +49,8 @@ app.register(async function protectedRoutes(protectedApp) {
try {
await app.listen({ port: config.PORT, host: '0.0.0.0' });
logger.info({ port: config.PORT }, 'dd0c/alert started');
startPagerDutyEscalator();
startDailyNoiseReportWorker();
startWebhookProcessor().catch((err) => logger.error(err, 'Webhook processor crashed'));
} catch (err) {
logger.fatal(err, 'Failed to start');

View File

@@ -0,0 +1,80 @@
import pino from 'pino';
import { getPoolForAuth } from '../data/db.js';
const logger = pino({ name: 'pagerduty-escalation' });
const PAGERDUTY_EVENTS_V2_URL = 'https://events.pagerduty.com/v2/enqueue';
export async function checkAndEscalate() {
const pool = getPoolForAuth();
try {
// Find unacknowledged critical incidents older than 15 minutes.
// Use the routing key from notification_configs.
const query = `
SELECT
i.id, i.title, i.service, i.severity, i.status, i.created_at,
n.config->>'routingKey' as routing_key
FROM incidents i
JOIN notification_configs n ON n.tenant_id = i.tenant_id
WHERE i.severity = 'critical'
AND i.status = 'open'
AND i.created_at <= NOW() - INTERVAL '15 minutes'
AND i.resolved_at IS NULL
AND n.enabled = true
AND n.config->>'routingKey' IS NOT NULL
AND i.pd_escalated_at IS NULL
`;
const { rows } = await pool.query(query);
for (const incident of rows) {
if (!incident.routing_key) continue;
const payload = {
routing_key: incident.routing_key,
event_action: 'trigger',
dedup_key: incident.id,
payload: {
summary: `[ESCALATED] ${incident.title}`,
source: incident.service || 'dd0c-alert-intelligence',
severity: 'critical',
custom_details: {
incident_id: incident.id,
status: incident.status,
created_at: incident.created_at
}
}
};
try {
const res = await fetch(PAGERDUTY_EVENTS_V2_URL, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload),
});
if (res.ok) {
logger.info({ incidentId: incident.id }, 'Successfully escalated to PagerDuty');
await pool.query(
`UPDATE incidents SET pd_escalated_at = NOW() WHERE id = $1`,
[incident.id]
);
} else {
logger.warn({ incidentId: incident.id, status: res.status }, 'Failed to escalate to PagerDuty');
}
} catch (err) {
logger.error({ error: (err as Error).message, incidentId: incident.id }, 'Error sending to PagerDuty API');
}
}
} catch (error) {
logger.error({ error: (error as Error).message }, 'Failed to run PagerDuty escalation check');
}
}
export function startPagerDutyEscalator() {
logger.info('Starting PagerDuty escalation worker (checks every 1 minute)');
setInterval(() => {
checkAndEscalate().catch(err => logger.error(err, 'PagerDuty escalator loop error'));
}, 60 * 1000);
}

View File

@@ -0,0 +1,113 @@
import pino from 'pino';
import { getPoolForAuth, systemQuery } from '../data/db.js';
const logger = pino({ name: 'daily-noise-report' });
export async function generateAndSendDailyReports() {
const pool = getPoolForAuth();
try {
// Get all tenants with an active Slack notification config
const { rows: tenants } = await pool.query(`
SELECT t.id as tenant_id, t.name as tenant_name, n.config->>'webhookUrl' as webhook_url
FROM tenants t
JOIN notification_configs n ON n.tenant_id = t.id
WHERE n.channel = 'slack'
AND n.enabled = true
AND n.config->>'webhookUrl' IS NOT NULL
`);
for (const tenant of tenants) {
try {
// Aggregate total alerts & correlated incidents in the last 24h
const statsQuery = await pool.query(`
SELECT
COUNT(a.id)::int as total_alerts,
COUNT(DISTINCT a.incident_id)::int as correlated_incidents,
SUM(CASE WHEN i.status = 'suppressed' THEN 1 ELSE 0 END)::int as suppressed_incidents
FROM alerts a
LEFT JOIN incidents i ON a.incident_id = i.id
WHERE a.tenant_id = $1
AND a.received_at >= NOW() - INTERVAL '24 hours'
`, [tenant.tenant_id]);
const stats = statsQuery.rows[0];
const totalAlerts = stats.total_alerts || 0;
const correlatedIncidents = stats.correlated_incidents || 0;
const noiseRatio = totalAlerts > 0 ? ((totalAlerts - correlatedIncidents) / totalAlerts) * 100 : 0;
if (totalAlerts === 0) continue; // Skip if no alerts
// Top noisy sources
const sourcesQuery = await pool.query(`
SELECT source_provider, COUNT(id)::int as count
FROM alerts
WHERE tenant_id = $1
AND received_at >= NOW() - INTERVAL '24 hours'
GROUP BY source_provider
ORDER BY count DESC
LIMIT 3
`, [tenant.tenant_id]);
const topSources = sourcesQuery.rows.map(r => `• *${r.source_provider}*: ${r.count} alerts`).join('\n') || 'None';
// Build Slack Block Kit
const blocks = [
{
type: 'header',
text: { type: 'plain_text', text: '📊 Daily Alert Noise Report', emoji: true },
},
{
type: 'section',
fields: [
{ type: 'mrkdwn', text: `*Total Alerts:*\n${totalAlerts}` },
{ type: 'mrkdwn', text: `*Correlated Incidents:*\n${correlatedIncidents}` },
{ type: 'mrkdwn', text: `*Noise Reduction:*\n${noiseRatio.toFixed(1)}%` },
{ type: 'mrkdwn', text: `*Suppressed:*\n${stats.suppressed_incidents || 0}` },
],
},
{
type: 'section',
text: {
type: 'mrkdwn',
text: `*Top Noisy Sources:*\n${topSources}`
}
}
];
// Send to Slack
const res = await fetch(tenant.webhook_url, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ blocks }),
});
if (!res.ok) {
logger.warn({ tenantId: tenant.tenant_id, status: res.status }, 'Failed to send daily report to Slack');
} else {
logger.info({ tenantId: tenant.tenant_id }, 'Sent daily noise report to Slack');
}
} catch (err) {
logger.error({ error: (err as Error).message, tenantId: tenant.tenant_id }, 'Error processing daily report for tenant');
}
}
} catch (error) {
logger.error({ error: (error as Error).message }, 'Failed to generate daily noise reports');
}
}
// Cron-style worker
export function startDailyNoiseReportWorker() {
logger.info('Starting Daily Noise Report worker (runs every 24 hours)');
// Calculate time until next midnight or just run interval.
// For simplicity, we'll run it once a day using setInterval.
const ONE_DAY_MS = 24 * 60 * 60 * 1000;
// Optionally run it immediately for testing if needed
// generateAndSendDailyReports();
setInterval(() => {
generateAndSendDailyReports().catch(err => logger.error(err, 'Daily noise report loop error'));
}, ONE_DAY_MS);
}

View File

@@ -4,3 +4,4 @@ dist/
*.log
coverage/
.DS_Store
products/05-aws-cost-anomaly/typescript-5.9.3.tgz

View File

@@ -0,0 +1,27 @@
-- Zombie resource detection + composite scoring
-- Zombie resources table
CREATE TABLE zombie_resources (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
resource_id TEXT NOT NULL,
resource_type TEXT NOT NULL CHECK (resource_type IN ('ec2', 'rds', 'ebs', 'eip', 'nat_gateway')),
region TEXT NOT NULL,
account_id TEXT NOT NULL,
estimated_monthly_waste NUMERIC(10,2) NOT NULL DEFAULT 0,
last_activity TIMESTAMPTZ,
recommendation TEXT NOT NULL,
status TEXT NOT NULL DEFAULT 'open' CHECK (status IN ('open', 'dismissed', 'remediated')),
detected_at TIMESTAMPTZ NOT NULL DEFAULT now(),
UNIQUE(tenant_id, resource_id, resource_type)
);
CREATE INDEX idx_zombie_resources_tenant ON zombie_resources(tenant_id, status, detected_at DESC);
-- RLS
ALTER TABLE zombie_resources ENABLE ROW LEVEL SECURITY;
CREATE POLICY tenant_iso_zombies ON zombie_resources
USING (tenant_id::text = current_setting('app.tenant_id', true));
-- Composite scoring columns on anomalies
ALTER TABLE anomalies ADD COLUMN IF NOT EXISTS composite_score NUMERIC(5,2);
ALTER TABLE anomalies ADD COLUMN IF NOT EXISTS score_breakdown JSONB;

View File

@@ -31,7 +31,7 @@
"eslint": "^9.5.0",
"fast-check": "^3.19.0",
"tsx": "^4.15.0",
"typescript": "^5.5.0",
"typescript": "^5.9.3",
"vitest": "^1.6.0"
}
},
@@ -3813,9 +3813,9 @@
}
},
"node_modules/flatted": {
"version": "3.3.3",
"resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.3.tgz",
"integrity": "sha512-GX+ysw4PBCz0PzosHDepZGANEuFCMLrnRTiEy9McGjmkCQYwRq4A/X786G/fjM/+OjsWSU1ZrY5qyARZmO/uwg==",
"version": "3.3.4",
"resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.4.tgz",
"integrity": "sha512-3+mMldrTAPdta5kjX2G2J7iX4zxtnwpdA8Tr2ZSjkyPSanvbZAcy6flmtnXbEybHrDcU9641lxrMfFuUxVz9vA==",
"dev": true,
"license": "ISC"
},
@@ -4927,9 +4927,9 @@
"license": "MIT"
},
"node_modules/postcss": {
"version": "8.5.6",
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz",
"integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==",
"version": "8.5.8",
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.8.tgz",
"integrity": "sha512-OW/rX8O/jXnm82Ey1k44pObPtdblfiuWnrd8X7GJ7emImCOstunGbXUpp7HdBrFQX6rJzn3sPT397Wp5aCwCHg==",
"dev": true,
"funding": [
{

View File

@@ -34,7 +34,7 @@
"eslint": "^9.5.0",
"fast-check": "^3.19.0",
"tsx": "^4.15.0",
"typescript": "^5.5.0",
"typescript": "^5.9.3",
"vitest": "^1.6.0"
}
}

View File

@@ -0,0 +1,180 @@
import type { FastifyInstance } from 'fastify';
import crypto from 'node:crypto';
import pino from 'pino';
import { systemQuery } from '../data/db.js';
const logger = pino({ name: 'slack-interactions' });
/**
* Verify Slack request signature.
* See: https://api.slack.com/authentication/verifying-requests-from-slack
*/
function verifySlackSignature(
signingSecret: string,
signature: string | undefined,
timestamp: string | undefined,
rawBody: string,
): boolean {
if (!signature || !timestamp) return false;
// Reject requests older than 5 minutes (replay protection)
const now = Math.floor(Date.now() / 1000);
if (Math.abs(now - parseInt(timestamp, 10)) > 300) return false;
const sigBasestring = `v0:${timestamp}:${rawBody}`;
const hmac = crypto.createHmac('sha256', signingSecret).update(sigBasestring).digest('hex');
const expected = `v0=${hmac}`;
return crypto.timingSafeEqual(Buffer.from(signature), Buffer.from(expected));
}
interface SlackAction {
action_id: string;
value?: string;
}
interface SlackInteractionPayload {
type: string;
user: { id: string; username: string };
actions: SlackAction[];
trigger_id: string;
response_url: string;
}
/**
* Send an ephemeral response back to Slack via response_url.
*/
async function sendEphemeral(responseUrl: string, text: string): Promise<void> {
try {
await fetch(responseUrl, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ response_type: 'ephemeral', replace_original: false, text }),
});
} catch (err) {
logger.error({ error: (err as Error).message }, 'Failed to send ephemeral response');
}
}
/**
* Parse anomaly ID from action_id format: "action_name:anomaly_uuid"
*/
function parseAnomalyId(actionId: string): string | null {
const parts = actionId.split(':');
return parts.length >= 2 ? parts.slice(1).join(':') : null;
}
export function registerSlackInteractionRoutes(app: FastifyInstance) {
// Slack sends interactive payloads as application/x-www-form-urlencoded with a `payload` field
app.addContentTypeParser(
'application/x-www-form-urlencoded',
{ parseAs: 'string' },
(_req, body, done) => {
done(null, body);
},
);
app.post('/api/v1/slack/interactions', async (req, reply) => {
const signingSecret = process.env.SLACK_SIGNING_SECRET;
if (!signingSecret) {
logger.error('SLACK_SIGNING_SECRET not configured');
return reply.status(500).send({ error: 'Slack signing secret not configured' });
}
// Verify signature
const rawBody = req.body as string;
const signature = req.headers['x-slack-signature'] as string | undefined;
const timestamp = req.headers['x-slack-request-timestamp'] as string | undefined;
if (!verifySlackSignature(signingSecret, signature, timestamp, rawBody)) {
logger.warn('Invalid Slack signature');
return reply.status(401).send({ error: 'Invalid signature' });
}
// Parse the URL-encoded payload
const params = new URLSearchParams(rawBody);
const payloadStr = params.get('payload');
if (!payloadStr) {
return reply.status(400).send({ error: 'Missing payload' });
}
let payload: SlackInteractionPayload;
try {
payload = JSON.parse(payloadStr);
} catch {
return reply.status(400).send({ error: 'Invalid payload JSON' });
}
if (payload.type !== 'block_actions') {
logger.info({ type: payload.type }, 'Ignoring non-block_actions interaction');
return reply.status(200).send();
}
for (const action of payload.actions) {
const anomalyId = parseAnomalyId(action.action_id);
if (!anomalyId) {
logger.warn({ actionId: action.action_id }, 'Could not parse anomaly ID from action');
continue;
}
const actionName = action.action_id.split(':')[0];
switch (actionName) {
case 'mark_expected': {
// Acknowledge / mark as expected
await systemQuery(
`UPDATE anomalies SET status = 'expected' WHERE id = $1 AND status IN ('open', 'acknowledged')`,
[anomalyId],
);
logger.info({ anomalyId, user: payload.user.username }, 'Anomaly marked as expected');
await sendEphemeral(
payload.response_url,
`✅ Anomaly \`${anomalyId.slice(0, 8)}\` marked as expected by @${payload.user.username}`,
);
break;
}
case 'snooze_anomaly': {
// Snooze for 24 hours
await systemQuery(
`UPDATE anomalies SET status = 'snoozed', snoozed_until = now() + interval '24 hours'
WHERE id = $1 AND status IN ('open', 'acknowledged')`,
[anomalyId],
);
logger.info({ anomalyId, user: payload.user.username }, 'Anomaly snoozed 24h');
await sendEphemeral(
payload.response_url,
`😴 Anomaly \`${anomalyId.slice(0, 8)}\` snoozed for 24 hours by @${payload.user.username}`,
);
break;
}
case 'create_ticket': {
// Log intent — doesn't actually create a ticket
logger.info(
{ anomalyId, user: payload.user.username, trigger: payload.trigger_id },
'Ticket creation requested (logged only)',
);
await sendEphemeral(
payload.response_url,
`🎫 Ticket creation logged for anomaly \`${anomalyId.slice(0, 8)}\` by @${payload.user.username}. Integration pending.`,
);
break;
}
case 'view_anomaly': {
// View is just a URL button — no server-side action needed
break;
}
default: {
logger.warn({ actionName, actionId: action.action_id }, 'Unknown Slack action');
await sendEphemeral(payload.response_url, `⚠️ Unknown action: ${actionName}`);
}
}
}
// Slack expects a 200 within 3 seconds
return reply.status(200).send();
});
}

View File

@@ -0,0 +1,148 @@
import pino from 'pino';
import { WelfordBaseline, scoreAnomaly, type CostEvent } from './detection/scorer.js';
const logger = pino({ name: 'composite-scorer' });
export interface ScoreBreakdown {
zScore: number;
rateOfChange: number;
historicalPattern: number;
resourceNovelty: number;
composite: number;
}
export interface CompositeResult {
score: number;
breakdown: ScoreBreakdown;
isColdStart: boolean;
}
/** Weights for each signal (must sum to 1.0) */
const WEIGHTS = {
zScore: 0.40,
rateOfChange: 0.25,
historicalPattern: 0.20,
resourceNovelty: 0.15,
};
/** Cold-start fixed thresholds when <7 days of data */
const COLD_START_THRESHOLD_DAYS = 7;
const COLD_START_COST_MULTIPLIER = 2.0; // 2x average = anomaly
/**
* Compute rate-of-change score (0-100).
* Compares current cost to the previous cost to measure acceleration.
*/
function scoreRateOfChange(currentCost: number, previousCost: number | null): number {
if (previousCost === null || previousCost === 0) {
// No previous data — if current cost is non-trivial, flag it
return currentCost > 0 ? 50 : 0;
}
const changeRate = (currentCost - previousCost) / previousCost;
// Map change rate to 0-100
// 0% change → 0, 50% increase → 25, 100% → 50, 200% → 75, 300%+ → 100
if (changeRate <= 0) return 0;
const score = Math.min(100, changeRate * 33.33);
return Math.round(score * 100) / 100;
}
/**
* Compute historical pattern score (0-100).
* Lower score if this looks like a recurring spike (e.g., monthly batch job).
* Higher score if the spike is unprecedented.
*
* @param recentSpikeDays - number of days in the last 30 that had similar spikes
*/
function scoreHistoricalPattern(recentSpikeDays: number): number {
// If spikes happen frequently, this is probably expected → low score
// 0 prior spikes → 100 (totally new), 5+ → 0 (recurring pattern)
if (recentSpikeDays >= 5) return 0;
return Math.round((1 - recentSpikeDays / 5) * 100);
}
/**
* Compute resource novelty score (0-100).
* New resource types that have never been seen get a higher score.
*
* @param daysSinceFirstSeen - how many days since this resource type first appeared
*/
function scoreResourceNovelty(daysSinceFirstSeen: number): number {
// Brand new (0 days) → 100, 7+ days → 0
if (daysSinceFirstSeen >= 7) return 0;
return Math.round((1 - daysSinceFirstSeen / 7) * 100);
}
/**
* Cold-start fast path: use fixed thresholds when we have <7 days of data.
*/
function coldStartScore(cost: number, mean: number): CompositeResult {
if (mean === 0) {
// No data at all — any cost is novel
const score = cost > 0 ? 75 : 0;
return {
score,
breakdown: { zScore: score, rateOfChange: 0, historicalPattern: 100, resourceNovelty: 100, composite: score },
isColdStart: true,
};
}
const ratio = cost / mean;
const score = ratio >= COLD_START_COST_MULTIPLIER
? Math.min(100, Math.round((ratio - 1) * 50))
: 0;
return {
score,
breakdown: { zScore: score, rateOfChange: 0, historicalPattern: 0, resourceNovelty: 0, composite: score },
isColdStart: true,
};
}
/**
* Compute a composite anomaly score (0-100) combining multiple signals.
*/
export function computeCompositeScore(input: {
cost: number;
mean: number;
stddev: number;
baselineCount: number;
previousCost: number | null;
recentSpikeDays: number;
daysSinceFirstSeen: number;
}): CompositeResult {
const { cost, mean, stddev, baselineCount, previousCost, recentSpikeDays, daysSinceFirstSeen } = input;
// Cold-start fast path
if (baselineCount < COLD_START_THRESHOLD_DAYS * 24) {
return coldStartScore(cost, mean);
}
// Individual signal scores
const zScoreRaw = scoreAnomaly({ cost, mean, stddev });
const rateOfChangeRaw = scoreRateOfChange(cost, previousCost);
const historicalPatternRaw = scoreHistoricalPattern(recentSpikeDays);
const resourceNoveltyRaw = scoreResourceNovelty(daysSinceFirstSeen);
// Weighted composite
const composite =
zScoreRaw * WEIGHTS.zScore +
rateOfChangeRaw * WEIGHTS.rateOfChange +
historicalPatternRaw * WEIGHTS.historicalPattern +
resourceNoveltyRaw * WEIGHTS.resourceNovelty;
const score = Math.round(Math.min(100, Math.max(0, composite)) * 100) / 100;
return {
score,
breakdown: {
zScore: zScoreRaw,
rateOfChange: rateOfChangeRaw,
historicalPattern: historicalPatternRaw,
resourceNovelty: resourceNoveltyRaw,
composite: score,
},
isColdStart: false,
};
}

View File

@@ -9,6 +9,8 @@ import { registerAnomalyRoutes } from './api/anomalies.js';
import { registerBaselineRoutes } from './api/baselines.js';
import { registerGovernanceRoutes } from './api/governance.js';
import { registerIngestionRoutes } from './api/ingestion.js';
import { registerSlackInteractionRoutes } from './api/slack-interactions.js';
import { startZombieHunter } from './workers/zombie-hunter.js';
const logger = pino({ name: 'dd0c-cost', level: config.LOG_LEVEL });
@@ -27,6 +29,11 @@ app.get('/version', async () => ({ version: process.env.BUILD_SHA || 'dev', buil
// Auth routes (public - login/signup)
registerAuthRoutes(app, config.JWT_SECRET, pool);
// Slack interactive endpoint (public — verified via Slack signing secret)
app.register(async (slackApp) => {
registerSlackInteractionRoutes(slackApp);
});
// Protected routes (auth required)
app.register(async function protectedRoutes(protectedApp) {
protectedApp.addHook('onRequest', authHook(config.JWT_SECRET, pool));
@@ -40,6 +47,10 @@ app.register(async function protectedRoutes(protectedApp) {
try {
await app.listen({ port: config.PORT, host: '0.0.0.0' });
logger.info({ port: config.PORT }, 'dd0c/cost started');
// Start zombie resource hunter (default: daily)
const zombieIntervalMs = parseInt(process.env.ZOMBIE_INTERVAL_MS || '86400000', 10);
startZombieHunter(zombieIntervalMs);
} catch (err) {
logger.fatal(err, 'Failed to start');
process.exit(1);

View File

@@ -0,0 +1,172 @@
import pino from 'pino';
import { withTenant, systemQuery } from '../data/db.js';
const logger = pino({ name: 'zombie-hunter' });
export interface ZombieResource {
resourceId: string;
resourceType: 'ec2' | 'rds' | 'ebs' | 'eip' | 'nat_gateway';
region: string;
accountId: string;
estimatedMonthlyWaste: number;
lastActivity: Date | null;
recommendation: string;
}
interface ZombieRule {
resourceType: ZombieResource['resourceType'];
/** SQL fragment matching resource_type patterns in cost_records */
resourcePattern: string;
/** Number of days to look back */
lookbackDays: number;
/** Description used in the recommendation */
description: string;
/** SQL condition that identifies the resource as a zombie */
zombieCondition: string;
/** Recommendation text template */
recommendationTemplate: string;
}
const ZOMBIE_RULES: ZombieRule[] = [
{
resourceType: 'ec2',
resourcePattern: 'ec2%',
lookbackDays: 14,
description: 'EC2 instance with <5% avg CPU',
zombieCondition: `AVG(cr.hourly_cost) < 0.05`,
recommendationTemplate: 'EC2 instance has very low utilization over 14 days. Consider stopping or downsizing.',
},
{
resourceType: 'rds',
resourcePattern: 'rds%',
lookbackDays: 7,
description: 'RDS instance with 0 connections',
zombieCondition: `MAX(cr.hourly_cost) > 0 AND COUNT(*) FILTER (WHERE cr.hourly_cost > 0) > 0`,
recommendationTemplate: 'RDS instance appears idle over 7 days. Consider creating a snapshot and deleting.',
},
{
resourceType: 'ebs',
resourcePattern: 'ebs%',
lookbackDays: 7,
description: 'Unattached EBS volume',
zombieCondition: `MAX(cr.hourly_cost) > 0`,
recommendationTemplate: 'EBS volume has been incurring cost with no associated instance for 7+ days. Consider snapshotting and deleting.',
},
{
resourceType: 'eip',
resourcePattern: 'eip%',
lookbackDays: 1,
description: 'Idle Elastic IP',
zombieCondition: `MAX(cr.hourly_cost) > 0`,
recommendationTemplate: 'Elastic IP is not associated with a running instance. Release to avoid charges.',
},
{
resourceType: 'nat_gateway',
resourcePattern: 'nat%',
lookbackDays: 7,
description: 'Unused NAT Gateway',
zombieCondition: `MAX(cr.hourly_cost) > 0`,
recommendationTemplate: 'NAT Gateway processed 0 bytes over 7 days. Consider removing if unused.',
},
];
/**
* Scan cost_records for zombie resources across all tenants.
* Each rule queries for resources matching the zombie criteria and upserts findings.
*/
export async function runZombieHunt(): Promise<number> {
logger.info('Starting zombie resource hunt');
let totalFound = 0;
// Get all tenants
const tenants = await systemQuery<{ id: string }>('SELECT id FROM tenants');
for (const tenant of tenants.rows) {
const tenantId = tenant.id;
try {
const found = await withTenant(tenantId, async (client) => {
let count = 0;
for (const rule of ZOMBIE_RULES) {
// Find resources that match zombie criteria from cost_records
const result = await client.query(
`SELECT
cr.account_id,
cr.resource_type,
cr.region,
SUM(cr.hourly_cost) * 730 AS estimated_monthly_waste,
MAX(cr.detected_at) AS last_activity
FROM anomalies cr
WHERE cr.resource_type ILIKE $1
AND cr.detected_at > now() - ($2 || ' days')::interval
GROUP BY cr.account_id, cr.resource_type, cr.region
HAVING ${rule.zombieCondition}`,
[rule.resourcePattern, rule.lookbackDays],
);
for (const row of result.rows) {
const resourceId = `${row.resource_type}:${row.account_id}:${row.region}`;
await client.query(
`INSERT INTO zombie_resources
(tenant_id, resource_id, resource_type, region, account_id,
estimated_monthly_waste, last_activity, recommendation)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
ON CONFLICT (tenant_id, resource_id, resource_type)
DO UPDATE SET
estimated_monthly_waste = EXCLUDED.estimated_monthly_waste,
last_activity = EXCLUDED.last_activity,
detected_at = now()`,
[
tenantId,
resourceId,
rule.resourceType,
row.region,
row.account_id,
parseFloat(row.estimated_monthly_waste) || 0,
row.last_activity,
rule.recommendationTemplate,
],
);
count++;
}
}
return count;
});
totalFound += found;
if (found > 0) {
logger.info({ tenantId, found }, 'Zombie resources detected');
}
} catch (err) {
logger.error({ tenantId, error: (err as Error).message }, 'Zombie hunt failed for tenant');
}
}
logger.info({ totalFound }, 'Zombie hunt complete');
return totalFound;
}
/**
* Start the zombie hunter on a recurring interval.
* Default: every 24 hours (86400000 ms).
*/
export function startZombieHunter(intervalMs = 86_400_000): NodeJS.Timeout {
logger.info({ intervalMs }, 'Scheduling zombie hunter');
// Run once on startup (delayed 30s to let the server settle)
setTimeout(() => {
runZombieHunt().catch((err) =>
logger.error({ error: (err as Error).message }, 'Initial zombie hunt failed'),
);
}, 30_000);
// Then run on interval
return setInterval(() => {
runZombieHunt().catch((err) =>
logger.error({ error: (err as Error).message }, 'Scheduled zombie hunt failed'),
);
}, intervalMs);
}

View File

@@ -0,0 +1,38 @@
-- 005_classifier_audit.sql
CREATE TABLE IF NOT EXISTS runbook_steps (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
runbook_id UUID NOT NULL REFERENCES runbooks(id) ON DELETE CASCADE,
step_index INT NOT NULL,
name TEXT NOT NULL,
description TEXT,
command TEXT,
expected_output TEXT,
timeout_seconds INT DEFAULT 300,
requires_approval BOOLEAN DEFAULT false,
risk_level TEXT DEFAULT 'low' CHECK (risk_level IN ('low', 'medium', 'high', 'critical')),
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
UNIQUE(runbook_id, step_index)
);
ALTER TABLE runbook_steps ENABLE ROW LEVEL SECURITY;
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_policies WHERE tablename = 'runbook_steps' AND policyname = 'tenant_iso_runbook_steps'
) THEN
CREATE POLICY tenant_iso_runbook_steps ON runbook_steps
FOR ALL
USING (tenant_id::text = current_setting('app.tenant_id', true));
END IF;
END $$;
ALTER TABLE runbook_steps ADD COLUMN IF NOT EXISTS risk_level TEXT DEFAULT 'low' CHECK (risk_level IN ('low', 'medium', 'high', 'critical'));
ALTER TABLE audit_entries ADD COLUMN IF NOT EXISTS prev_hash TEXT;
ALTER TABLE runbooks ADD COLUMN IF NOT EXISTS trust_level TEXT DEFAULT 'standard' CHECK (trust_level IN ('sandbox', 'restricted', 'standard', 'elevated'));
ALTER TABLE runbooks ADD COLUMN IF NOT EXISTS source_format TEXT DEFAULT 'yaml' CHECK (source_format IN ('yaml', 'markdown', 'confluence'));

View File

@@ -16,6 +16,7 @@
"@slack/web-api": "^7.1.0",
"fastify": "^4.28.0",
"ioredis": "^5.4.0",
"js-yaml": "^4.1.1",
"jsonwebtoken": "^9.0.2",
"pg": "^8.12.0",
"pino": "^9.1.0",
@@ -23,6 +24,7 @@
"zod": "^3.23.0"
},
"devDependencies": {
"@types/js-yaml": "^4.0.9",
"@types/jsonwebtoken": "^9.0.6",
"@types/node": "^20.14.0",
"@types/pg": "^8.11.0",
@@ -1603,6 +1605,13 @@
"@types/node": "*"
}
},
"node_modules/@types/js-yaml": {
"version": "4.0.9",
"resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz",
"integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg==",
"dev": true,
"license": "MIT"
},
"node_modules/@types/json-schema": {
"version": "7.0.15",
"resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz",
@@ -1982,7 +1991,6 @@
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
"integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==",
"dev": true,
"license": "Python-2.0"
},
"node_modules/array-buffer-byte-length": {
@@ -4340,7 +4348,6 @@
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz",
"integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==",
"dev": true,
"license": "MIT",
"dependencies": {
"argparse": "^2.0.1"

View File

@@ -19,6 +19,7 @@
"@slack/web-api": "^7.1.0",
"fastify": "^4.28.0",
"ioredis": "^5.4.0",
"js-yaml": "^4.1.1",
"jsonwebtoken": "^9.0.2",
"pg": "^8.12.0",
"pino": "^9.1.0",
@@ -26,6 +27,7 @@
"zod": "^3.23.0"
},
"devDependencies": {
"@types/js-yaml": "^4.0.9",
"@types/jsonwebtoken": "^9.0.6",
"@types/node": "^20.14.0",
"@types/pg": "^8.11.0",

View File

@@ -0,0 +1,6 @@
import type { RunbookStep } from '../parsers/index.js';
import { classifyStep } from './safety-scanner.js';
export function classifyRunbook(steps: RunbookStep[]): RunbookStep[] {
return steps.map(classifyStep);
}

View File

@@ -0,0 +1,49 @@
import type { RunbookStep } from '../parsers/index.js';
export function classifyStep(step: RunbookStep): RunbookStep {
if (!step.command) {
return { ...step, risk_level: 'low', requires_approval: false };
}
const cmd = step.command.toLowerCase();
// Critical
if (
cmd.includes('rm -rf') ||
cmd.includes('drop table') ||
cmd.includes('delete from') ||
cmd.includes('shutdown') ||
cmd.includes('reboot') ||
cmd.includes('kill -9') ||
cmd.includes('iptables -f')
) {
return { ...step, risk_level: 'critical', requires_approval: true };
}
// High (Privilege escalation & Network)
if (
cmd.includes('sudo') ||
cmd.includes('chmod 777') ||
cmd.includes('chown root') ||
cmd.includes('iptables') ||
cmd.includes('route add') ||
cmd.includes('route del') ||
cmd.includes('/etc/resolv.conf')
) {
return { ...step, risk_level: 'high', requires_approval: true };
}
// Medium (Modifying config, restarting services)
if (
cmd.includes('systemctl restart') ||
cmd.includes('service restart') ||
cmd.includes('sed -i') ||
cmd.includes('mv ') ||
cmd.includes('cp ')
) {
return { ...step, risk_level: 'medium', requires_approval: true };
}
// Default to low
return { ...step, risk_level: 'low', requires_approval: step.requires_approval || false };
}

View File

@@ -0,0 +1,80 @@
import type { RunbookStep } from './index.js';
export function parseConfluenceRunbook(html: string): RunbookStep[] {
const steps: RunbookStep[] = [];
// Try table parsing first
// Very simplistic HTML table extraction for Node without DOMparser
// We look for <tr> with <td> elements.
const rowRegex = /<tr[^>]*>(.*?)<\/tr>/gis;
const colRegex = /<td[^>]*>(.*?)<\/td>/gis;
let match;
let order = 1;
while ((match = rowRegex.exec(html)) !== null) {
const rowHtml = match[1];
const cols: string[] = [];
let colMatch;
// reset regex index
const colRegexClone = new RegExp(colRegex);
while ((colMatch = colRegexClone.exec(rowHtml)) !== null) {
// strip inner HTML tags
cols.push(colMatch[1].replace(/<[^>]*>/g, '').trim());
}
if (cols.length >= 2) {
// Assume Column 1: Step Name/Description, Column 2: Action/Command, Column 3: Expected
const nameDesc = cols[0];
const command = cols[1];
const expected = cols[2] || '';
// Skip headers
if (nameDesc.toLowerCase().includes('step') && command.toLowerCase().includes('action')) {
continue;
}
if (!command) continue;
steps.push({
order: order++,
name: nameDesc.split('\n')[0].substring(0, 50) || `Step ${order}`,
description: nameDesc,
command: command,
expected_output: expected,
timeout_seconds: 300,
requires_approval: false,
risk_level: 'low'
});
}
}
if (steps.length > 0) return steps;
// Fallback: Numbered procedure lists
// Search for <ol> ... </ol> and extract <li>
const olRegex = /<ol[^>]*>(.*?)<\/ol>/gis;
const liRegex = /<li[^>]*>(.*?)<\/li>/gis;
let olMatch;
while ((olMatch = olRegex.exec(html)) !== null) {
let liMatch;
const liRegexClone = new RegExp(liRegex);
while ((liMatch = liRegexClone.exec(olMatch[1])) !== null) {
const text = liMatch[1].replace(/<[^>]*>/g, '').trim();
// Attempt to extract command, e.g. from <code> tags if we kept them, but we stripped them.
// We'll just put the text as description and name.
steps.push({
order: order++,
name: text.substring(0, 50) + '...',
description: text,
command: '', // Cannot easily reliably extract command from plain text without markers
timeout_seconds: 300,
requires_approval: false,
risk_level: 'low'
});
}
}
return steps;
}

View File

@@ -0,0 +1,37 @@
export interface RunbookStep {
order: number;
name: string;
description: string;
command?: string;
expected_output?: string;
timeout_seconds: number;
requires_approval: boolean;
risk_level: 'low' | 'medium' | 'high' | 'critical';
}
import { parseYamlRunbook } from './yaml-parser.js';
import { parseMarkdownRunbook } from './markdown-parser.js';
import { parseConfluenceRunbook } from './confluence-parser.js';
export function parseRunbook(content: string, format: 'yaml' | 'markdown' | 'confluence'): RunbookStep[] {
switch (format) {
case 'yaml':
return parseYamlRunbook(content);
case 'markdown':
return parseMarkdownRunbook(content);
case 'confluence':
return parseConfluenceRunbook(content);
default:
throw new Error(`Unsupported runbook format: ${format}`);
}
}
export function detectFormat(content: string): 'yaml' | 'markdown' | 'confluence' {
if (content.includes('<!DOCTYPE html>') || content.includes('<table class="confluenceTable">') || content.includes('<div id="main-content"')) {
return 'confluence';
}
if (content.trim().startsWith('#') || content.includes('```bash') || content.includes('```sh') || content.match(/^\d+\.\s+/m)) {
return 'markdown';
}
return 'yaml';
}

View File

@@ -0,0 +1,81 @@
import type { RunbookStep } from './index.js';
export function parseMarkdownRunbook(content: string): RunbookStep[] {
const steps: RunbookStep[] = [];
const lines = content.split('\n');
let currentStep: Partial<RunbookStep> | null = null;
let inCodeBlock = false;
let codeBuffer: string[] = [];
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
// Check for code block toggle
if (line.trim().startsWith('```')) {
inCodeBlock = !inCodeBlock;
if (!inCodeBlock && currentStep) {
// Just closed a code block
if (!currentStep.command) {
currentStep.command = codeBuffer.join('\n').trim();
} else {
// If we already have a command, maybe this is expected output?
currentStep.expected_output = codeBuffer.join('\n').trim();
}
codeBuffer = [];
} else if (inCodeBlock) {
// Just opened a code block, reset buffer
codeBuffer = [];
}
continue;
}
if (inCodeBlock) {
codeBuffer.push(line);
continue;
}
// Check for numbered list (e.g. "1. Do something")
const stepMatch = line.match(/^(\d+)\.\s+(.*)$/);
if (stepMatch) {
// If we have an existing step, save it
if (currentStep) {
steps.push(finalizeStep(currentStep, steps.length + 1));
}
currentStep = {
name: stepMatch[2].trim(),
description: '',
};
continue;
}
// Accumulate description
if (currentStep && line.trim() && !line.trim().startsWith('#')) {
if (currentStep.description) {
currentStep.description += '\n' + line.trim();
} else {
currentStep.description = line.trim();
}
}
}
if (currentStep) {
steps.push(finalizeStep(currentStep, steps.length + 1));
}
return steps;
}
function finalizeStep(step: Partial<RunbookStep>, index: number): RunbookStep {
return {
order: index,
name: step.name || `Step ${index}`,
description: step.description || '',
command: step.command,
expected_output: step.expected_output,
timeout_seconds: 300,
requires_approval: false,
risk_level: 'low'
};
}

View File

@@ -0,0 +1,24 @@
import yaml from 'js-yaml';
import type { RunbookStep } from './index.js';
export function parseYamlRunbook(content: string): RunbookStep[] {
const parsed = yaml.load(content) as any;
const stepsData = Array.isArray(parsed) ? parsed : (parsed?.steps || []);
if (!Array.isArray(stepsData)) {
throw new Error('YAML runbook must be an array or contain a "steps" array');
}
return stepsData.map((step: any, index: number): RunbookStep => {
return {
order: index + 1,
name: step.name || `Step ${index + 1}`,
description: step.description || '',
command: step.command,
expected_output: step.expected_output,
timeout_seconds: step.timeout_seconds || 300,
requires_approval: step.requires_approval === true,
risk_level: step.risk_level || 'low'
};
});
}

View File

@@ -0,0 +1,86 @@
# dd0c Platform - BDD Specification Gap Analysis
## Executive Summary
This gap analysis compares the BDD acceptance specifications against the currently implemented Node.js/Fastify source code and PostgreSQL database migrations for the dd0c monorepo (P2-P6).
Overall, the **Dashboard APIs** required by the React Console are highly implemented across all services. The frontend will successfully render and operate. The major gaps lie in the out-of-band background workers, external agents, robust message queuing (SQS/DLQ), and advanced intelligence/scoring heuristics.
**Estimated Implementation Completion:**
* **P4 - Lightweight IDP:** ~75% (Core scanners, catalog, and search are functional)
* **P3 - Alert Intelligence:** ~65% (Ingestion, basic correlation, and UI APIs are solid)
* **P5 - AWS Cost Anomaly:** ~50% (Scorer and APIs exist, but CloudTrail ingestion is missing)
* **P6 - Runbook Automation:** ~40% (APIs and Slackbot exist; parsing, classification, and agent execution are completely missing)
* **P2 - IaC Drift Detection:** ~30% (SaaS ingestion APIs exist; the entire external agent, mTLS, and diff engines are missing)
---
## Per-Service Breakdown by Epic
### P2: IaC Drift Detection
* **Epic 1: Drift Detection Agent****MISSING** - No Go agent binary. Terraform, CloudFormation, Kubernetes, and Pulumi state scanning engines do not exist. Secret scrubbing logic is missing.
* **Epic 2: Agent Communication** 🟡 **PARTIAL** - Basic HTTP ingestion route exists (`/v1/ingest/drift`), but mTLS authentication and SQS FIFO message queues are not implemented.
* **Epic 3: Event Processor** 🟡 **PARTIAL** - Ingestion, nonce replay prevention, and PostgreSQL persistence with RLS are implemented. Missing canonical schema normalization and chunked report reassembly.
* **Epic 4: Notification Engine** 🟡 **PARTIAL** - Slack Block Kit, Email (Resend), Webhook, and PagerDuty dispatchers are implemented. Missing Daily Digest job and severity-based routing logic.
* **Epic 5: Remediation****MISSING** - Interactive Slack buttons exist in notification payloads, but the backend workflow engine, approval tracking, and agent-side execution dispatch are missing.
* **Epic 6 & 7: Dashboard UI & API****IMPLEMENTED** - `fetchStacks`, `fetchStackHistory`, and `fetchLatestReport` endpoints are fully implemented with tenant RLS.
* **Epic 8 & 9: Infrastructure / PLG****MISSING** - No CDK templates, CI/CD pipelines, Stripe billing, or CLI setup logic.
* **Epic 10: Transparent Factory** 🟡 **PARTIAL** - Database migrations and RLS are implemented. Missing Feature Flag service and OTEL Tracing.
### P3: Alert Intelligence
* **Epic 1: Webhook Ingestion** 🟡 **PARTIAL** - Webhook routes and HMAC validation for Datadog, PagerDuty, OpsGenie, and Grafana are implemented via Redis queue. Missing S3 archival, oversized payload handling, and SQS/DLQ.
* **Epic 2: Alert Normalization** 🟡 **PARTIAL** - Basic provider mapping logic exists in `webhook-processor.ts`.
* **Epic 3: Correlation Engine** 🟡 **PARTIAL** - Time-window correlation and fingerprint deduplication are implemented using Redis. Missing Service-Affinity matching and strict cross-tenant worker isolation.
* **Epic 4: Notification & Escalation** 🟡 **PARTIAL** - Slack, Email, and Webhook dispatchers are implemented. Missing PagerDuty auto-escalation cron and Daily Noise Report.
* **Epic 5: Slack Bot** 🟡 **PARTIAL** - Missing interactive feedback button handlers (`/slack/interactions`) for noise/helpful marking, and missing `/dd0c` slash commands.
* **Epic 6 & 7: Dashboard UI & API** 🟡 **PARTIAL** - Incident CRUD, filtering, and summary endpoints are implemented. Missing `MTTR` and `Noise Reduction` analytics endpoints requested by the spec.
* **Epic 8 & 9: Infrastructure / PLG****MISSING** - No CDK, Stripe billing, or Free Tier (10K alerts/month) limit enforcement.
### P4: Lightweight IDP
* **Epic 1: AWS Discovery Scanner** 🟡 **PARTIAL** - ECS, Lambda, and RDS resource discovery implemented. Missing CloudFormation, API Gateway, and Step Functions orchestration.
* **Epic 2: GitHub Discovery Scanner** 🟡 **PARTIAL** - Repository fetching, pagination, and basic `package.json`/`Dockerfile` heuristics implemented. Missing advanced CODEOWNERS and commit history parsing.
* **Epic 3: Service Catalog** 🟡 **PARTIAL** - Catalog ingestion, partial update staging, ownership resolution, and DB APIs implemented. Missing PagerDuty/OpsGenie on-call mapping.
* **Epic 4: Search Engine** 🟡 **PARTIAL** - Meilisearch integration with PostgreSQL fallback implemented. Missing Redis prefix caching for `Cmd+K` performance optimization.
* **Epic 5: Dashboard API****IMPLEMENTED** - Service CRUD and ownership summary endpoints are fully functional and align with Console requirements.
* **Epic 6: Analytics Dashboards****MISSING** - API endpoints for Ownership Coverage, Health Scorecards, and Tech Debt tracking are missing.
### P5: AWS Cost Anomaly
* **Epic 1: CloudTrail Ingestion****MISSING** - A batch ingestion API exists, but the AWS EventBridge cross-account rules, SQS FIFO, and Lambda normalizer are entirely missing.
* **Epic 2: Anomaly Detection** 🟡 **PARTIAL** - Welford's algorithm and basic Z-Score computation are implemented. Missing novelty scoring, cold-start fast path, and composite scoring logic.
* **Epic 3: Zombie Hunter****MISSING** - No scheduled jobs or logic to detect idle EC2, RDS, or EBS resources.
* **Epic 4: Notification & Remediation** 🟡 **PARTIAL** - Slack notification generation is implemented. Missing the `/slack/interactions` endpoint to process remediation buttons (e.g., Stop Instance).
* **Epic 6 & 7: Dashboard UI & API****IMPLEMENTED** - Anomalies, Baselines, and Governance rule CRUD endpoints match Console expectations.
* **Epic 10: Transparent Factory** 🟡 **PARTIAL** - The 14-day `GovernanceEngine` (Shadow -> Audit -> Enforce) auto-promotion and Panic Mode logic is implemented. Missing Circuit Breakers and OTEL spans.
### P6: Runbook Automation
* **Epic 1: Runbook Parser****MISSING** - The system currently expects raw YAML inputs. Confluence HTML, Notion Markdown, and LLM step extraction parsing engines are entirely missing.
* **Epic 2: Action Classifier****MISSING** - Neither the deterministic regex safety scanner nor the secondary LLM risk classifier exist.
* **Epic 3: Execution Engine** 🟡 **PARTIAL** - Basic state transitions are handled in `api/runbooks.ts`. Missing Trust Level enforcement, network partition recovery, and step idempotency logic.
* **Epic 4: Agent****MISSING** - No Go agent binary, gRPC bidirectional streaming, or local sandbox execution environments exist.
* **Epic 5: Audit Trail** 🟡 **PARTIAL** - Basic Postgres `audit_entries` table exists. Missing the immutable append-only hash chain logic and CSV/PDF compliance export APIs.
* **Epic 6: Dashboard API****IMPLEMENTED** - Runbook, execution, and approval APIs are implemented. Redis pub/sub Agent Bridge exists. Slackbot interaction handlers are fully implemented with signature verification.
---
## Priority Ranking (What to Implement Next)
This ranking is based on maximizing time-to-value: prioritizing services where the Console UI is already supported, the backend logic is mostly complete, and the remaining gaps are well-defined.
**1. P4 - Lightweight IDP**
* **Why:** It is functionally the most complete. The Console APIs work, Meilisearch sync works, and basic AWS/GitHub discovery is operational.
* **Next Steps:** Implement the missing AWS scanners (CloudFormation, API Gateway) and the `Redis` prefix caching for search. Add the analytics endpoints (Ownership, Health, Tech Debt) to unlock the remaining UI views.
**2. P3 - Alert Intelligence**
* **Why:** The core pipeline (Webhook -> Redis -> Worker -> DB) is functional and deduplication logic works. Console APIs are satisfied.
* **Next Steps:** Build the `MTTR` and `Noise Reduction` analytics SQL queries, add PagerDuty escalation triggers, and implement the interactive Slack button handlers.
**3. P5 - AWS Cost Anomaly**
* **Why:** The complex math (Welford running stats) and database governance logic are done, making the dashboard functional for demo data.
* **Next Steps:** The biggest blocker is that there is no data pipeline. Implement the CDK stack to deploy the EventBridge rules and the `Lambda Normalizer` to translate CloudTrail events into the existing `/v1/ingest` API.
**4. P6 - Runbook Automation**
* **Why:** The API orchestration, Slack integrations, and Redis Pub/Sub bridges are nicely implemented, but it is currently a "brain without a body."
* **Next Steps:** It requires two massive standalone systems: the `Runbook Parser` (LLM + AST logic) and the actual external `Agent` (Go binary with gRPC and sandboxing).
**5. P2 - IaC Drift Detection**
* **Why:** Furthest from completion. While the SaaS API exists, it requires a highly complex external Go agent capable of reading Terraform/K8s/Pulumi state, a secure mTLS CA registration system, and a diffing/scoring engine—none of which currently exist.