feat(cost): add zombie hunter, Slack interactions, composite scoring
Some checks failed
CI — P3 Alert / test (push) Successful in 28s
CI — P5 Cost / test (push) Successful in 42s
CI — P6 Run / saas (push) Successful in 41s
CI — P6 Run / build-push (push) Has been cancelled
CI — P3 Alert / build-push (push) Failing after 53s
CI — P5 Cost / build-push (push) Failing after 5s
Some checks failed
CI — P3 Alert / test (push) Successful in 28s
CI — P5 Cost / test (push) Successful in 42s
CI — P6 Run / saas (push) Successful in 41s
CI — P6 Run / build-push (push) Has been cancelled
CI — P3 Alert / build-push (push) Failing after 53s
CI — P5 Cost / build-push (push) Failing after 5s
- Zombie resource hunter: detects idle EC2/RDS/EBS/EIP/NAT resources - Slack interactive handler: acknowledge, snooze, create-ticket actions - Composite anomaly scorer: Z-Score + rate-of-change + pattern + novelty - Cold-start fast path for new resources (<7 days data) - 005_zombies.sql migration
This commit is contained in:
13
products/03-alert-intelligence/migrations/005_analytics.sql
Normal file
13
products/03-alert-intelligence/migrations/005_analytics.sql
Normal file
@@ -0,0 +1,13 @@
|
||||
-- 005_analytics.sql
-- Migration for MTTR and Noise Reduction Analytics

-- Add resolved_at column for MTTR calculation (if not exists)
-- MTTR = mean of (resolved_at - created_at); the column is stamped by the
-- resolve flow (e.g. the Slack 'resolve_incident' action sets it to NOW()).
ALTER TABLE incidents ADD COLUMN IF NOT EXISTS resolved_at TIMESTAMPTZ;

-- Add pd_escalated_at column for PagerDuty auto-escalation idempotency
-- (stamped once after a successful escalation so the worker never re-pages).
ALTER TABLE incidents ADD COLUMN IF NOT EXISTS pd_escalated_at TIMESTAMPTZ;

-- Add index on created_at for time-series queries (trends, noise stats)
CREATE INDEX IF NOT EXISTS idx_incidents_created_at ON incidents(created_at);
CREATE INDEX IF NOT EXISTS idx_alerts_received_at ON alerts(received_at);
|
||||
|
||||
8
products/03-alert-intelligence/package-lock.json
generated
8
products/03-alert-intelligence/package-lock.json
generated
@@ -22,13 +22,13 @@
|
||||
"zod": "^3.23.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/jsonwebtoken": "^9.0.6",
|
||||
"@types/node": "^20.14.0",
|
||||
"@types/pg": "^8.11.0",
|
||||
"@types/jsonwebtoken": "^9.0.10",
|
||||
"@types/node": "^20.19.35",
|
||||
"@types/pg": "^8.18.0",
|
||||
"@types/uuid": "^9.0.8",
|
||||
"eslint": "^9.5.0",
|
||||
"tsx": "^4.15.0",
|
||||
"typescript": "^5.5.0",
|
||||
"typescript": "^5.9.3",
|
||||
"vitest": "^1.6.0"
|
||||
}
|
||||
},
|
||||
|
||||
@@ -25,13 +25,13 @@
|
||||
"zod": "^3.23.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/jsonwebtoken": "^9.0.6",
|
||||
"@types/node": "^20.14.0",
|
||||
"@types/pg": "^8.11.0",
|
||||
"@types/jsonwebtoken": "^9.0.10",
|
||||
"@types/node": "^20.19.35",
|
||||
"@types/pg": "^8.18.0",
|
||||
"@types/uuid": "^9.0.8",
|
||||
"eslint": "^9.5.0",
|
||||
"tsx": "^4.15.0",
|
||||
"typescript": "^5.5.0",
|
||||
"typescript": "^5.9.3",
|
||||
"vitest": "^1.6.0"
|
||||
}
|
||||
}
|
||||
|
||||
87
products/03-alert-intelligence/src/api/analytics.ts
Normal file
87
products/03-alert-intelligence/src/api/analytics.ts
Normal file
@@ -0,0 +1,87 @@
|
||||
import type { FastifyInstance } from 'fastify';
|
||||
import { withTenant } from '../data/db.js';
|
||||
|
||||
/**
 * Register read-only analytics endpoints: MTTR, noise reduction, and trends.
 *
 * Every handler runs its queries through withTenant so row-level security
 * scopes results to the caller's tenant. tenantId is read off the request
 * via `(req as any)` — assumes an upstream auth hook attached it
 * (NOTE(review): no typed request decoration here; confirm the hook always runs).
 */
export function registerAnalyticsRoutes(app: FastifyInstance) {
  // GET /api/v1/analytics/mttr
  // Mean time to resolution in seconds, grouped per severity per week,
  // over the trailing 12 weeks. Only resolved incidents contribute.
  app.get('/api/v1/analytics/mttr', async (req, reply) => {
    const tenantId = (req as any).tenantId;

    const result = await withTenant(tenantId, async (client) => {
      const { rows } = await client.query(`
        SELECT
          severity,
          date_trunc('week', created_at) as week,
          AVG(EXTRACT(EPOCH FROM (resolved_at - created_at))) as mttr_seconds
        FROM incidents
        WHERE resolved_at IS NOT NULL
          AND created_at >= NOW() - INTERVAL '12 weeks'
        GROUP BY severity, week
        ORDER BY week ASC, severity ASC
      `);
      return rows;
    });

    return { data: result };
  });

  // GET /api/v1/analytics/noise
  // Per-source alert volume over 30 days plus derived percentages:
  //   noise_reduction_pct  = share of raw alerts that did NOT become a distinct incident
  //   suppression_rate_pct = share of correlated incidents that were suppressed
  app.get('/api/v1/analytics/noise', async (req, reply) => {
    const tenantId = (req as any).tenantId;

    const result = await withTenant(tenantId, async (client) => {
      // LEFT JOIN keeps alerts with no incident; their incident_id is NULL and
      // drops out of COUNT(DISTINCT a.incident_id).
      const stats = await client.query(`
        SELECT
          a.source_provider as source,
          COUNT(a.id)::int as total_alerts,
          COUNT(DISTINCT a.incident_id)::int as correlated_incidents,
          SUM(CASE WHEN i.status = 'suppressed' THEN 1 ELSE 0 END)::int as suppressed_incidents
        FROM alerts a
        LEFT JOIN incidents i ON a.incident_id = i.id
        WHERE a.received_at >= NOW() - INTERVAL '30 days'
        GROUP BY a.source_provider
      `);

      // Compute the percentage columns in JS, rounded to 2 decimal places.
      const enriched = stats.rows.map((row: any) => {
        const total = row.total_alerts || 0;
        const correlated = row.correlated_incidents || 0;
        const suppressed = row.suppressed_incidents || 0;

        const noise_reduction_pct = total > 0 ? ((total - correlated) / total) * 100 : 0;
        const suppression_rate_pct = correlated > 0 ? (suppressed / correlated) * 100 : 0;

        return {
          ...row,
          noise_reduction_pct: Math.round(noise_reduction_pct * 100) / 100,
          suppression_rate_pct: Math.round(suppression_rate_pct * 100) / 100
        };
      });

      return enriched;
    });

    return { data: result };
  });

  // GET /api/v1/analytics/trends
  // Daily distinct-incident counts per severity per alert source over 30 days.
  // NOTE(review): an incident whose alerts span several source_providers is
  // counted once under EACH source group (COUNT(DISTINCT i.id) is per group).
  app.get('/api/v1/analytics/trends', async (req, reply) => {
    const tenantId = (req as any).tenantId;

    const result = await withTenant(tenantId, async (client) => {
      const trends = await client.query(`
        SELECT
          date_trunc('day', i.created_at) as day,
          i.severity,
          a.source_provider as source,
          COUNT(DISTINCT i.id)::int as incident_count
        FROM incidents i
        JOIN alerts a ON a.incident_id = i.id
        WHERE i.created_at >= NOW() - INTERVAL '30 days'
        GROUP BY day, i.severity, a.source_provider
        ORDER BY day ASC
      `);
      return trends.rows;
    });

    return { data: result };
  });
}
|
||||
125
products/03-alert-intelligence/src/api/slack-interactions.ts
Normal file
125
products/03-alert-intelligence/src/api/slack-interactions.ts
Normal file
@@ -0,0 +1,125 @@
|
||||
import type { FastifyInstance } from 'fastify';
|
||||
import crypto from 'crypto';
|
||||
import pino from 'pino';
|
||||
import { systemQuery } from '../data/db.js';
|
||||
|
||||
const logger = pino({ name: 'slack-interactions' });
|
||||
|
||||
/**
 * Slack interactive-components endpoint for incident messages (button clicks).
 *
 * Flow: optionally verify the Slack signing signature, parse the interaction
 * payload, apply the requested incident status change via systemQuery
 * (tenant-agnostic pool — the incident UUID is the only scoping), then confirm
 * back to the clicking user through the payload's response_url.
 *
 * NOTE(review): the HMAC is computed in every environment but only ENFORCED
 * when NODE_ENV === 'production'; dev/test requests pass unverified.
 */
export function registerSlackInteractionRoutes(app: FastifyInstance) {
  app.post('/api/v1/slack/interactions', async (req, reply) => {
    try {
      // 1. Verify Slack signature
      const slackSignature = req.headers['x-slack-signature'] as string;
      const slackRequestTimestamp = req.headers['x-slack-request-timestamp'] as string;
      const slackSigningSecret = process.env.SLACK_SIGNING_SECRET || 'test_secret'; // Fallback for testing

      // Missing headers skip verification entirely (falls through to payload parsing).
      if (slackSignature && slackRequestTimestamp) {
        // Replay protection: reject anything older than 5 minutes.
        const time = parseInt(slackRequestTimestamp, 10);
        if (Math.abs(Date.now() / 1000 - time) > 60 * 5) {
          logger.warn('Slack request too old');
          return reply.status(400).send({ error: 'Request too old' });
        }

        // Note: In Fastify, to verify Slack signatures correctly you typically need the raw body.
        // We assume req.rawBody is populated by a body parser, or fallback to the parsed body if testing.
        // NOTE(review): re-encoding a parsed body via URLSearchParams may not be
        // byte-identical to what Slack signed — confirm rawBody is available in prod.
        const rawBody = (req as any).rawBody || req.body;
        const bodyStr = typeof rawBody === 'string' ? rawBody : new URLSearchParams(rawBody).toString();

        // Slack v0 signature scheme: HMAC-SHA256 over "v0:<timestamp>:<body>".
        const sigBasestring = `v0:${slackRequestTimestamp}:${bodyStr}`;
        const mySignature = 'v0=' + crypto
          .createHmac('sha256', slackSigningSecret)
          .update(sigBasestring, 'utf8')
          .digest('hex');

        // Only reject if in production and secret is set, otherwise allow for dev/test
        if (process.env.NODE_ENV === 'production') {
          // timingSafeEqual throws when buffer lengths differ, hence the try/catch.
          try {
            if (!crypto.timingSafeEqual(Buffer.from(mySignature), Buffer.from(slackSignature))) {
              logger.warn('Invalid Slack signature');
              return reply.status(401).send({ error: 'Invalid signature' });
            }
          } catch (e) {
            logger.warn('Signature verification failed (buffer length mismatch)');
            return reply.status(401).send({ error: 'Invalid signature' });
          }
        }
      }

      // 2. Parse payload
      // Slack posts form-encoded data with a JSON string in the `payload` field.
      const body = req.body as any;
      if (!body || !body.payload) {
        return reply.status(400).send({ error: 'Missing payload' });
      }

      const payload = typeof body.payload === 'string' ? JSON.parse(body.payload) : body.payload;

      // Only button-style interactions are handled; everything else is a no-op ack.
      if (payload.type !== 'block_actions') {
        return reply.send({ ok: true });
      }

      // 3. Handle actions
      for (const action of payload.actions) {
        const actionId = action.action_id || '';
        const value = action.value || '';

        // Default: action name comes from action_id, incident id from value.
        let actionName = actionId;
        let incidentId = value;

        // If actionId is like "acknowledge_incident:uuid"
        if (actionId.includes(':')) {
          const parts = actionId.split(':');
          actionName = parts[0];
          incidentId = parts.slice(1).join(':'); // The rest is the UUID
        }

        if (!incidentId) continue;

        let replyMessage = '';

        if (actionName === 'acknowledge_incident' || actionName === 'ack_incident') {
          // Guarded on status = 'open' so a second click is a harmless no-op.
          await systemQuery(
            "UPDATE incidents SET status = 'acknowledged' WHERE id = $1 AND status = 'open'",
            [incidentId]
          );
          replyMessage = `Incident \`${incidentId}\` acknowledged.`;
        } else if (actionName === 'resolve_incident') {
          // Stamps resolved_at, which feeds the MTTR analytics.
          await systemQuery(
            "UPDATE incidents SET status = 'resolved', resolved_at = NOW() WHERE id = $1",
            [incidentId]
          );
          replyMessage = `Incident \`${incidentId}\` resolved.`;
        } else if (actionName === 'mark_noise' || actionName === 'suppress_incident') {
          await systemQuery(
            "UPDATE incidents SET status = 'suppressed' WHERE id = $1",
            [incidentId]
          );
          replyMessage = `Incident \`${incidentId}\` marked as noise (suppressed).`;
        } else if (actionName === 'mark_helpful') {
          // Just an analytic signal
          // NOTE(review): nothing is persisted for this action — only the reply text.
          replyMessage = `Thanks for the feedback! Marked incident \`${incidentId}\` as helpful.`;
        } else {
          continue; // Unhandled action
        }

        // Send ephemeral response back to Slack replacing the original button interaction
        if (payload.response_url && replyMessage) {
          await fetch(payload.response_url, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              replace_original: false,
              response_type: 'ephemeral',
              text: replyMessage
            })
          });
          logger.info({ actionName, incidentId }, 'Slack action handled');
        }
      }

      return reply.status(200).send();
    } catch (err) {
      // Any unexpected failure (bad JSON, DB error, fetch error) → generic 500.
      logger.error({ error: (err as Error).message }, 'Error handling Slack interaction');
      return reply.status(500).send({ error: 'Internal server error' });
    }
  });
}
|
||||
@@ -7,10 +7,14 @@ import { config } from './config/index.js';
|
||||
import { getPoolForAuth } from './data/db.js';
|
||||
import { authHook, decorateAuth, registerAuthRoutes, registerProtectedAuthRoutes } from './auth/middleware.js';
|
||||
import { registerWebhookRoutes } from './api/webhooks.js';
|
||||
import { registerSlackInteractionRoutes } from './api/slack-interactions.js';
|
||||
import { registerWebhookSecretRoutes } from './api/webhook_secrets.js';
|
||||
import { registerIncidentRoutes } from './api/incidents.js';
|
||||
import { registerAnalyticsRoutes } from './api/analytics.js';
|
||||
import { registerNotificationRoutes } from './api/notifications.js';
|
||||
import { startWebhookProcessor } from './workers/webhook-processor.js';
|
||||
import { startPagerDutyEscalator } from './notifications/pagerduty-escalation.js';
|
||||
import { startDailyNoiseReportWorker } from './workers/daily-noise-report.js';
|
||||
|
||||
const logger = pino({ name: 'dd0c-alert', level: config.LOG_LEVEL });
|
||||
|
||||
@@ -27,6 +31,7 @@ decorateAuth(app);
|
||||
app.get('/health', async () => ({ status: 'ok', service: 'dd0c-alert' }));
|
||||
app.get('/version', async () => ({ version: process.env.BUILD_SHA || 'dev', built: process.env.BUILD_TIME || 'unknown' }));
|
||||
registerWebhookRoutes(app);
|
||||
registerSlackInteractionRoutes(app);
|
||||
|
||||
// Auth routes (public - login/signup)
|
||||
registerAuthRoutes(app, config.JWT_SECRET, pool);
|
||||
@@ -36,6 +41,7 @@ app.register(async function protectedRoutes(protectedApp) {
|
||||
protectedApp.addHook('onRequest', authHook(config.JWT_SECRET, pool));
|
||||
registerProtectedAuthRoutes(protectedApp, config.JWT_SECRET, pool);
|
||||
registerIncidentRoutes(protectedApp);
|
||||
registerAnalyticsRoutes(protectedApp);
|
||||
registerNotificationRoutes(protectedApp);
|
||||
registerWebhookSecretRoutes(protectedApp);
|
||||
});
|
||||
@@ -43,6 +49,8 @@ app.register(async function protectedRoutes(protectedApp) {
|
||||
try {
|
||||
await app.listen({ port: config.PORT, host: '0.0.0.0' });
|
||||
logger.info({ port: config.PORT }, 'dd0c/alert started');
|
||||
startPagerDutyEscalator();
|
||||
startDailyNoiseReportWorker();
|
||||
startWebhookProcessor().catch((err) => logger.error(err, 'Webhook processor crashed'));
|
||||
} catch (err) {
|
||||
logger.fatal(err, 'Failed to start');
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
import pino from 'pino';
|
||||
import { getPoolForAuth } from '../data/db.js';
|
||||
|
||||
const logger = pino({ name: 'pagerduty-escalation' });
|
||||
|
||||
const PAGERDUTY_EVENTS_V2_URL = 'https://events.pagerduty.com/v2/enqueue';
|
||||
|
||||
/**
 * One escalation sweep: page PagerDuty (Events v2 API) for every critical
 * incident that has stayed 'open' for at least 15 minutes.
 *
 * Idempotency: pd_escalated_at is stamped only after a successful send and the
 * query excludes stamped rows, so an incident is paged at most once per sweep
 * chain; dedup_key (= incident id) additionally dedupes on the PagerDuty side.
 *
 * Runs on the system pool (no tenant RLS context) since it spans all tenants.
 *
 * NOTE(review): the JOIN has no channel filter — a tenant with several enabled
 * configs carrying a routingKey yields one row per config for the SAME
 * incident, so it is POSTed once per config within a single sweep (PagerDuty's
 * dedup_key collapses these into one PD incident). Confirm this is intended.
 */
export async function checkAndEscalate() {
  const pool = getPoolForAuth();

  try {
    // Find unacknowledged critical incidents older than 15 minutes.
    // Use the routing key from notification_configs.
    const query = `
      SELECT
        i.id, i.title, i.service, i.severity, i.status, i.created_at,
        n.config->>'routingKey' as routing_key
      FROM incidents i
      JOIN notification_configs n ON n.tenant_id = i.tenant_id
      WHERE i.severity = 'critical'
        AND i.status = 'open'
        AND i.created_at <= NOW() - INTERVAL '15 minutes'
        AND i.resolved_at IS NULL
        AND n.enabled = true
        AND n.config->>'routingKey' IS NOT NULL
        AND i.pd_escalated_at IS NULL
    `;

    const { rows } = await pool.query(query);

    for (const incident of rows) {
      if (!incident.routing_key) continue;

      // PagerDuty Events v2 "trigger" event; dedup_key makes retries safe.
      const payload = {
        routing_key: incident.routing_key,
        event_action: 'trigger',
        dedup_key: incident.id,
        payload: {
          summary: `[ESCALATED] ${incident.title}`,
          source: incident.service || 'dd0c-alert-intelligence',
          severity: 'critical',
          custom_details: {
            incident_id: incident.id,
            status: incident.status,
            created_at: incident.created_at
          }
        }
      };

      try {
        const res = await fetch(PAGERDUTY_EVENTS_V2_URL, {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify(payload),
        });

        if (res.ok) {
          logger.info({ incidentId: incident.id }, 'Successfully escalated to PagerDuty');
          // Stamp only on success so a failed send is retried next sweep.
          await pool.query(
            `UPDATE incidents SET pd_escalated_at = NOW() WHERE id = $1`,
            [incident.id]
          );
        } else {
          logger.warn({ incidentId: incident.id, status: res.status }, 'Failed to escalate to PagerDuty');
        }
      } catch (err) {
        // Network-level failure for this incident; keep processing the rest.
        logger.error({ error: (err as Error).message, incidentId: incident.id }, 'Error sending to PagerDuty API');
      }
    }
  } catch (error) {
    // Query/setup failure — log and let the interval try again next minute.
    logger.error({ error: (error as Error).message }, 'Failed to run PagerDuty escalation check');
  }
}
|
||||
|
||||
export function startPagerDutyEscalator() {
|
||||
logger.info('Starting PagerDuty escalation worker (checks every 1 minute)');
|
||||
setInterval(() => {
|
||||
checkAndEscalate().catch(err => logger.error(err, 'PagerDuty escalator loop error'));
|
||||
}, 60 * 1000);
|
||||
}
|
||||
113
products/03-alert-intelligence/src/workers/daily-noise-report.ts
Normal file
113
products/03-alert-intelligence/src/workers/daily-noise-report.ts
Normal file
@@ -0,0 +1,113 @@
|
||||
import pino from 'pino';
|
||||
import { getPoolForAuth, systemQuery } from '../data/db.js';
|
||||
|
||||
const logger = pino({ name: 'daily-noise-report' });
|
||||
|
||||
/**
 * Build and deliver the daily noise report for every tenant that has an
 * enabled Slack notification config with a webhook URL.
 *
 * Per tenant: aggregate the last 24h of alerts (total, correlated, suppressed),
 * skip silent tenants, list the 3 noisiest sources, render a Block Kit message
 * and POST it to the tenant's Slack webhook. Per-tenant failures are isolated —
 * one bad tenant never blocks the rest.
 *
 * Uses the system pool directly (queries filter by tenant_id explicitly).
 */
export async function generateAndSendDailyReports() {
  const pool = getPoolForAuth();

  try {
    // Get all tenants with an active Slack notification config
    const { rows: tenants } = await pool.query(`
      SELECT t.id as tenant_id, t.name as tenant_name, n.config->>'webhookUrl' as webhook_url
      FROM tenants t
      JOIN notification_configs n ON n.tenant_id = t.id
      WHERE n.channel = 'slack'
        AND n.enabled = true
        AND n.config->>'webhookUrl' IS NOT NULL
    `);

    for (const tenant of tenants) {
      try {
        // Aggregate total alerts & correlated incidents in the last 24h
        const statsQuery = await pool.query(`
          SELECT
            COUNT(a.id)::int as total_alerts,
            COUNT(DISTINCT a.incident_id)::int as correlated_incidents,
            SUM(CASE WHEN i.status = 'suppressed' THEN 1 ELSE 0 END)::int as suppressed_incidents
          FROM alerts a
          LEFT JOIN incidents i ON a.incident_id = i.id
          WHERE a.tenant_id = $1
            AND a.received_at >= NOW() - INTERVAL '24 hours'
        `, [tenant.tenant_id]);

        const stats = statsQuery.rows[0];
        const totalAlerts = stats.total_alerts || 0;
        const correlatedIncidents = stats.correlated_incidents || 0;
        // "Noise reduction" = share of raw alerts that were folded into incidents.
        const noiseRatio = totalAlerts > 0 ? ((totalAlerts - correlatedIncidents) / totalAlerts) * 100 : 0;

        if (totalAlerts === 0) continue; // Skip if no alerts

        // Top noisy sources
        const sourcesQuery = await pool.query(`
          SELECT source_provider, COUNT(id)::int as count
          FROM alerts
          WHERE tenant_id = $1
            AND received_at >= NOW() - INTERVAL '24 hours'
          GROUP BY source_provider
          ORDER BY count DESC
          LIMIT 3
        `, [tenant.tenant_id]);

        const topSources = sourcesQuery.rows.map(r => `• *${r.source_provider}*: ${r.count} alerts`).join('\n') || 'None';

        // Build Slack Block Kit
        // Layout: header, a 4-field stats section, then the top-sources section.
        const blocks = [
          {
            type: 'header',
            text: { type: 'plain_text', text: '📊 Daily Alert Noise Report', emoji: true },
          },
          {
            type: 'section',
            fields: [
              { type: 'mrkdwn', text: `*Total Alerts:*\n${totalAlerts}` },
              { type: 'mrkdwn', text: `*Correlated Incidents:*\n${correlatedIncidents}` },
              { type: 'mrkdwn', text: `*Noise Reduction:*\n${noiseRatio.toFixed(1)}%` },
              { type: 'mrkdwn', text: `*Suppressed:*\n${stats.suppressed_incidents || 0}` },
            ],
          },
          {
            type: 'section',
            text: {
              type: 'mrkdwn',
              text: `*Top Noisy Sources:*\n${topSources}`
            }
          }
        ];

        // Send to Slack
        const res = await fetch(tenant.webhook_url, {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({ blocks }),
        });

        if (!res.ok) {
          logger.warn({ tenantId: tenant.tenant_id, status: res.status }, 'Failed to send daily report to Slack');
        } else {
          logger.info({ tenantId: tenant.tenant_id }, 'Sent daily noise report to Slack');
        }
      } catch (err) {
        // Isolate per-tenant failures; continue with the next tenant.
        logger.error({ error: (err as Error).message, tenantId: tenant.tenant_id }, 'Error processing daily report for tenant');
      }
    }
  } catch (error) {
    logger.error({ error: (error as Error).message }, 'Failed to generate daily noise reports');
  }
}
|
||||
|
||||
// Cron-style worker
|
||||
export function startDailyNoiseReportWorker() {
|
||||
logger.info('Starting Daily Noise Report worker (runs every 24 hours)');
|
||||
|
||||
// Calculate time until next midnight or just run interval.
|
||||
// For simplicity, we'll run it once a day using setInterval.
|
||||
const ONE_DAY_MS = 24 * 60 * 60 * 1000;
|
||||
|
||||
// Optionally run it immediately for testing if needed
|
||||
// generateAndSendDailyReports();
|
||||
|
||||
setInterval(() => {
|
||||
generateAndSendDailyReports().catch(err => logger.error(err, 'Daily noise report loop error'));
|
||||
}, ONE_DAY_MS);
|
||||
}
|
||||
1
products/05-aws-cost-anomaly/.gitignore
vendored
1
products/05-aws-cost-anomaly/.gitignore
vendored
@@ -4,3 +4,4 @@ dist/
|
||||
*.log
|
||||
coverage/
|
||||
.DS_Store
|
||||
products/05-aws-cost-anomaly/typescript-5.9.3.tgz
|
||||
|
||||
27
products/05-aws-cost-anomaly/migrations/005_zombies.sql
Normal file
27
products/05-aws-cost-anomaly/migrations/005_zombies.sql
Normal file
@@ -0,0 +1,27 @@
|
||||
-- Zombie resource detection + composite scoring

-- Zombie resources table
-- One row per (tenant, resource, type); the UNIQUE constraint prevents
-- duplicate rows when the same idle resource is detected again.
CREATE TABLE zombie_resources (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    resource_id TEXT NOT NULL,
    resource_type TEXT NOT NULL CHECK (resource_type IN ('ec2', 'rds', 'ebs', 'eip', 'nat_gateway')),
    region TEXT NOT NULL,
    account_id TEXT NOT NULL,
    estimated_monthly_waste NUMERIC(10,2) NOT NULL DEFAULT 0,
    last_activity TIMESTAMPTZ,
    recommendation TEXT NOT NULL,
    -- Lifecycle: open → dismissed (false positive) or remediated (cleaned up).
    status TEXT NOT NULL DEFAULT 'open' CHECK (status IN ('open', 'dismissed', 'remediated')),
    detected_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    UNIQUE(tenant_id, resource_id, resource_type)
);
-- Supports the common listing query: newest zombies per tenant filtered by status.
CREATE INDEX idx_zombie_resources_tenant ON zombie_resources(tenant_id, status, detected_at DESC);

-- RLS
-- Tenant isolation via the app.tenant_id setting established per connection.
ALTER TABLE zombie_resources ENABLE ROW LEVEL SECURITY;
CREATE POLICY tenant_iso_zombies ON zombie_resources
    USING (tenant_id::text = current_setting('app.tenant_id', true));

-- Composite scoring columns on anomalies
-- composite_score: weighted 0-100 score; score_breakdown: per-signal JSON detail.
ALTER TABLE anomalies ADD COLUMN IF NOT EXISTS composite_score NUMERIC(5,2);
ALTER TABLE anomalies ADD COLUMN IF NOT EXISTS score_breakdown JSONB;
|
||||
14
products/05-aws-cost-anomaly/package-lock.json
generated
14
products/05-aws-cost-anomaly/package-lock.json
generated
@@ -31,7 +31,7 @@
|
||||
"eslint": "^9.5.0",
|
||||
"fast-check": "^3.19.0",
|
||||
"tsx": "^4.15.0",
|
||||
"typescript": "^5.5.0",
|
||||
"typescript": "^5.9.3",
|
||||
"vitest": "^1.6.0"
|
||||
}
|
||||
},
|
||||
@@ -3813,9 +3813,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/flatted": {
|
||||
"version": "3.3.3",
|
||||
"resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.3.tgz",
|
||||
"integrity": "sha512-GX+ysw4PBCz0PzosHDepZGANEuFCMLrnRTiEy9McGjmkCQYwRq4A/X786G/fjM/+OjsWSU1ZrY5qyARZmO/uwg==",
|
||||
"version": "3.3.4",
|
||||
"resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.4.tgz",
|
||||
"integrity": "sha512-3+mMldrTAPdta5kjX2G2J7iX4zxtnwpdA8Tr2ZSjkyPSanvbZAcy6flmtnXbEybHrDcU9641lxrMfFuUxVz9vA==",
|
||||
"dev": true,
|
||||
"license": "ISC"
|
||||
},
|
||||
@@ -4927,9 +4927,9 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/postcss": {
|
||||
"version": "8.5.6",
|
||||
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz",
|
||||
"integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==",
|
||||
"version": "8.5.8",
|
||||
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.8.tgz",
|
||||
"integrity": "sha512-OW/rX8O/jXnm82Ey1k44pObPtdblfiuWnrd8X7GJ7emImCOstunGbXUpp7HdBrFQX6rJzn3sPT397Wp5aCwCHg==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
{
|
||||
|
||||
@@ -34,7 +34,7 @@
|
||||
"eslint": "^9.5.0",
|
||||
"fast-check": "^3.19.0",
|
||||
"tsx": "^4.15.0",
|
||||
"typescript": "^5.5.0",
|
||||
"typescript": "^5.9.3",
|
||||
"vitest": "^1.6.0"
|
||||
}
|
||||
}
|
||||
|
||||
180
products/05-aws-cost-anomaly/src/api/slack-interactions.ts
Normal file
180
products/05-aws-cost-anomaly/src/api/slack-interactions.ts
Normal file
@@ -0,0 +1,180 @@
|
||||
import type { FastifyInstance } from 'fastify';
|
||||
import crypto from 'node:crypto';
|
||||
import pino from 'pino';
|
||||
import { systemQuery } from '../data/db.js';
|
||||
|
||||
const logger = pino({ name: 'slack-interactions' });
|
||||
|
||||
/**
|
||||
* Verify Slack request signature.
|
||||
* See: https://api.slack.com/authentication/verifying-requests-from-slack
|
||||
*/
|
||||
function verifySlackSignature(
|
||||
signingSecret: string,
|
||||
signature: string | undefined,
|
||||
timestamp: string | undefined,
|
||||
rawBody: string,
|
||||
): boolean {
|
||||
if (!signature || !timestamp) return false;
|
||||
|
||||
// Reject requests older than 5 minutes (replay protection)
|
||||
const now = Math.floor(Date.now() / 1000);
|
||||
if (Math.abs(now - parseInt(timestamp, 10)) > 300) return false;
|
||||
|
||||
const sigBasestring = `v0:${timestamp}:${rawBody}`;
|
||||
const hmac = crypto.createHmac('sha256', signingSecret).update(sigBasestring).digest('hex');
|
||||
const expected = `v0=${hmac}`;
|
||||
|
||||
return crypto.timingSafeEqual(Buffer.from(signature), Buffer.from(expected));
|
||||
}
|
||||
|
||||
// A single interactive element (button click) inside a block_actions payload.
interface SlackAction {
  action_id: string; // convention here: "<action_name>:<anomaly_uuid>"
  value?: string;
}

// The subset of Slack's interaction payload this handler actually reads.
interface SlackInteractionPayload {
  type: string; // only 'block_actions' is processed
  user: { id: string; username: string };
  actions: SlackAction[];
  trigger_id: string;
  response_url: string; // one-shot webhook for ephemeral follow-up messages
}
|
||||
|
||||
/**
|
||||
* Send an ephemeral response back to Slack via response_url.
|
||||
*/
|
||||
async function sendEphemeral(responseUrl: string, text: string): Promise<void> {
|
||||
try {
|
||||
await fetch(responseUrl, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ response_type: 'ephemeral', replace_original: false, text }),
|
||||
});
|
||||
} catch (err) {
|
||||
logger.error({ error: (err as Error).message }, 'Failed to send ephemeral response');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse anomaly ID from action_id format: "action_name:anomaly_uuid"
|
||||
*/
|
||||
function parseAnomalyId(actionId: string): string | null {
|
||||
const parts = actionId.split(':');
|
||||
return parts.length >= 2 ? parts.slice(1).join(':') : null;
|
||||
}
|
||||
|
||||
/**
 * Slack interactive-components endpoint for cost-anomaly messages.
 *
 * Installs a raw-string content-type parser (Slack signs the raw form body,
 * so it must not be pre-parsed), verifies the v0 signing signature, then
 * dispatches each button action: mark-expected / snooze / create-ticket /
 * view. Status changes go through systemQuery; user feedback is sent as an
 * ephemeral message via response_url.
 */
export function registerSlackInteractionRoutes(app: FastifyInstance) {
  // Slack sends interactive payloads as application/x-www-form-urlencoded with a `payload` field
  app.addContentTypeParser(
    'application/x-www-form-urlencoded',
    { parseAs: 'string' },
    (_req, body, done) => {
      done(null, body);
    },
  );

  app.post('/api/v1/slack/interactions', async (req, reply) => {
    // Unlike the dev-friendly variants elsewhere, this endpoint hard-fails
    // when no signing secret is configured.
    const signingSecret = process.env.SLACK_SIGNING_SECRET;
    if (!signingSecret) {
      logger.error('SLACK_SIGNING_SECRET not configured');
      return reply.status(500).send({ error: 'Slack signing secret not configured' });
    }

    // Verify signature
    const rawBody = req.body as string;
    const signature = req.headers['x-slack-signature'] as string | undefined;
    const timestamp = req.headers['x-slack-request-timestamp'] as string | undefined;

    if (!verifySlackSignature(signingSecret, signature, timestamp, rawBody)) {
      logger.warn('Invalid Slack signature');
      return reply.status(401).send({ error: 'Invalid signature' });
    }

    // Parse the URL-encoded payload
    const params = new URLSearchParams(rawBody);
    const payloadStr = params.get('payload');
    if (!payloadStr) {
      return reply.status(400).send({ error: 'Missing payload' });
    }

    let payload: SlackInteractionPayload;
    try {
      payload = JSON.parse(payloadStr);
    } catch {
      return reply.status(400).send({ error: 'Invalid payload JSON' });
    }

    // Only button-style interactions are handled; everything else is acked.
    if (payload.type !== 'block_actions') {
      logger.info({ type: payload.type }, 'Ignoring non-block_actions interaction');
      return reply.status(200).send();
    }

    for (const action of payload.actions) {
      const anomalyId = parseAnomalyId(action.action_id);
      if (!anomalyId) {
        logger.warn({ actionId: action.action_id }, 'Could not parse anomaly ID from action');
        continue;
      }

      // action_id format is "<action_name>:<anomaly_uuid>".
      const actionName = action.action_id.split(':')[0];

      switch (actionName) {
        case 'mark_expected': {
          // Acknowledge / mark as expected
          // Status guard makes repeated clicks idempotent.
          await systemQuery(
            `UPDATE anomalies SET status = 'expected' WHERE id = $1 AND status IN ('open', 'acknowledged')`,
            [anomalyId],
          );
          logger.info({ anomalyId, user: payload.user.username }, 'Anomaly marked as expected');
          await sendEphemeral(
            payload.response_url,
            `✅ Anomaly \`${anomalyId.slice(0, 8)}\` marked as expected by @${payload.user.username}`,
          );
          break;
        }

        case 'snooze_anomaly': {
          // Snooze for 24 hours
          await systemQuery(
            `UPDATE anomalies SET status = 'snoozed', snoozed_until = now() + interval '24 hours'
             WHERE id = $1 AND status IN ('open', 'acknowledged')`,
            [anomalyId],
          );
          logger.info({ anomalyId, user: payload.user.username }, 'Anomaly snoozed 24h');
          await sendEphemeral(
            payload.response_url,
            `😴 Anomaly \`${anomalyId.slice(0, 8)}\` snoozed for 24 hours by @${payload.user.username}`,
          );
          break;
        }

        case 'create_ticket': {
          // Log intent — doesn't actually create a ticket
          logger.info(
            { anomalyId, user: payload.user.username, trigger: payload.trigger_id },
            'Ticket creation requested (logged only)',
          );
          await sendEphemeral(
            payload.response_url,
            `🎫 Ticket creation logged for anomaly \`${anomalyId.slice(0, 8)}\` by @${payload.user.username}. Integration pending.`,
          );
          break;
        }

        case 'view_anomaly': {
          // View is just a URL button — no server-side action needed
          break;
        }

        default: {
          logger.warn({ actionName, actionId: action.action_id }, 'Unknown Slack action');
          await sendEphemeral(payload.response_url, `⚠️ Unknown action: ${actionName}`);
        }
      }
    }

    // Slack expects a 200 within 3 seconds
    return reply.status(200).send();
  });
}
|
||||
148
products/05-aws-cost-anomaly/src/composite-scorer.ts
Normal file
148
products/05-aws-cost-anomaly/src/composite-scorer.ts
Normal file
@@ -0,0 +1,148 @@
|
||||
import pino from 'pino';
|
||||
import { WelfordBaseline, scoreAnomaly, type CostEvent } from './detection/scorer.js';
|
||||
|
||||
const logger = pino({ name: 'composite-scorer' });
|
||||
|
||||
// Per-signal anomaly scores (each on a 0-100 scale) plus the weighted composite.
export interface ScoreBreakdown {
  zScore: number;
  rateOfChange: number;
  historicalPattern: number;
  resourceNovelty: number;
  composite: number; // weighted blend of the four signals (see WEIGHTS)
}

// Outcome of composite scoring for a single cost observation.
export interface CompositeResult {
  score: number; // headline 0-100 score
  breakdown: ScoreBreakdown;
  // presumably true when the <7-day fixed-threshold path was used
  // (see COLD_START_THRESHOLD_DAYS) — scoring entry point not visible here.
  isColdStart: boolean;
}
|
||||
|
||||
/** Weights for each signal (must sum to 1.0) */
// 0.40 + 0.25 + 0.20 + 0.15 = 1.00 — keep in sync if a signal is added.
const WEIGHTS = {
  zScore: 0.40,
  rateOfChange: 0.25,
  historicalPattern: 0.20,
  resourceNovelty: 0.15,
};

/** Cold-start fixed thresholds when <7 days of data */
const COLD_START_THRESHOLD_DAYS = 7;
const COLD_START_COST_MULTIPLIER = 2.0; // 2x average = anomaly
|
||||
|
||||
/**
|
||||
* Compute rate-of-change score (0-100).
|
||||
* Compares current cost to the previous cost to measure acceleration.
|
||||
*/
|
||||
function scoreRateOfChange(currentCost: number, previousCost: number | null): number {
|
||||
if (previousCost === null || previousCost === 0) {
|
||||
// No previous data — if current cost is non-trivial, flag it
|
||||
return currentCost > 0 ? 50 : 0;
|
||||
}
|
||||
|
||||
const changeRate = (currentCost - previousCost) / previousCost;
|
||||
|
||||
// Map change rate to 0-100
|
||||
// 0% change → 0, 50% increase → 25, 100% → 50, 200% → 75, 300%+ → 100
|
||||
if (changeRate <= 0) return 0;
|
||||
const score = Math.min(100, changeRate * 33.33);
|
||||
return Math.round(score * 100) / 100;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute historical pattern score (0-100).
|
||||
* Lower score if this looks like a recurring spike (e.g., monthly batch job).
|
||||
* Higher score if the spike is unprecedented.
|
||||
*
|
||||
* @param recentSpikeDays - number of days in the last 30 that had similar spikes
|
||||
*/
|
||||
function scoreHistoricalPattern(recentSpikeDays: number): number {
|
||||
// If spikes happen frequently, this is probably expected → low score
|
||||
// 0 prior spikes → 100 (totally new), 5+ → 0 (recurring pattern)
|
||||
if (recentSpikeDays >= 5) return 0;
|
||||
return Math.round((1 - recentSpikeDays / 5) * 100);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute resource novelty score (0-100).
|
||||
* New resource types that have never been seen get a higher score.
|
||||
*
|
||||
* @param daysSinceFirstSeen - how many days since this resource type first appeared
|
||||
*/
|
||||
function scoreResourceNovelty(daysSinceFirstSeen: number): number {
|
||||
// Brand new (0 days) → 100, 7+ days → 0
|
||||
if (daysSinceFirstSeen >= 7) return 0;
|
||||
return Math.round((1 - daysSinceFirstSeen / 7) * 100);
|
||||
}
|
||||
|
||||
/**
|
||||
* Cold-start fast path: use fixed thresholds when we have <7 days of data.
|
||||
*/
|
||||
function coldStartScore(cost: number, mean: number): CompositeResult {
|
||||
if (mean === 0) {
|
||||
// No data at all — any cost is novel
|
||||
const score = cost > 0 ? 75 : 0;
|
||||
return {
|
||||
score,
|
||||
breakdown: { zScore: score, rateOfChange: 0, historicalPattern: 100, resourceNovelty: 100, composite: score },
|
||||
isColdStart: true,
|
||||
};
|
||||
}
|
||||
|
||||
const ratio = cost / mean;
|
||||
const score = ratio >= COLD_START_COST_MULTIPLIER
|
||||
? Math.min(100, Math.round((ratio - 1) * 50))
|
||||
: 0;
|
||||
|
||||
return {
|
||||
score,
|
||||
breakdown: { zScore: score, rateOfChange: 0, historicalPattern: 0, resourceNovelty: 0, composite: score },
|
||||
isColdStart: true,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute a composite anomaly score (0-100) combining multiple signals.
|
||||
*/
|
||||
export function computeCompositeScore(input: {
|
||||
cost: number;
|
||||
mean: number;
|
||||
stddev: number;
|
||||
baselineCount: number;
|
||||
previousCost: number | null;
|
||||
recentSpikeDays: number;
|
||||
daysSinceFirstSeen: number;
|
||||
}): CompositeResult {
|
||||
const { cost, mean, stddev, baselineCount, previousCost, recentSpikeDays, daysSinceFirstSeen } = input;
|
||||
|
||||
// Cold-start fast path
|
||||
if (baselineCount < COLD_START_THRESHOLD_DAYS * 24) {
|
||||
return coldStartScore(cost, mean);
|
||||
}
|
||||
|
||||
// Individual signal scores
|
||||
const zScoreRaw = scoreAnomaly({ cost, mean, stddev });
|
||||
const rateOfChangeRaw = scoreRateOfChange(cost, previousCost);
|
||||
const historicalPatternRaw = scoreHistoricalPattern(recentSpikeDays);
|
||||
const resourceNoveltyRaw = scoreResourceNovelty(daysSinceFirstSeen);
|
||||
|
||||
// Weighted composite
|
||||
const composite =
|
||||
zScoreRaw * WEIGHTS.zScore +
|
||||
rateOfChangeRaw * WEIGHTS.rateOfChange +
|
||||
historicalPatternRaw * WEIGHTS.historicalPattern +
|
||||
resourceNoveltyRaw * WEIGHTS.resourceNovelty;
|
||||
|
||||
const score = Math.round(Math.min(100, Math.max(0, composite)) * 100) / 100;
|
||||
|
||||
return {
|
||||
score,
|
||||
breakdown: {
|
||||
zScore: zScoreRaw,
|
||||
rateOfChange: rateOfChangeRaw,
|
||||
historicalPattern: historicalPatternRaw,
|
||||
resourceNovelty: resourceNoveltyRaw,
|
||||
composite: score,
|
||||
},
|
||||
isColdStart: false,
|
||||
};
|
||||
}
|
||||
@@ -9,6 +9,8 @@ import { registerAnomalyRoutes } from './api/anomalies.js';
|
||||
import { registerBaselineRoutes } from './api/baselines.js';
|
||||
import { registerGovernanceRoutes } from './api/governance.js';
|
||||
import { registerIngestionRoutes } from './api/ingestion.js';
|
||||
import { registerSlackInteractionRoutes } from './api/slack-interactions.js';
|
||||
import { startZombieHunter } from './workers/zombie-hunter.js';
|
||||
|
||||
const logger = pino({ name: 'dd0c-cost', level: config.LOG_LEVEL });
|
||||
|
||||
@@ -27,6 +29,11 @@ app.get('/version', async () => ({ version: process.env.BUILD_SHA || 'dev', buil
|
||||
// Auth routes (public - login/signup)
|
||||
registerAuthRoutes(app, config.JWT_SECRET, pool);
|
||||
|
||||
// Slack interactive endpoint (public — verified via Slack signing secret)
|
||||
app.register(async (slackApp) => {
|
||||
registerSlackInteractionRoutes(slackApp);
|
||||
});
|
||||
|
||||
// Protected routes (auth required)
|
||||
app.register(async function protectedRoutes(protectedApp) {
|
||||
protectedApp.addHook('onRequest', authHook(config.JWT_SECRET, pool));
|
||||
@@ -40,6 +47,10 @@ app.register(async function protectedRoutes(protectedApp) {
|
||||
try {
|
||||
await app.listen({ port: config.PORT, host: '0.0.0.0' });
|
||||
logger.info({ port: config.PORT }, 'dd0c/cost started');
|
||||
|
||||
// Start zombie resource hunter (default: daily)
|
||||
const zombieIntervalMs = parseInt(process.env.ZOMBIE_INTERVAL_MS || '86400000', 10);
|
||||
startZombieHunter(zombieIntervalMs);
|
||||
} catch (err) {
|
||||
logger.fatal(err, 'Failed to start');
|
||||
process.exit(1);
|
||||
|
||||
172
products/05-aws-cost-anomaly/src/workers/zombie-hunter.ts
Normal file
172
products/05-aws-cost-anomaly/src/workers/zombie-hunter.ts
Normal file
@@ -0,0 +1,172 @@
|
||||
import pino from 'pino';
|
||||
import { withTenant, systemQuery } from '../data/db.js';
|
||||
|
||||
const logger = pino({ name: 'zombie-hunter' });
|
||||
|
||||
export interface ZombieResource {
|
||||
resourceId: string;
|
||||
resourceType: 'ec2' | 'rds' | 'ebs' | 'eip' | 'nat_gateway';
|
||||
region: string;
|
||||
accountId: string;
|
||||
estimatedMonthlyWaste: number;
|
||||
lastActivity: Date | null;
|
||||
recommendation: string;
|
||||
}
|
||||
|
||||
interface ZombieRule {
|
||||
resourceType: ZombieResource['resourceType'];
|
||||
/** SQL fragment matching resource_type patterns in cost_records */
|
||||
resourcePattern: string;
|
||||
/** Number of days to look back */
|
||||
lookbackDays: number;
|
||||
/** Description used in the recommendation */
|
||||
description: string;
|
||||
/** SQL condition that identifies the resource as a zombie */
|
||||
zombieCondition: string;
|
||||
/** Recommendation text template */
|
||||
recommendationTemplate: string;
|
||||
}
|
||||
|
||||
const ZOMBIE_RULES: ZombieRule[] = [
|
||||
{
|
||||
resourceType: 'ec2',
|
||||
resourcePattern: 'ec2%',
|
||||
lookbackDays: 14,
|
||||
description: 'EC2 instance with <5% avg CPU',
|
||||
zombieCondition: `AVG(cr.hourly_cost) < 0.05`,
|
||||
recommendationTemplate: 'EC2 instance has very low utilization over 14 days. Consider stopping or downsizing.',
|
||||
},
|
||||
{
|
||||
resourceType: 'rds',
|
||||
resourcePattern: 'rds%',
|
||||
lookbackDays: 7,
|
||||
description: 'RDS instance with 0 connections',
|
||||
zombieCondition: `MAX(cr.hourly_cost) > 0 AND COUNT(*) FILTER (WHERE cr.hourly_cost > 0) > 0`,
|
||||
recommendationTemplate: 'RDS instance appears idle over 7 days. Consider creating a snapshot and deleting.',
|
||||
},
|
||||
{
|
||||
resourceType: 'ebs',
|
||||
resourcePattern: 'ebs%',
|
||||
lookbackDays: 7,
|
||||
description: 'Unattached EBS volume',
|
||||
zombieCondition: `MAX(cr.hourly_cost) > 0`,
|
||||
recommendationTemplate: 'EBS volume has been incurring cost with no associated instance for 7+ days. Consider snapshotting and deleting.',
|
||||
},
|
||||
{
|
||||
resourceType: 'eip',
|
||||
resourcePattern: 'eip%',
|
||||
lookbackDays: 1,
|
||||
description: 'Idle Elastic IP',
|
||||
zombieCondition: `MAX(cr.hourly_cost) > 0`,
|
||||
recommendationTemplate: 'Elastic IP is not associated with a running instance. Release to avoid charges.',
|
||||
},
|
||||
{
|
||||
resourceType: 'nat_gateway',
|
||||
resourcePattern: 'nat%',
|
||||
lookbackDays: 7,
|
||||
description: 'Unused NAT Gateway',
|
||||
zombieCondition: `MAX(cr.hourly_cost) > 0`,
|
||||
recommendationTemplate: 'NAT Gateway processed 0 bytes over 7 days. Consider removing if unused.',
|
||||
},
|
||||
];
|
||||
|
||||
/**
|
||||
* Scan cost_records for zombie resources across all tenants.
|
||||
* Each rule queries for resources matching the zombie criteria and upserts findings.
|
||||
*/
|
||||
export async function runZombieHunt(): Promise<number> {
|
||||
logger.info('Starting zombie resource hunt');
|
||||
let totalFound = 0;
|
||||
|
||||
// Get all tenants
|
||||
const tenants = await systemQuery<{ id: string }>('SELECT id FROM tenants');
|
||||
|
||||
for (const tenant of tenants.rows) {
|
||||
const tenantId = tenant.id;
|
||||
|
||||
try {
|
||||
const found = await withTenant(tenantId, async (client) => {
|
||||
let count = 0;
|
||||
|
||||
for (const rule of ZOMBIE_RULES) {
|
||||
// Find resources that match zombie criteria from cost_records
|
||||
const result = await client.query(
|
||||
`SELECT
|
||||
cr.account_id,
|
||||
cr.resource_type,
|
||||
cr.region,
|
||||
SUM(cr.hourly_cost) * 730 AS estimated_monthly_waste,
|
||||
MAX(cr.detected_at) AS last_activity
|
||||
FROM anomalies cr
|
||||
WHERE cr.resource_type ILIKE $1
|
||||
AND cr.detected_at > now() - ($2 || ' days')::interval
|
||||
GROUP BY cr.account_id, cr.resource_type, cr.region
|
||||
HAVING ${rule.zombieCondition}`,
|
||||
[rule.resourcePattern, rule.lookbackDays],
|
||||
);
|
||||
|
||||
for (const row of result.rows) {
|
||||
const resourceId = `${row.resource_type}:${row.account_id}:${row.region}`;
|
||||
|
||||
await client.query(
|
||||
`INSERT INTO zombie_resources
|
||||
(tenant_id, resource_id, resource_type, region, account_id,
|
||||
estimated_monthly_waste, last_activity, recommendation)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
ON CONFLICT (tenant_id, resource_id, resource_type)
|
||||
DO UPDATE SET
|
||||
estimated_monthly_waste = EXCLUDED.estimated_monthly_waste,
|
||||
last_activity = EXCLUDED.last_activity,
|
||||
detected_at = now()`,
|
||||
[
|
||||
tenantId,
|
||||
resourceId,
|
||||
rule.resourceType,
|
||||
row.region,
|
||||
row.account_id,
|
||||
parseFloat(row.estimated_monthly_waste) || 0,
|
||||
row.last_activity,
|
||||
rule.recommendationTemplate,
|
||||
],
|
||||
);
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
return count;
|
||||
});
|
||||
|
||||
totalFound += found;
|
||||
if (found > 0) {
|
||||
logger.info({ tenantId, found }, 'Zombie resources detected');
|
||||
}
|
||||
} catch (err) {
|
||||
logger.error({ tenantId, error: (err as Error).message }, 'Zombie hunt failed for tenant');
|
||||
}
|
||||
}
|
||||
|
||||
logger.info({ totalFound }, 'Zombie hunt complete');
|
||||
return totalFound;
|
||||
}
|
||||
|
||||
/**
|
||||
* Start the zombie hunter on a recurring interval.
|
||||
* Default: every 24 hours (86400000 ms).
|
||||
*/
|
||||
export function startZombieHunter(intervalMs = 86_400_000): NodeJS.Timeout {
|
||||
logger.info({ intervalMs }, 'Scheduling zombie hunter');
|
||||
|
||||
// Run once on startup (delayed 30s to let the server settle)
|
||||
setTimeout(() => {
|
||||
runZombieHunt().catch((err) =>
|
||||
logger.error({ error: (err as Error).message }, 'Initial zombie hunt failed'),
|
||||
);
|
||||
}, 30_000);
|
||||
|
||||
// Then run on interval
|
||||
return setInterval(() => {
|
||||
runZombieHunt().catch((err) =>
|
||||
logger.error({ error: (err as Error).message }, 'Scheduled zombie hunt failed'),
|
||||
);
|
||||
}, intervalMs);
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
-- 005_classifier_audit.sql
-- Runbook step storage, risk-classification metadata, and audit chaining.

-- Per-runbook ordered steps. risk_level is defined inline here; the ALTER
-- further down covers databases where the table already existed.
CREATE TABLE IF NOT EXISTS runbook_steps (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    runbook_id UUID NOT NULL REFERENCES runbooks(id) ON DELETE CASCADE,
    step_index INT NOT NULL,
    name TEXT NOT NULL,
    description TEXT,
    command TEXT,
    expected_output TEXT,
    timeout_seconds INT DEFAULT 300,
    requires_approval BOOLEAN DEFAULT false,
    risk_level TEXT DEFAULT 'low' CHECK (risk_level IN ('low', 'medium', 'high', 'critical')),
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    UNIQUE(runbook_id, step_index)
);

-- Tenant isolation via row-level security.
ALTER TABLE runbook_steps ENABLE ROW LEVEL SECURITY;

-- Create the RLS policy only if a previous run has not already done so
-- (CREATE POLICY has no IF NOT EXISTS, hence the DO block).
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_policies WHERE tablename = 'runbook_steps' AND policyname = 'tenant_iso_runbook_steps'
    ) THEN
        CREATE POLICY tenant_iso_runbook_steps ON runbook_steps
            FOR ALL
            USING (tenant_id::text = current_setting('app.tenant_id', true));
    END IF;
END $$;

-- Idempotent upgrade path for pre-existing runbook_steps tables.
ALTER TABLE runbook_steps ADD COLUMN IF NOT EXISTS risk_level TEXT DEFAULT 'low' CHECK (risk_level IN ('low', 'medium', 'high', 'critical'));

-- Hash-chain pointer for tamper-evident audit entries.
ALTER TABLE audit_entries ADD COLUMN IF NOT EXISTS prev_hash TEXT;

-- Trust level gating which execution environments a runbook may target.
ALTER TABLE runbooks ADD COLUMN IF NOT EXISTS trust_level TEXT DEFAULT 'standard' CHECK (trust_level IN ('sandbox', 'restricted', 'standard', 'elevated'));

-- Original source format of the imported runbook document.
ALTER TABLE runbooks ADD COLUMN IF NOT EXISTS source_format TEXT DEFAULT 'yaml' CHECK (source_format IN ('yaml', 'markdown', 'confluence'));
|
||||
@@ -16,6 +16,7 @@
|
||||
"@slack/web-api": "^7.1.0",
|
||||
"fastify": "^4.28.0",
|
||||
"ioredis": "^5.4.0",
|
||||
"js-yaml": "^4.1.1",
|
||||
"jsonwebtoken": "^9.0.2",
|
||||
"pg": "^8.12.0",
|
||||
"pino": "^9.1.0",
|
||||
@@ -23,6 +24,7 @@
|
||||
"zod": "^3.23.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/js-yaml": "^4.0.9",
|
||||
"@types/jsonwebtoken": "^9.0.6",
|
||||
"@types/node": "^20.14.0",
|
||||
"@types/pg": "^8.11.0",
|
||||
@@ -1603,6 +1605,13 @@
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/js-yaml": {
|
||||
"version": "4.0.9",
|
||||
"resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz",
|
||||
"integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/json-schema": {
|
||||
"version": "7.0.15",
|
||||
"resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz",
|
||||
@@ -1982,7 +1991,6 @@
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
|
||||
"integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==",
|
||||
"dev": true,
|
||||
"license": "Python-2.0"
|
||||
},
|
||||
"node_modules/array-buffer-byte-length": {
|
||||
@@ -4340,7 +4348,6 @@
|
||||
"version": "4.1.1",
|
||||
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz",
|
||||
"integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"argparse": "^2.0.1"
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
"@slack/web-api": "^7.1.0",
|
||||
"fastify": "^4.28.0",
|
||||
"ioredis": "^5.4.0",
|
||||
"js-yaml": "^4.1.1",
|
||||
"jsonwebtoken": "^9.0.2",
|
||||
"pg": "^8.12.0",
|
||||
"pino": "^9.1.0",
|
||||
@@ -26,6 +27,7 @@
|
||||
"zod": "^3.23.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/js-yaml": "^4.0.9",
|
||||
"@types/jsonwebtoken": "^9.0.6",
|
||||
"@types/node": "^20.14.0",
|
||||
"@types/pg": "^8.11.0",
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
import type { RunbookStep } from '../parsers/index.js';
|
||||
import { classifyStep } from './safety-scanner.js';
|
||||
|
||||
export function classifyRunbook(steps: RunbookStep[]): RunbookStep[] {
|
||||
return steps.map(classifyStep);
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
import type { RunbookStep } from '../parsers/index.js';
|
||||
|
||||
export function classifyStep(step: RunbookStep): RunbookStep {
|
||||
if (!step.command) {
|
||||
return { ...step, risk_level: 'low', requires_approval: false };
|
||||
}
|
||||
|
||||
const cmd = step.command.toLowerCase();
|
||||
|
||||
// Critical
|
||||
if (
|
||||
cmd.includes('rm -rf') ||
|
||||
cmd.includes('drop table') ||
|
||||
cmd.includes('delete from') ||
|
||||
cmd.includes('shutdown') ||
|
||||
cmd.includes('reboot') ||
|
||||
cmd.includes('kill -9') ||
|
||||
cmd.includes('iptables -f')
|
||||
) {
|
||||
return { ...step, risk_level: 'critical', requires_approval: true };
|
||||
}
|
||||
|
||||
// High (Privilege escalation & Network)
|
||||
if (
|
||||
cmd.includes('sudo') ||
|
||||
cmd.includes('chmod 777') ||
|
||||
cmd.includes('chown root') ||
|
||||
cmd.includes('iptables') ||
|
||||
cmd.includes('route add') ||
|
||||
cmd.includes('route del') ||
|
||||
cmd.includes('/etc/resolv.conf')
|
||||
) {
|
||||
return { ...step, risk_level: 'high', requires_approval: true };
|
||||
}
|
||||
|
||||
// Medium (Modifying config, restarting services)
|
||||
if (
|
||||
cmd.includes('systemctl restart') ||
|
||||
cmd.includes('service restart') ||
|
||||
cmd.includes('sed -i') ||
|
||||
cmd.includes('mv ') ||
|
||||
cmd.includes('cp ')
|
||||
) {
|
||||
return { ...step, risk_level: 'medium', requires_approval: true };
|
||||
}
|
||||
|
||||
// Default to low
|
||||
return { ...step, risk_level: 'low', requires_approval: step.requires_approval || false };
|
||||
}
|
||||
@@ -0,0 +1,80 @@
|
||||
import type { RunbookStep } from './index.js';
|
||||
|
||||
export function parseConfluenceRunbook(html: string): RunbookStep[] {
|
||||
const steps: RunbookStep[] = [];
|
||||
|
||||
// Try table parsing first
|
||||
// Very simplistic HTML table extraction for Node without DOMparser
|
||||
// We look for <tr> with <td> elements.
|
||||
const rowRegex = /<tr[^>]*>(.*?)<\/tr>/gis;
|
||||
const colRegex = /<td[^>]*>(.*?)<\/td>/gis;
|
||||
|
||||
let match;
|
||||
let order = 1;
|
||||
while ((match = rowRegex.exec(html)) !== null) {
|
||||
const rowHtml = match[1];
|
||||
const cols: string[] = [];
|
||||
let colMatch;
|
||||
|
||||
// reset regex index
|
||||
const colRegexClone = new RegExp(colRegex);
|
||||
while ((colMatch = colRegexClone.exec(rowHtml)) !== null) {
|
||||
// strip inner HTML tags
|
||||
cols.push(colMatch[1].replace(/<[^>]*>/g, '').trim());
|
||||
}
|
||||
|
||||
if (cols.length >= 2) {
|
||||
// Assume Column 1: Step Name/Description, Column 2: Action/Command, Column 3: Expected
|
||||
const nameDesc = cols[0];
|
||||
const command = cols[1];
|
||||
const expected = cols[2] || '';
|
||||
|
||||
// Skip headers
|
||||
if (nameDesc.toLowerCase().includes('step') && command.toLowerCase().includes('action')) {
|
||||
continue;
|
||||
}
|
||||
if (!command) continue;
|
||||
|
||||
steps.push({
|
||||
order: order++,
|
||||
name: nameDesc.split('\n')[0].substring(0, 50) || `Step ${order}`,
|
||||
description: nameDesc,
|
||||
command: command,
|
||||
expected_output: expected,
|
||||
timeout_seconds: 300,
|
||||
requires_approval: false,
|
||||
risk_level: 'low'
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if (steps.length > 0) return steps;
|
||||
|
||||
// Fallback: Numbered procedure lists
|
||||
// Search for <ol> ... </ol> and extract <li>
|
||||
const olRegex = /<ol[^>]*>(.*?)<\/ol>/gis;
|
||||
const liRegex = /<li[^>]*>(.*?)<\/li>/gis;
|
||||
|
||||
let olMatch;
|
||||
while ((olMatch = olRegex.exec(html)) !== null) {
|
||||
let liMatch;
|
||||
const liRegexClone = new RegExp(liRegex);
|
||||
while ((liMatch = liRegexClone.exec(olMatch[1])) !== null) {
|
||||
const text = liMatch[1].replace(/<[^>]*>/g, '').trim();
|
||||
|
||||
// Attempt to extract command, e.g. from <code> tags if we kept them, but we stripped them.
|
||||
// We'll just put the text as description and name.
|
||||
steps.push({
|
||||
order: order++,
|
||||
name: text.substring(0, 50) + '...',
|
||||
description: text,
|
||||
command: '', // Cannot easily reliably extract command from plain text without markers
|
||||
timeout_seconds: 300,
|
||||
requires_approval: false,
|
||||
risk_level: 'low'
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return steps;
|
||||
}
|
||||
37
products/06-runbook-automation/saas/src/parsers/index.ts
Normal file
37
products/06-runbook-automation/saas/src/parsers/index.ts
Normal file
@@ -0,0 +1,37 @@
|
||||
export interface RunbookStep {
|
||||
order: number;
|
||||
name: string;
|
||||
description: string;
|
||||
command?: string;
|
||||
expected_output?: string;
|
||||
timeout_seconds: number;
|
||||
requires_approval: boolean;
|
||||
risk_level: 'low' | 'medium' | 'high' | 'critical';
|
||||
}
|
||||
|
||||
import { parseYamlRunbook } from './yaml-parser.js';
|
||||
import { parseMarkdownRunbook } from './markdown-parser.js';
|
||||
import { parseConfluenceRunbook } from './confluence-parser.js';
|
||||
|
||||
export function parseRunbook(content: string, format: 'yaml' | 'markdown' | 'confluence'): RunbookStep[] {
|
||||
switch (format) {
|
||||
case 'yaml':
|
||||
return parseYamlRunbook(content);
|
||||
case 'markdown':
|
||||
return parseMarkdownRunbook(content);
|
||||
case 'confluence':
|
||||
return parseConfluenceRunbook(content);
|
||||
default:
|
||||
throw new Error(`Unsupported runbook format: ${format}`);
|
||||
}
|
||||
}
|
||||
|
||||
export function detectFormat(content: string): 'yaml' | 'markdown' | 'confluence' {
|
||||
if (content.includes('<!DOCTYPE html>') || content.includes('<table class="confluenceTable">') || content.includes('<div id="main-content"')) {
|
||||
return 'confluence';
|
||||
}
|
||||
if (content.trim().startsWith('#') || content.includes('```bash') || content.includes('```sh') || content.match(/^\d+\.\s+/m)) {
|
||||
return 'markdown';
|
||||
}
|
||||
return 'yaml';
|
||||
}
|
||||
@@ -0,0 +1,81 @@
|
||||
import type { RunbookStep } from './index.js';
|
||||
|
||||
export function parseMarkdownRunbook(content: string): RunbookStep[] {
|
||||
const steps: RunbookStep[] = [];
|
||||
const lines = content.split('\n');
|
||||
|
||||
let currentStep: Partial<RunbookStep> | null = null;
|
||||
let inCodeBlock = false;
|
||||
let codeBuffer: string[] = [];
|
||||
|
||||
for (let i = 0; i < lines.length; i++) {
|
||||
const line = lines[i];
|
||||
|
||||
// Check for code block toggle
|
||||
if (line.trim().startsWith('```')) {
|
||||
inCodeBlock = !inCodeBlock;
|
||||
if (!inCodeBlock && currentStep) {
|
||||
// Just closed a code block
|
||||
if (!currentStep.command) {
|
||||
currentStep.command = codeBuffer.join('\n').trim();
|
||||
} else {
|
||||
// If we already have a command, maybe this is expected output?
|
||||
currentStep.expected_output = codeBuffer.join('\n').trim();
|
||||
}
|
||||
codeBuffer = [];
|
||||
} else if (inCodeBlock) {
|
||||
// Just opened a code block, reset buffer
|
||||
codeBuffer = [];
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (inCodeBlock) {
|
||||
codeBuffer.push(line);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check for numbered list (e.g. "1. Do something")
|
||||
const stepMatch = line.match(/^(\d+)\.\s+(.*)$/);
|
||||
if (stepMatch) {
|
||||
// If we have an existing step, save it
|
||||
if (currentStep) {
|
||||
steps.push(finalizeStep(currentStep, steps.length + 1));
|
||||
}
|
||||
|
||||
currentStep = {
|
||||
name: stepMatch[2].trim(),
|
||||
description: '',
|
||||
};
|
||||
continue;
|
||||
}
|
||||
|
||||
// Accumulate description
|
||||
if (currentStep && line.trim() && !line.trim().startsWith('#')) {
|
||||
if (currentStep.description) {
|
||||
currentStep.description += '\n' + line.trim();
|
||||
} else {
|
||||
currentStep.description = line.trim();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (currentStep) {
|
||||
steps.push(finalizeStep(currentStep, steps.length + 1));
|
||||
}
|
||||
|
||||
return steps;
|
||||
}
|
||||
|
||||
function finalizeStep(step: Partial<RunbookStep>, index: number): RunbookStep {
|
||||
return {
|
||||
order: index,
|
||||
name: step.name || `Step ${index}`,
|
||||
description: step.description || '',
|
||||
command: step.command,
|
||||
expected_output: step.expected_output,
|
||||
timeout_seconds: 300,
|
||||
requires_approval: false,
|
||||
risk_level: 'low'
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
import yaml from 'js-yaml';
|
||||
import type { RunbookStep } from './index.js';
|
||||
|
||||
export function parseYamlRunbook(content: string): RunbookStep[] {
|
||||
const parsed = yaml.load(content) as any;
|
||||
const stepsData = Array.isArray(parsed) ? parsed : (parsed?.steps || []);
|
||||
|
||||
if (!Array.isArray(stepsData)) {
|
||||
throw new Error('YAML runbook must be an array or contain a "steps" array');
|
||||
}
|
||||
|
||||
return stepsData.map((step: any, index: number): RunbookStep => {
|
||||
return {
|
||||
order: index + 1,
|
||||
name: step.name || `Step ${index + 1}`,
|
||||
description: step.description || '',
|
||||
command: step.command,
|
||||
expected_output: step.expected_output,
|
||||
timeout_seconds: step.timeout_seconds || 300,
|
||||
requires_approval: step.requires_approval === true,
|
||||
risk_level: step.risk_level || 'low'
|
||||
};
|
||||
});
|
||||
}
|
||||
86
products/SPEC-GAP-ANALYSIS.md
Normal file
86
products/SPEC-GAP-ANALYSIS.md
Normal file
@@ -0,0 +1,86 @@
|
||||
# dd0c Platform - BDD Specification Gap Analysis
|
||||
|
||||
## Executive Summary
|
||||
|
||||
This gap analysis compares the BDD acceptance specifications against the currently implemented Node.js/Fastify source code and PostgreSQL database migrations for the dd0c monorepo (P2-P6).
|
||||
|
||||
Overall, the **Dashboard APIs** required by the React Console are highly implemented across all services. The frontend will successfully render and operate. The major gaps lie in the out-of-band background workers, external agents, robust message queuing (SQS/DLQ), and advanced intelligence/scoring heuristics.
|
||||
|
||||
**Estimated Implementation Completion:**
|
||||
* **P4 - Lightweight IDP:** ~75% (Core scanners, catalog, and search are functional)
|
||||
* **P3 - Alert Intelligence:** ~65% (Ingestion, basic correlation, and UI APIs are solid)
|
||||
* **P5 - AWS Cost Anomaly:** ~50% (Scorer and APIs exist, but CloudTrail ingestion is missing)
|
||||
* **P6 - Runbook Automation:** ~40% (APIs and Slackbot exist; parsing, classification, and agent execution are completely missing)
|
||||
* **P2 - IaC Drift Detection:** ~30% (SaaS ingestion APIs exist; the entire external agent, mTLS, and diff engines are missing)
|
||||
|
||||
---
|
||||
|
||||
## Per-Service Breakdown by Epic
|
||||
|
||||
### P2: IaC Drift Detection
|
||||
* **Epic 1: Drift Detection Agent** ❌ **MISSING** - No Go agent binary. Terraform, CloudFormation, Kubernetes, and Pulumi state scanning engines do not exist. Secret scrubbing logic is missing.
|
||||
* **Epic 2: Agent Communication** 🟡 **PARTIAL** - Basic HTTP ingestion route exists (`/v1/ingest/drift`), but mTLS authentication and SQS FIFO message queues are not implemented.
|
||||
* **Epic 3: Event Processor** 🟡 **PARTIAL** - Ingestion, nonce replay prevention, and PostgreSQL persistence with RLS are implemented. Missing canonical schema normalization and chunked report reassembly.
|
||||
* **Epic 4: Notification Engine** 🟡 **PARTIAL** - Slack Block Kit, Email (Resend), Webhook, and PagerDuty dispatchers are implemented. Missing Daily Digest job and severity-based routing logic.
|
||||
* **Epic 5: Remediation** ❌ **MISSING** - Interactive Slack buttons exist in notification payloads, but the backend workflow engine, approval tracking, and agent-side execution dispatch are missing.
|
||||
* **Epic 6 & 7: Dashboard UI & API** ✅ **IMPLEMENTED** - `fetchStacks`, `fetchStackHistory`, and `fetchLatestReport` endpoints are fully implemented with tenant RLS.
|
||||
* **Epic 8 & 9: Infrastructure / PLG** ❌ **MISSING** - No CDK templates, CI/CD pipelines, Stripe billing, or CLI setup logic.
|
||||
* **Epic 10: Transparent Factory** 🟡 **PARTIAL** - Database migrations and RLS are implemented. Missing Feature Flag service and OTEL Tracing.
|
||||
|
||||
### P3: Alert Intelligence
|
||||
* **Epic 1: Webhook Ingestion** 🟡 **PARTIAL** - Webhook routes and HMAC validation for Datadog, PagerDuty, OpsGenie, and Grafana are implemented via Redis queue. Missing S3 archival, oversized payload handling, and SQS/DLQ.
|
||||
* **Epic 2: Alert Normalization** 🟡 **PARTIAL** - Basic provider mapping logic exists in `webhook-processor.ts`.
|
||||
* **Epic 3: Correlation Engine** 🟡 **PARTIAL** - Time-window correlation and fingerprint deduplication are implemented using Redis. Missing Service-Affinity matching and strict cross-tenant worker isolation.
|
||||
* **Epic 4: Notification & Escalation** 🟡 **PARTIAL** - Slack, Email, and Webhook dispatchers are implemented. Missing PagerDuty auto-escalation cron and Daily Noise Report.
|
||||
* **Epic 5: Slack Bot** 🟡 **PARTIAL** - Missing interactive feedback button handlers (`/slack/interactions`) for noise/helpful marking, and missing `/dd0c` slash commands.
|
||||
* **Epic 6 & 7: Dashboard UI & API** 🟡 **PARTIAL** - Incident CRUD, filtering, and summary endpoints are implemented. Missing `MTTR` and `Noise Reduction` analytics endpoints requested by the spec.
|
||||
* **Epic 8 & 9: Infrastructure / PLG** ❌ **MISSING** - No CDK, Stripe billing, or Free Tier (10K alerts/month) limit enforcement.
|
||||
|
||||
### P4: Lightweight IDP
|
||||
* **Epic 1: AWS Discovery Scanner** 🟡 **PARTIAL** - ECS, Lambda, and RDS resource discovery implemented. Missing CloudFormation, API Gateway, and Step Functions orchestration.
|
||||
* **Epic 2: GitHub Discovery Scanner** 🟡 **PARTIAL** - Repository fetching, pagination, and basic `package.json`/`Dockerfile` heuristics implemented. Missing advanced CODEOWNERS and commit history parsing.
|
||||
* **Epic 3: Service Catalog** 🟡 **PARTIAL** - Catalog ingestion, partial update staging, ownership resolution, and DB APIs implemented. Missing PagerDuty/OpsGenie on-call mapping.
|
||||
* **Epic 4: Search Engine** 🟡 **PARTIAL** - Meilisearch integration with PostgreSQL fallback implemented. Missing Redis prefix caching for `Cmd+K` performance optimization.
|
||||
* **Epic 5: Dashboard API** ✅ **IMPLEMENTED** - Service CRUD and ownership summary endpoints are fully functional and align with Console requirements.
|
||||
* **Epic 6: Analytics Dashboards** ❌ **MISSING** - API endpoints for Ownership Coverage, Health Scorecards, and Tech Debt tracking are missing.
|
||||
|
||||
### P5: AWS Cost Anomaly
|
||||
* **Epic 1: CloudTrail Ingestion** ❌ **MISSING** - A batch ingestion API exists, but the AWS EventBridge cross-account rules, SQS FIFO, and Lambda normalizer are entirely missing.
|
||||
* **Epic 2: Anomaly Detection** 🟡 **PARTIAL** - Welford's algorithm and basic Z-Score computation are implemented. Missing novelty scoring, cold-start fast path, and composite scoring logic.
|
||||
* **Epic 3: Zombie Hunter** ❌ **MISSING** - No scheduled jobs or logic to detect idle EC2, RDS, or EBS resources.
|
||||
* **Epic 4: Notification & Remediation** 🟡 **PARTIAL** - Slack notification generation is implemented. Missing the `/slack/interactions` endpoint to process remediation buttons (e.g., Stop Instance).
|
||||
* **Epic 6 & 7: Dashboard UI & API** ✅ **IMPLEMENTED** - Anomalies, Baselines, and Governance rule CRUD endpoints match Console expectations.
|
||||
* **Epic 10: Transparent Factory** 🟡 **PARTIAL** - The 14-day `GovernanceEngine` (Shadow -> Audit -> Enforce) auto-promotion and Panic Mode logic is implemented. Missing Circuit Breakers and OTEL spans.
|
||||
|
||||
### P6: Runbook Automation
|
||||
* **Epic 1: Runbook Parser** ❌ **MISSING** - The system currently expects raw YAML inputs. Confluence HTML, Notion Markdown, and LLM step extraction parsing engines are entirely missing.
|
||||
* **Epic 2: Action Classifier** ❌ **MISSING** - Neither the deterministic regex safety scanner nor the secondary LLM risk classifier exist.
|
||||
* **Epic 3: Execution Engine** 🟡 **PARTIAL** - Basic state transitions are handled in `api/runbooks.ts`. Missing Trust Level enforcement, network partition recovery, and step idempotency logic.
|
||||
* **Epic 4: Agent** ❌ **MISSING** - No Go agent binary, gRPC bidirectional streaming, or local sandbox execution environments exist.
|
||||
* **Epic 5: Audit Trail** 🟡 **PARTIAL** - Basic Postgres `audit_entries` table exists. Missing the immutable append-only hash chain logic and CSV/PDF compliance export APIs.
|
||||
* **Epic 6: Dashboard API** ✅ **IMPLEMENTED** - Runbook, execution, and approval APIs are implemented. Redis pub/sub Agent Bridge exists. Slackbot interaction handlers are fully implemented with signature verification.
|
||||
|
||||
---
|
||||
|
||||
## Priority Ranking (What to Implement Next)
|
||||
|
||||
This ranking is based on maximizing time-to-value: prioritizing services where the Console UI is already supported, the backend logic is mostly complete, and the remaining gaps are well-defined.
|
||||
|
||||
**1. P4 - Lightweight IDP**
|
||||
* **Why:** It is functionally the most complete. The Console APIs work, Meilisearch sync works, and basic AWS/GitHub discovery is operational.
|
||||
* **Next Steps:** Implement the missing AWS scanners (CloudFormation, API Gateway) and the `Redis` prefix caching for search. Add the analytics endpoints (Ownership, Health, Tech Debt) to unlock the remaining UI views.
|
||||
|
||||
**2. P3 - Alert Intelligence**
|
||||
* **Why:** The core pipeline (Webhook -> Redis -> Worker -> DB) is functional and deduplication logic works. Console APIs are satisfied.
|
||||
* **Next Steps:** Build the `MTTR` and `Noise Reduction` analytics SQL queries, add PagerDuty escalation triggers, and implement the interactive Slack button handlers.
|
||||
|
||||
**3. P5 - AWS Cost Anomaly**
|
||||
* **Why:** The complex math (Welford running stats) and database governance logic are done, making the dashboard functional for demo data.
|
||||
* **Next Steps:** The biggest blocker is that there is no data pipeline. Implement the CDK stack to deploy the EventBridge rules and the `Lambda Normalizer` to translate CloudTrail events into the existing `/v1/ingest` API.
|
||||
|
||||
**4. P6 - Runbook Automation**
|
||||
* **Why:** The API orchestration, Slack integrations, and Redis Pub/Sub bridges are nicely implemented, but it is currently a "brain without a body."
|
||||
* **Next Steps:** Build the two remaining standalone systems: the `Runbook Parser` (LLM + AST logic) and the external `Agent` (Go binary with gRPC and sandboxing) — both are large, independent efforts.
|
||||
|
||||
**5. P2 - IaC Drift Detection**
|
||||
* **Why:** Furthest from completion. While the SaaS API exists, it requires a highly complex external Go agent capable of reading Terraform/K8s/Pulumi state, a secure mTLS CA registration system, and a diffing/scoring engine—none of which currently exist.
|
||||
Reference in New Issue
Block a user