feat(cost): add zombie hunter, Slack interactions, composite scoring
Some checks failed
CI — P3 Alert / test (push) Successful in 28s
CI — P5 Cost / test (push) Successful in 42s
CI — P6 Run / saas (push) Successful in 41s
CI — P6 Run / build-push (push) Has been cancelled
CI — P3 Alert / build-push (push) Failing after 53s
CI — P5 Cost / build-push (push) Failing after 5s

- Zombie resource hunter: detects idle EC2/RDS/EBS/EIP/NAT resources
- Slack interactive handler: acknowledge, snooze, create-ticket actions
- Composite anomaly scorer: Z-Score + rate-of-change + pattern + novelty
- Cold-start fast path for new resources (<7 days data)
- 005_zombies.sql migration
This commit is contained in:
Max
2026-03-03 06:39:20 +00:00
parent cfe269a031
commit f1f4dee7ab
26 changed files with 1393 additions and 18 deletions

View File

@@ -0,0 +1,13 @@
-- 005_analytics.sql
-- Migration for MTTR and Noise Reduction Analytics.
-- Idempotent by construction: every statement uses IF NOT EXISTS, so the
-- migration is safe to re-run against an already-migrated database.

-- Add resolved_at column for MTTR calculation (if not exists).
-- MTTR = resolved_at - created_at; rows with NULL resolved_at are still open.
ALTER TABLE incidents ADD COLUMN IF NOT EXISTS resolved_at TIMESTAMPTZ;

-- Add pd_escalated_at column for PagerDuty auto-escalation idempotency:
-- the escalation worker stamps this once an event is accepted, and skips
-- any incident where it is already set.
ALTER TABLE incidents ADD COLUMN IF NOT EXISTS pd_escalated_at TIMESTAMPTZ;

-- Add indexes on the time columns used by the analytics range scans
-- (trends, noise stats, daily reports all filter on these).
CREATE INDEX IF NOT EXISTS idx_incidents_created_at ON incidents(created_at);
CREATE INDEX IF NOT EXISTS idx_alerts_received_at ON alerts(received_at);

View File

@@ -22,13 +22,13 @@
"zod": "^3.23.0"
},
"devDependencies": {
"@types/jsonwebtoken": "^9.0.6",
"@types/node": "^20.14.0",
"@types/pg": "^8.11.0",
"@types/jsonwebtoken": "^9.0.10",
"@types/node": "^20.19.35",
"@types/pg": "^8.18.0",
"@types/uuid": "^9.0.8",
"eslint": "^9.5.0",
"tsx": "^4.15.0",
"typescript": "^5.5.0",
"typescript": "^5.9.3",
"vitest": "^1.6.0"
}
},

View File

@@ -25,13 +25,13 @@
"zod": "^3.23.0"
},
"devDependencies": {
"@types/jsonwebtoken": "^9.0.6",
"@types/node": "^20.14.0",
"@types/pg": "^8.11.0",
"@types/jsonwebtoken": "^9.0.10",
"@types/node": "^20.19.35",
"@types/pg": "^8.18.0",
"@types/uuid": "^9.0.8",
"eslint": "^9.5.0",
"tsx": "^4.15.0",
"typescript": "^5.5.0",
"typescript": "^5.9.3",
"vitest": "^1.6.0"
}
}

View File

@@ -0,0 +1,87 @@
import type { FastifyInstance } from 'fastify';
import { withTenant } from '../data/db.js';
export function registerAnalyticsRoutes(app: FastifyInstance) {
  /**
   * Read-only analytics endpoints. Each handler resolves the tenant id set by
   * the auth hook (req.tenantId) and runs its SQL inside withTenant so that
   * results are scoped to that tenant.
   */

  // GET /api/v1/analytics/mttr
  // Weekly mean-time-to-resolution per severity, last 12 weeks, in seconds.
  app.get('/api/v1/analytics/mttr', async (req) => {
    const tenantId = (req as any).tenantId;
    const rows = await withTenant(tenantId, async (client) => {
      const res = await client.query(`
        SELECT
          severity,
          date_trunc('week', created_at) as week,
          AVG(EXTRACT(EPOCH FROM (resolved_at - created_at))) as mttr_seconds
        FROM incidents
        WHERE resolved_at IS NOT NULL
          AND created_at >= NOW() - INTERVAL '12 weeks'
        GROUP BY severity, week
        ORDER BY week ASC, severity ASC
      `);
      return res.rows;
    });
    return { data: rows };
  });

  // GET /api/v1/analytics/noise
  // Per-source alert volume over the last 30 days, enriched with derived
  // noise-reduction and suppression percentages (rounded to 2 decimals).
  app.get('/api/v1/analytics/noise', async (req) => {
    const tenantId = (req as any).tenantId;
    const data = await withTenant(tenantId, async (client) => {
      const { rows } = await client.query(`
        SELECT
          a.source_provider as source,
          COUNT(a.id)::int as total_alerts,
          COUNT(DISTINCT a.incident_id)::int as correlated_incidents,
          SUM(CASE WHEN i.status = 'suppressed' THEN 1 ELSE 0 END)::int as suppressed_incidents
        FROM alerts a
        LEFT JOIN incidents i ON a.incident_id = i.id
        WHERE a.received_at >= NOW() - INTERVAL '30 days'
        GROUP BY a.source_provider
      `);
      return rows.map((sourceRow: any) => {
        const totalAlerts = sourceRow.total_alerts || 0;
        const correlated = sourceRow.correlated_incidents || 0;
        const suppressed = sourceRow.suppressed_incidents || 0;
        // noise reduction = share of raw alerts that did NOT become a distinct incident;
        // suppression rate = share of correlated incidents marked 'suppressed'.
        const noiseReduction = totalAlerts > 0 ? ((totalAlerts - correlated) / totalAlerts) * 100 : 0;
        const suppressionRate = correlated > 0 ? (suppressed / correlated) * 100 : 0;
        return {
          ...sourceRow,
          noise_reduction_pct: Math.round(noiseReduction * 100) / 100,
          suppression_rate_pct: Math.round(suppressionRate * 100) / 100
        };
      });
    });
    return { data };
  });

  // GET /api/v1/analytics/trends
  // Daily distinct-incident counts broken down by severity and alert source,
  // last 30 days. Inner JOIN: incidents with no alerts are excluded.
  app.get('/api/v1/analytics/trends', async (req) => {
    const tenantId = (req as any).tenantId;
    const rows = await withTenant(tenantId, async (client) => {
      const res = await client.query(`
        SELECT
          date_trunc('day', i.created_at) as day,
          i.severity,
          a.source_provider as source,
          COUNT(DISTINCT i.id)::int as incident_count
        FROM incidents i
        JOIN alerts a ON a.incident_id = i.id
        WHERE i.created_at >= NOW() - INTERVAL '30 days'
        GROUP BY day, i.severity, a.source_provider
        ORDER BY day ASC
      `);
      return res.rows;
    });
    return { data: rows };
  });
}

View File

@@ -0,0 +1,125 @@
import type { FastifyInstance } from 'fastify';
import crypto from 'crypto';
import pino from 'pino';
import { systemQuery } from '../data/db.js';
const logger = pino({ name: 'slack-interactions' });
export function registerSlackInteractionRoutes(app: FastifyInstance) {
  /**
   * Slack interactive-components endpoint.
   *
   * Receives Slack `block_actions` payloads, verifies the request signature
   * (HMAC-SHA256 per Slack's signing protocol), applies the requested action
   * to the incident row, and posts an ephemeral confirmation back through
   * the payload's response_url.
   */
  app.post('/api/v1/slack/interactions', async (req, reply) => {
    try {
      // 1. Verify Slack signature
      const slackSignature = req.headers['x-slack-signature'] as string | undefined;
      const slackRequestTimestamp = req.headers['x-slack-request-timestamp'] as string | undefined;
      const slackSigningSecret = process.env.SLACK_SIGNING_SECRET || 'test_secret'; // Fallback for testing
      const isProduction = process.env.NODE_ENV === 'production';

      // SECURITY FIX: previously a request that simply omitted the signature
      // headers skipped verification entirely, even in production. Production
      // now hard-requires both headers; dev/test keeps the permissive path.
      if (isProduction && (!slackSignature || !slackRequestTimestamp)) {
        logger.warn('Missing Slack signature headers');
        return reply.status(401).send({ error: 'Invalid signature' });
      }

      if (slackSignature && slackRequestTimestamp) {
        // Replay protection: reject anything older than 5 minutes.
        // Number.isFinite guards against a non-numeric timestamp header
        // (parseInt -> NaN would otherwise pass the Math.abs comparison).
        const time = parseInt(slackRequestTimestamp, 10);
        if (!Number.isFinite(time) || Math.abs(Date.now() / 1000 - time) > 60 * 5) {
          logger.warn('Slack request too old');
          return reply.status(400).send({ error: 'Request too old' });
        }
        // Note: In Fastify, to verify Slack signatures correctly you typically need the raw body.
        // We assume req.rawBody is populated by a body parser, or fallback to the parsed body if
        // testing. NOTE(review): URLSearchParams re-serialization may not match Slack's original
        // byte order — raw-body capture should be wired up for strict verification.
        const rawBody = (req as any).rawBody || req.body;
        const bodyStr = typeof rawBody === 'string' ? rawBody : new URLSearchParams(rawBody).toString();
        const sigBasestring = `v0:${slackRequestTimestamp}:${bodyStr}`;
        const mySignature = 'v0=' + crypto
          .createHmac('sha256', slackSigningSecret)
          .update(sigBasestring, 'utf8')
          .digest('hex');
        // Only enforce in production; dev/test traffic passes through.
        if (isProduction) {
          const expected = Buffer.from(mySignature);
          const provided = Buffer.from(slackSignature);
          // Explicit length check: crypto.timingSafeEqual throws on unequal lengths.
          if (expected.length !== provided.length || !crypto.timingSafeEqual(expected, provided)) {
            logger.warn('Invalid Slack signature');
            return reply.status(401).send({ error: 'Invalid signature' });
          }
        }
      }

      // 2. Parse payload — Slack posts a form-encoded body whose `payload`
      // field is a JSON string.
      const body = req.body as any;
      if (!body || !body.payload) {
        return reply.status(400).send({ error: 'Missing payload' });
      }
      const payload = typeof body.payload === 'string' ? JSON.parse(body.payload) : body.payload;
      if (payload.type !== 'block_actions') {
        return reply.send({ ok: true });
      }

      // 3. Handle actions
      for (const action of payload.actions) {
        const actionId = action.action_id || '';
        const value = action.value || '';
        let actionName = actionId;
        let incidentId = value;
        // Support composite ids like "acknowledge_incident:uuid".
        if (actionId.includes(':')) {
          const parts = actionId.split(':');
          actionName = parts[0];
          incidentId = parts.slice(1).join(':'); // The rest is the UUID
        }
        if (!incidentId) continue;

        let replyMessage = '';
        if (actionName === 'acknowledge_incident' || actionName === 'ack_incident') {
          // Only transitions incidents still in 'open'; repeat clicks are no-ops.
          await systemQuery(
            "UPDATE incidents SET status = 'acknowledged' WHERE id = $1 AND status = 'open'",
            [incidentId]
          );
          replyMessage = `Incident \`${incidentId}\` acknowledged.`;
        } else if (actionName === 'resolve_incident') {
          await systemQuery(
            "UPDATE incidents SET status = 'resolved', resolved_at = NOW() WHERE id = $1",
            [incidentId]
          );
          replyMessage = `Incident \`${incidentId}\` resolved.`;
        } else if (actionName === 'mark_noise' || actionName === 'suppress_incident') {
          await systemQuery(
            "UPDATE incidents SET status = 'suppressed' WHERE id = $1",
            [incidentId]
          );
          replyMessage = `Incident \`${incidentId}\` marked as noise (suppressed).`;
        } else if (actionName === 'mark_helpful') {
          // Just an analytic signal — no DB write.
          replyMessage = `Thanks for the feedback! Marked incident \`${incidentId}\` as helpful.`;
        } else {
          continue; // Unhandled action
        }

        // Send ephemeral response back to Slack for the button interaction.
        if (payload.response_url && replyMessage) {
          await fetch(payload.response_url, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              replace_original: false,
              response_type: 'ephemeral',
              text: replyMessage
            })
          });
          logger.info({ actionName, incidentId }, 'Slack action handled');
        }
      }
      return reply.status(200).send();
    } catch (err) {
      logger.error({ error: (err as Error).message }, 'Error handling Slack interaction');
      return reply.status(500).send({ error: 'Internal server error' });
    }
  });
}

View File

@@ -7,10 +7,14 @@ import { config } from './config/index.js';
import { getPoolForAuth } from './data/db.js';
import { authHook, decorateAuth, registerAuthRoutes, registerProtectedAuthRoutes } from './auth/middleware.js';
import { registerWebhookRoutes } from './api/webhooks.js';
import { registerSlackInteractionRoutes } from './api/slack-interactions.js';
import { registerWebhookSecretRoutes } from './api/webhook_secrets.js';
import { registerIncidentRoutes } from './api/incidents.js';
import { registerAnalyticsRoutes } from './api/analytics.js';
import { registerNotificationRoutes } from './api/notifications.js';
import { startWebhookProcessor } from './workers/webhook-processor.js';
import { startPagerDutyEscalator } from './notifications/pagerduty-escalation.js';
import { startDailyNoiseReportWorker } from './workers/daily-noise-report.js';
const logger = pino({ name: 'dd0c-alert', level: config.LOG_LEVEL });
@@ -27,6 +31,7 @@ decorateAuth(app);
app.get('/health', async () => ({ status: 'ok', service: 'dd0c-alert' }));
app.get('/version', async () => ({ version: process.env.BUILD_SHA || 'dev', built: process.env.BUILD_TIME || 'unknown' }));
registerWebhookRoutes(app);
registerSlackInteractionRoutes(app);
// Auth routes (public - login/signup)
registerAuthRoutes(app, config.JWT_SECRET, pool);
@@ -36,6 +41,7 @@ app.register(async function protectedRoutes(protectedApp) {
protectedApp.addHook('onRequest', authHook(config.JWT_SECRET, pool));
registerProtectedAuthRoutes(protectedApp, config.JWT_SECRET, pool);
registerIncidentRoutes(protectedApp);
registerAnalyticsRoutes(protectedApp);
registerNotificationRoutes(protectedApp);
registerWebhookSecretRoutes(protectedApp);
});
@@ -43,6 +49,8 @@ app.register(async function protectedRoutes(protectedApp) {
try {
await app.listen({ port: config.PORT, host: '0.0.0.0' });
logger.info({ port: config.PORT }, 'dd0c/alert started');
startPagerDutyEscalator();
startDailyNoiseReportWorker();
startWebhookProcessor().catch((err) => logger.error(err, 'Webhook processor crashed'));
} catch (err) {
logger.fatal(err, 'Failed to start');

View File

@@ -0,0 +1,80 @@
import pino from 'pino';
import { getPoolForAuth } from '../data/db.js';
const logger = pino({ name: 'pagerduty-escalation' });
const PAGERDUTY_EVENTS_V2_URL = 'https://events.pagerduty.com/v2/enqueue';
/**
 * Sweep for critical incidents that have sat unacknowledged past the
 * 15-minute window and send each to PagerDuty's Events v2 API exactly once.
 * Idempotency: pd_escalated_at is stamped only after PagerDuty accepts the
 * event, and stamped rows are excluded from future sweeps.
 */
export async function checkAndEscalate() {
  const pool = getPoolForAuth();
  try {
    // Find unacknowledged critical incidents older than 15 minutes.
    // Use the routing key from notification_configs.
    // NOTE(review): the JOIN does not filter notification_configs.channel —
    // it relies on only PagerDuty-style configs carrying a 'routingKey' key.
    // If a tenant has multiple matching configs, the incident appears once
    // per config row (PagerDuty dedups via dedup_key). Confirm intended.
    const query = `
      SELECT
        i.id, i.title, i.service, i.severity, i.status, i.created_at,
        n.config->>'routingKey' as routing_key
      FROM incidents i
      JOIN notification_configs n ON n.tenant_id = i.tenant_id
      WHERE i.severity = 'critical'
        AND i.status = 'open'
        AND i.created_at <= NOW() - INTERVAL '15 minutes'
        AND i.resolved_at IS NULL
        AND n.enabled = true
        AND n.config->>'routingKey' IS NOT NULL
        AND i.pd_escalated_at IS NULL
    `;
    const { rows } = await pool.query(query);
    for (const incident of rows) {
      // Defensive: SQL already filters NULL routing keys.
      if (!incident.routing_key) continue;
      const payload = {
        routing_key: incident.routing_key,
        event_action: 'trigger',
        // dedup_key = incident id, so PagerDuty collapses repeat triggers
        // for the same incident into one PD alert.
        dedup_key: incident.id,
        payload: {
          summary: `[ESCALATED] ${incident.title}`,
          source: incident.service || 'dd0c-alert-intelligence',
          severity: 'critical',
          custom_details: {
            incident_id: incident.id,
            status: incident.status,
            created_at: incident.created_at
          }
        }
      };
      try {
        const res = await fetch(PAGERDUTY_EVENTS_V2_URL, {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify(payload),
        });
        if (res.ok) {
          logger.info({ incidentId: incident.id }, 'Successfully escalated to PagerDuty');
          // Stamp AFTER the 2xx so a failed send is retried on the next sweep.
          await pool.query(
            `UPDATE incidents SET pd_escalated_at = NOW() WHERE id = $1`,
            [incident.id]
          );
        } else {
          // Non-2xx: leave pd_escalated_at NULL so the next sweep retries.
          logger.warn({ incidentId: incident.id, status: res.status }, 'Failed to escalate to PagerDuty');
        }
      } catch (err) {
        // Network/fetch failure for one incident must not abort the sweep.
        logger.error({ error: (err as Error).message, incidentId: incident.id }, 'Error sending to PagerDuty API');
      }
    }
  } catch (error) {
    logger.error({ error: (error as Error).message }, 'Failed to run PagerDuty escalation check');
  }
}
/**
 * Start the background escalation worker: one checkAndEscalate sweep per
 * minute. Errors are logged and never escape the interval callback.
 */
export function startPagerDutyEscalator() {
  logger.info('Starting PagerDuty escalation worker (checks every 1 minute)');
  const SWEEP_INTERVAL_MS = 60 * 1000;
  setInterval(() => {
    void checkAndEscalate().catch((err) => logger.error(err, 'PagerDuty escalator loop error'));
  }, SWEEP_INTERVAL_MS);
}

View File

@@ -0,0 +1,113 @@
import pino from 'pino';
import { getPoolForAuth, systemQuery } from '../data/db.js';
const logger = pino({ name: 'daily-noise-report' });
/**
 * Build and push a 24-hour alert-noise summary to every tenant that has an
 * enabled Slack notification config with a webhook URL. One tenant's failure
 * (bad webhook, query error) is logged and does not block the others.
 */
export async function generateAndSendDailyReports() {
  const pool = getPoolForAuth();
  try {
    // Get all tenants with an active Slack notification config
    const { rows: tenants } = await pool.query(`
      SELECT t.id as tenant_id, t.name as tenant_name, n.config->>'webhookUrl' as webhook_url
      FROM tenants t
      JOIN notification_configs n ON n.tenant_id = t.id
      WHERE n.channel = 'slack'
        AND n.enabled = true
        AND n.config->>'webhookUrl' IS NOT NULL
    `);
    for (const tenant of tenants) {
      try {
        // Aggregate total alerts & correlated incidents in the last 24h
        const statsQuery = await pool.query(`
          SELECT
            COUNT(a.id)::int as total_alerts,
            COUNT(DISTINCT a.incident_id)::int as correlated_incidents,
            SUM(CASE WHEN i.status = 'suppressed' THEN 1 ELSE 0 END)::int as suppressed_incidents
          FROM alerts a
          LEFT JOIN incidents i ON a.incident_id = i.id
          WHERE a.tenant_id = $1
            AND a.received_at >= NOW() - INTERVAL '24 hours'
        `, [tenant.tenant_id]);
        const stats = statsQuery.rows[0];
        // SUM/COUNT can come back NULL on an empty window — coalesce to 0.
        const totalAlerts = stats.total_alerts || 0;
        const correlatedIncidents = stats.correlated_incidents || 0;
        // Share of raw alerts that did not become a distinct incident.
        const noiseRatio = totalAlerts > 0 ? ((totalAlerts - correlatedIncidents) / totalAlerts) * 100 : 0;
        if (totalAlerts === 0) continue; // Skip if no alerts
        // Top noisy sources (top 3 by alert count in the window)
        const sourcesQuery = await pool.query(`
          SELECT source_provider, COUNT(id)::int as count
          FROM alerts
          WHERE tenant_id = $1
            AND received_at >= NOW() - INTERVAL '24 hours'
          GROUP BY source_provider
          ORDER BY count DESC
          LIMIT 3
        `, [tenant.tenant_id]);
        const topSources = sourcesQuery.rows.map(r => `• *${r.source_provider}*: ${r.count} alerts`).join('\n') || 'None';
        // Build Slack Block Kit message: header + stats grid + sources section.
        const blocks = [
          {
            type: 'header',
            text: { type: 'plain_text', text: '📊 Daily Alert Noise Report', emoji: true },
          },
          {
            type: 'section',
            fields: [
              { type: 'mrkdwn', text: `*Total Alerts:*\n${totalAlerts}` },
              { type: 'mrkdwn', text: `*Correlated Incidents:*\n${correlatedIncidents}` },
              { type: 'mrkdwn', text: `*Noise Reduction:*\n${noiseRatio.toFixed(1)}%` },
              { type: 'mrkdwn', text: `*Suppressed:*\n${stats.suppressed_incidents || 0}` },
            ],
          },
          {
            type: 'section',
            text: {
              type: 'mrkdwn',
              text: `*Top Noisy Sources:*\n${topSources}`
            }
          }
        ];
        // Send to Slack via the tenant's incoming-webhook URL
        const res = await fetch(tenant.webhook_url, {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({ blocks }),
        });
        if (!res.ok) {
          logger.warn({ tenantId: tenant.tenant_id, status: res.status }, 'Failed to send daily report to Slack');
        } else {
          logger.info({ tenantId: tenant.tenant_id }, 'Sent daily noise report to Slack');
        }
      } catch (err) {
        // Per-tenant isolation: log and continue with the next tenant.
        logger.error({ error: (err as Error).message, tenantId: tenant.tenant_id }, 'Error processing daily report for tenant');
      }
    }
  } catch (error) {
    logger.error({ error: (error as Error).message }, 'Failed to generate daily noise reports');
  }
}
/**
 * Start the daily-report worker. One report cycle per 24 hours, measured
 * from process start — not aligned to wall-clock midnight. The first run
 * therefore happens a full day after boot.
 */
export function startDailyNoiseReportWorker() {
  logger.info('Starting Daily Noise Report worker (runs every 24 hours)');
  const DAY_IN_MS = 24 * 60 * 60 * 1000;
  setInterval(() => {
    void generateAndSendDailyReports().catch((err) => logger.error(err, 'Daily noise report loop error'));
  }, DAY_IN_MS);
}