Implement review remediation + PLG analytics SDK
- All 6 test architectures patched with Section 11 addendums - P5 (cost) fully rewritten from 232 to ~600 lines - PLG brainstorm + party mode advisory board results - Analytics SDK v2 (PostHog Cloud, Zod strict, Lambda-safe) - Analytics tests v2 (safeParse, no , no timestamp, no PII) - Addresses all Gemini review findings across P1-P6
This commit is contained in:
@@ -1409,3 +1409,459 @@ Before any release, these tests must pass:
|
||||
---
|
||||
|
||||
*End of dd0c/alert Test Architecture*
|
||||
|
||||
---
|
||||
|
||||
## 11. Review Remediation Addendum (Post-Gemini Review)
|
||||
|
||||
### 11.1 Missing Epic Coverage
|
||||
|
||||
#### Epic 6: Dashboard API
|
||||
|
||||
```typescript
|
||||
describe('Dashboard API', () => {
|
||||
describe('Authentication', () => {
|
||||
it('returns 401 for missing Cognito JWT', async () => {});
|
||||
it('returns 401 for expired JWT', async () => {});
|
||||
it('returns 401 for JWT signed by wrong issuer', async () => {});
|
||||
it('extracts tenantId from JWT claims', async () => {});
|
||||
});
|
||||
|
||||
describe('Incident Listing (GET /v1/incidents)', () => {
|
||||
it('returns paginated incidents for authenticated tenant', async () => {});
|
||||
it('supports cursor-based pagination', async () => {});
|
||||
it('filters by status (open, acknowledged, resolved)', async () => {});
|
||||
it('filters by severity (critical, warning, info)', async () => {});
|
||||
it('filters by time range (since, until)', async () => {});
|
||||
it('returns empty array for tenant with no incidents', async () => {});
|
||||
});
|
||||
|
||||
describe('Incident Detail (GET /v1/incidents/:id)', () => {
|
||||
it('returns full incident with correlated alerts', async () => {});
|
||||
it('returns 404 for incident belonging to different tenant', async () => {});
|
||||
it('includes timeline of state transitions', async () => {});
|
||||
});
|
||||
|
||||
describe('Analytics (GET /v1/analytics)', () => {
|
||||
it('returns MTTR for last 7/30/90 days', async () => {});
|
||||
it('returns alert volume by source', async () => {});
|
||||
it('returns noise reduction percentage', async () => {});
|
||||
it('scopes all analytics to authenticated tenant', async () => {});
|
||||
});
|
||||
|
||||
describe('Tenant Isolation', () => {
|
||||
it('tenant A cannot read tenant B incidents via API', async () => {});
|
||||
it('tenant A cannot read tenant B analytics', async () => {});
|
||||
it('all DynamoDB queries include tenantId partition key', async () => {});
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
#### Epic 7: Dashboard UI (Playwright)
|
||||
|
||||
```typescript
|
||||
// tests/e2e/ui/dashboard.spec.ts
|
||||
|
||||
test('login redirects to Cognito hosted UI', async ({ page }) => {
|
||||
await page.goto('/dashboard');
|
||||
await expect(page).toHaveURL(/cognito/);
|
||||
});
|
||||
|
||||
test('incident list renders with correct severity badges', async ({ page }) => {
|
||||
await page.goto('/dashboard/incidents');
|
||||
await expect(page.locator('[data-testid="incident-card"]')).toHaveCount(5);
|
||||
await expect(page.locator('.severity-critical')).toBeVisible();
|
||||
});
|
||||
|
||||
// Verifies that the incident detail page renders the alert timeline container
// and that the timeline holds more than one event.
test('incident detail shows correlated alert timeline', async ({ page }) => {
  await page.goto('/dashboard/incidents/inc-123');
  await expect(page.locator('[data-testid="alert-timeline"]')).toBeVisible();

  // Playwright ships no `toHaveCountGreaterThan` matcher. Polling the live count
  // keeps the assertion retrying while the timeline is still rendering.
  await expect
    .poll(() => page.locator('.timeline-event').count())
    .toBeGreaterThan(1);
});
|
||||
|
||||
test('MTTR chart renders with real data', async ({ page }) => {
|
||||
await page.goto('/dashboard/analytics');
|
||||
await expect(page.locator('[data-testid="mttr-chart"]')).toBeVisible();
|
||||
});
|
||||
|
||||
test('noise reduction percentage displays correctly', async ({ page }) => {
|
||||
await page.goto('/dashboard/analytics');
|
||||
const noise = page.locator('[data-testid="noise-reduction"]');
|
||||
await expect(noise).toContainText('%');
|
||||
});
|
||||
|
||||
// Verifies that adding the Datadog integration surfaces an ingest webhook URL.
test('webhook setup wizard generates correct URL', async ({ page }) => {
  await page.goto('/dashboard/settings/integrations');
  await page.locator('[data-testid="add-datadog"]').click();

  // `toHaveText` retries until the element's text matches. A one-shot
  // `textContent()` read can observe null or not-yet-rendered text and does not retry.
  await expect(page.locator('[data-testid="webhook-url"]')).toHaveText(
    /\/v1\/webhooks\/ingest\/.+/
  );
});
|
||||
```
|
||||
|
||||
#### Epic 9: Onboarding & PLG
|
||||
|
||||
```typescript
|
||||
describe('Free Tier Enforcement', () => {
|
||||
it('allows up to 10,000 alerts/month on free tier', async () => {});
|
||||
it('returns 429 with upgrade prompt at 10,001st alert', async () => {});
|
||||
it('resets counter on first of each month', async () => {});
|
||||
it('purges alert data older than 7 days on free tier', async () => {});
|
||||
it('retains alert data for 90 days on pro tier', async () => {});
|
||||
});
|
||||
|
||||
describe('OAuth Signup', () => {
|
||||
it('creates tenant record on first Cognito login', async () => {});
|
||||
it('assigns free tier by default', async () => {});
|
||||
it('generates unique webhook URL per tenant', async () => {});
|
||||
});
|
||||
|
||||
describe('Stripe Integration', () => {
|
||||
it('creates checkout session with correct pricing', async () => {});
|
||||
it('upgrades tenant on checkout.session.completed webhook', async () => {});
|
||||
it('downgrades tenant on subscription.deleted webhook', async () => {});
|
||||
it('validates Stripe webhook signature', async () => {});
|
||||
});
|
||||
```
|
||||
|
||||
#### Epic 5.3: Slack Feedback Endpoint
|
||||
|
||||
```typescript
|
||||
describe('Slack Interactive Actions Endpoint', () => {
|
||||
it('validates Slack request signature (HMAC-SHA256)', async () => {});
|
||||
it('rejects request with invalid signature', async () => {});
|
||||
it('handles "helpful" feedback — updates incident quality score', async () => {});
|
||||
it('handles "noise" feedback — adds to suppression training data', async () => {});
|
||||
it('handles "escalate" action — triggers PagerDuty/OpsGenie', async () => {});
|
||||
it('updates original Slack message after action', async () => {});
|
||||
it('scopes action to correct tenant', async () => {});
|
||||
});
|
||||
```
|
||||
|
||||
#### Epic 1.4: S3 Raw Payload Archival
|
||||
|
||||
```typescript
|
||||
describe('Raw Payload Archival', () => {
|
||||
it('saves raw webhook payload to S3 asynchronously', async () => {});
|
||||
it('S3 key includes tenantId, source, and timestamp', async () => {});
|
||||
it('archival failure does not block alert processing', async () => {});
|
||||
it('archived payload is retrievable for replay', async () => {});
|
||||
it('S3 lifecycle policy deletes after retention period', async () => {});
|
||||
});
|
||||
```
|
||||
|
||||
### 11.2 Anti-Pattern Fixes
|
||||
|
||||
#### Replace ioredis-mock with WindowStore Interface
|
||||
|
||||
```typescript
|
||||
// BEFORE (anti-pattern):
|
||||
// import RedisMock from 'ioredis-mock';
|
||||
// const engine = new CorrelationEngine(new RedisMock());
|
||||
|
||||
// AFTER (correct):
|
||||
/**
 * Storage contract for per-tenant event windows.
 * Every operation is addressed by the (tenantId, key) pair.
 */
interface WindowStore {
  /** Appends `event` to the window; `ttlMs` is the lifetime given to a newly started window. */
  addEvent(tenantId: string, key: string, event: Alert, ttlMs: number): Promise<void>;
  /** Returns the window's events, or an empty array when the window is missing or expired. */
  getWindow(tenantId: string, key: string): Promise<Alert[]>;
  /** Removes the window and all of its events. */
  clearWindow(tenantId: string, key: string): Promise<void>;
}

/**
 * Map-backed {@link WindowStore} with no external dependencies.
 *
 * A window's expiry is fixed when its first event arrives (`now + ttlMs`);
 * later events join the window without extending it.
 */
class InMemoryWindowStore implements WindowStore {
  private readonly store = new Map<string, { events: Alert[]; expiresAt: number }>();

  /**
   * @param now Time source in epoch milliseconds. Defaults to `Date.now`;
   *            pass a controllable function to make expiry deterministic.
   */
  constructor(private readonly now: () => number = () => Date.now()) {}

  async addEvent(tenantId: string, key: string, event: Alert, ttlMs: number): Promise<void> {
    const storeKey = this.composeKey(tenantId, key);
    const current = this.now();
    const existing = this.store.get(storeKey);

    // Only a live window accepts more events. An expired one is replaced, otherwise
    // new events would be appended to an entry that getWindow never returns again.
    if (existing !== undefined && existing.expiresAt >= current) {
      existing.events.push(event);
      return;
    }
    this.store.set(storeKey, { events: [event], expiresAt: current + ttlMs });
  }

  async getWindow(tenantId: string, key: string): Promise<Alert[]> {
    const storeKey = this.composeKey(tenantId, key);
    const entry = this.store.get(storeKey);
    if (entry === undefined) return [];

    if (entry.expiresAt < this.now()) {
      // Evict on read so expired windows do not accumulate in the map.
      this.store.delete(storeKey);
      return [];
    }
    // Hand out a copy so callers cannot mutate the stored window.
    return [...entry.events];
  }

  async clearWindow(tenantId: string, key: string): Promise<void> {
    this.store.delete(this.composeKey(tenantId, key));
  }

  /**
   * Builds an unambiguous map key. Plain `${tenantId}:${key}` concatenation lets
   * ("a:b", "c") collide with ("a", "b:c"), which would mix two tenants' windows.
   */
  private composeKey(tenantId: string, key: string): string {
    return JSON.stringify([tenantId, key]);
  }
}
|
||||
|
||||
// Unit tests use InMemoryWindowStore — no Redis dependency
|
||||
// Integration tests use RedisWindowStore with Testcontainers
|
||||
```
|
||||
|
||||
#### Replace sinon.useFakeTimers with Clock Interface
|
||||
|
||||
```typescript
|
||||
// BEFORE (anti-pattern):
|
||||
// sinon.useFakeTimers(new Date('2026-03-01T00:00:00Z'));
|
||||
|
||||
// AFTER (correct):
|
||||
/**
 * Injectable time source, so time-dependent code does not read the global
 * clock directly. Values are epoch milliseconds.
 */
interface Clock {
  /** Current time in epoch milliseconds. */
  now(): number;
  /** Moves the clock forward by `ms` milliseconds. */
  advanceBy(ms: number): void;
}

/** Manually driven clock: time only changes when `advanceBy` is called. */
class FakeClock implements Clock {
  private current: number;

  /**
   * @param start Initial instant; defaults to the wall-clock time at construction.
   * @throws RangeError if `start` is an invalid Date.
   */
  constructor(start: Date = new Date()) {
    const startMs = start.getTime();
    // An invalid Date yields NaN, which would make every later now() return NaN.
    if (Number.isNaN(startMs)) {
      throw new RangeError('FakeClock start must be a valid Date');
    }
    this.current = startMs;
  }

  now(): number {
    return this.current;
  }

  /**
   * @throws RangeError if `ms` is negative, NaN or infinite. The clock is left
   *         unchanged in that case.
   */
  advanceBy(ms: number): void {
    if (!Number.isFinite(ms) || ms < 0) {
      throw new RangeError(`FakeClock can only advance by a finite, non-negative amount (got ${ms})`);
    }
    this.current += ms;
  }
}

/** Clock backed by `Date.now()`. */
class SystemClock implements Clock {
  now(): number {
    return Date.now();
  }

  /** Always throws: real time cannot be advanced on demand. */
  advanceBy(): never {
    throw new Error('Cannot advance system clock');
  }
}
|
||||
|
||||
// Inject into CorrelationEngine:
|
||||
const engine = new CorrelationEngine(new InMemoryWindowStore(), new FakeClock());
|
||||
```
|
||||
|
||||
### 11.3 Trace Context Propagation Tests
|
||||
|
||||
```typescript
|
||||
describe('Trace Context Propagation', () => {
|
||||
it('API Gateway passes trace_id to Lambda via X-Amzn-Trace-Id', async () => {});
|
||||
|
||||
it('Lambda propagates trace_id into SQS message attributes', async () => {
|
||||
// Verify SQS message has MessageAttribute 'traceparent' with W3C format
|
||||
const msg = await getLastSQSMessage(localstack, 'alert-queue');
|
||||
expect(msg.MessageAttributes.traceparent).toBeDefined();
|
||||
expect(msg.MessageAttributes.traceparent.StringValue).toMatch(
|
||||
/^00-[0-9a-f]{32}-[0-9a-f]{16}-0[01]$/
|
||||
);
|
||||
});
|
||||
|
||||
// Verifies that the correlation span is parented and belongs to the same trace
// as the webhook ingest span, i.e. the context survived the SQS hop.
it('ECS Correlation Engine extracts trace_id from SQS message', async () => {
  const spans = inMemoryExporter.getFinishedSpans();
  const correlationSpan = spans.find((s) => s.name === 'alert.correlation');
  const ingestSpan = spans.find((s) => s.name === 'webhook.ingest');

  // find() returns undefined when a span was never exported. Fail with a clear
  // message instead of a TypeError on the property accesses below.
  if (correlationSpan === undefined || ingestSpan === undefined) {
    throw new Error('expected both webhook.ingest and alert.correlation spans to be exported');
  }

  expect(correlationSpan.parentSpanId).toBeDefined();

  // Parent chain must trace back to the original ingest span: both spans have to
  // share one trace id.
  // NOTE(review): assumes these are OpenTelemetry ReadableSpan objects exposing
  // spanContext() — confirm against the exporter in use.
  expect(correlationSpan.spanContext().traceId).toBe(ingestSpan.spanContext().traceId);
});
|
||||
|
||||
it('end-to-end trace spans webhook → SQS → correlation → notification', async () => {
|
||||
// Fire a webhook, wait for Slack notification, verify all spans share trace_id
|
||||
const traceId = await fireWebhookAndGetTraceId();
|
||||
const spans = await getSpansByTraceId(traceId);
|
||||
const spanNames = spans.map(s => s.name);
|
||||
expect(spanNames).toContain('webhook.ingest');
|
||||
expect(spanNames).toContain('alert.normalize');
|
||||
expect(spanNames).toContain('alert.correlation');
|
||||
expect(spanNames).toContain('notification.slack');
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
### 11.4 HMAC Security Hardening
|
||||
|
||||
```typescript
|
||||
describe('HMAC Signature Validation (Hardened)', () => {
|
||||
it('uses crypto.timingSafeEqual, not === comparison', () => {
|
||||
// Inspect the source to verify timing-safe comparison
|
||||
const source = fs.readFileSync('src/ingestion/hmac.ts', 'utf8');
|
||||
expect(source).toContain('timingSafeEqual');
|
||||
expect(source).not.toMatch(/signature\s*===\s*/);
|
||||
});
|
||||
|
||||
it('handles case-insensitive header names (dd-webhook-signature vs DD-WEBHOOK-SIGNATURE)', async () => {
|
||||
const payload = makeAlertPayload('datadog');
|
||||
const sig = computeHMAC(payload, DATADOG_SECRET);
|
||||
|
||||
// Lowercase header
|
||||
const resp1 = await ingest(payload, { 'dd-webhook-signature': sig });
|
||||
expect(resp1.status).toBe(200);
|
||||
|
||||
// Uppercase header
|
||||
const resp2 = await ingest(payload, { 'DD-WEBHOOK-SIGNATURE': sig });
|
||||
expect(resp2.status).toBe(200);
|
||||
});
|
||||
|
||||
it('rejects completely missing signature header', async () => {
|
||||
const resp = await ingest(makeAlertPayload('datadog'), {});
|
||||
expect(resp.status).toBe(401);
|
||||
});
|
||||
|
||||
it('rejects empty signature header', async () => {
|
||||
const resp = await ingest(makeAlertPayload('datadog'), { 'dd-webhook-signature': '' });
|
||||
expect(resp.status).toBe(401);
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
### 11.5 SQS 256KB Payload Limit
|
||||
|
||||
```typescript
|
||||
describe('Large Payload Handling', () => {
|
||||
// Verifies that an oversized webhook payload is accepted and that the message
// placed on the queue fits under the 256KB limit.
it('compresses payloads >200KB before sending to SQS', async () => {
  const largePayload = makeLargeAlertPayload(300 * 1024); // 300KB
  const resp = await ingest(largePayload);
  expect(resp.status).toBe(200);

  const msg = await getLastSQSMessage(localstack, 'alert-queue');
  // Payload must be compressed or use S3 pointer.
  // The limit is in bytes. String.length counts UTF-16 code units and under-counts
  // any multi-byte character, so measure the encoded size instead.
  expect(Buffer.byteLength(msg.Body, 'utf8')).toBeLessThan(256 * 1024);
});
|
||||
|
||||
it('uses S3 pointer for payloads >256KB after compression', async () => {
|
||||
const hugePayload = makeLargeAlertPayload(500 * 1024); // 500KB
|
||||
const resp = await ingest(hugePayload);
|
||||
expect(resp.status).toBe(200);
|
||||
|
||||
const msg = await getLastSQSMessage(localstack, 'alert-queue');
|
||||
const body = JSON.parse(msg.Body);
|
||||
expect(body.s3Pointer).toBeDefined();
|
||||
expect(body.s3Pointer).toMatch(/^s3:\/\/dd0c-alert-overflow\//);
|
||||
});
|
||||
|
||||
it('strips unnecessary fields from Datadog payload before SQS', async () => {
|
||||
const payload = makeDatadogPayloadWithLargeTags(100); // 100 tags
|
||||
const resp = await ingest(payload);
|
||||
expect(resp.status).toBe(200);
|
||||
|
||||
const msg = await getLastSQSMessage(localstack, 'alert-queue');
|
||||
const normalized = JSON.parse(msg.Body);
|
||||
// Only essential fields should remain
|
||||
expect(normalized.tags.length).toBeLessThanOrEqual(20);
|
||||
});
|
||||
|
||||
it('rejects payloads >2MB at API Gateway level', async () => {
|
||||
const massive = makeLargeAlertPayload(3 * 1024 * 1024);
|
||||
const resp = await ingest(massive);
|
||||
expect(resp.status).toBe(413);
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
### 11.6 DLQ Backpressure & Replay
|
||||
|
||||
```typescript
|
||||
describe('DLQ Replay with Backpressure', () => {
|
||||
// Verifies that a full DLQ is replayed in fixed-size batches.
it('replays DLQ messages in batches of 100', async () => {
  await seedDLQ(10000); // 10K messages

  // No inter-batch delay here: 100 batches x 500ms of real waiting is ~50s, which
  // exceeds the default per-test timeout.
  // NOTE(review): assumes DLQReplayer accepts a zero delay — confirm.
  const replayer = new DLQReplayer({ batchSize: 100, delayBetweenBatchesMs: 0 });
  await replayer.start();

  // 10,000 messages / 100 per batch. `> 0` would also pass if the replayer
  // handled everything in one oversized batch.
  expect(replayer.batchesProcessed).toBe(100);
  expect(replayer.maxConcurrentMessages).toBeLessThanOrEqual(100);
});
|
||||
|
||||
it('pauses replay if correlation engine error rate exceeds 10%', async () => {
|
||||
await seedDLQ(1000);
|
||||
const replayer = new DLQReplayer({ batchSize: 100, errorThreshold: 0.1 });
|
||||
|
||||
// Simulate correlation engine returning errors
|
||||
mockCorrelationEngine.failRate = 0.15;
|
||||
await replayer.start();
|
||||
|
||||
expect(replayer.state).toBe('paused');
|
||||
expect(replayer.pauseReason).toContain('error rate exceeded');
|
||||
});
|
||||
|
||||
it('does not replay if circuit breaker is currently tripped', async () => {
|
||||
await seedDLQ(100);
|
||||
await tripCircuitBreaker();
|
||||
|
||||
const replayer = new DLQReplayer();
|
||||
await replayer.start();
|
||||
|
||||
expect(replayer.messagesReplayed).toBe(0);
|
||||
expect(replayer.state).toBe('blocked_by_circuit_breaker');
|
||||
});
|
||||
|
||||
it('tracks replay progress for resumability', async () => {
|
||||
await seedDLQ(500);
|
||||
const replayer = new DLQReplayer({ batchSize: 50 });
|
||||
|
||||
// Process 3 batches then stop
|
||||
await replayer.processNBatches(3);
|
||||
expect(replayer.checkpoint).toBe(150);
|
||||
|
||||
// Resume from checkpoint
|
||||
const replayer2 = new DLQReplayer({ resumeFrom: replayer.checkpoint });
|
||||
await replayer2.start();
|
||||
expect(replayer2.startedFrom).toBe(150);
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
### 11.7 Multi-Tenancy Isolation (DynamoDB)
|
||||
|
||||
```typescript
|
||||
describe('DynamoDB Tenant Isolation', () => {
|
||||
it('all DAO methods require tenantId parameter', () => {
|
||||
// Compile-time check: DAO interface has tenantId as first param
|
||||
const daoSource = fs.readFileSync('src/data/incident-dao.ts', 'utf8');
|
||||
const methods = extractPublicMethods(daoSource);
|
||||
for (const method of methods) {
|
||||
expect(method.params[0].name).toBe('tenantId');
|
||||
}
|
||||
});
|
||||
|
||||
// Verifies that listing incidents for one tenant returns only that tenant's rows.
it('query for tenant A returns zero results for tenant B data', async () => {
  const dao = new IncidentDAO(dynamoClient);
  await dao.create('tenant-A', makeIncident());
  await dao.create('tenant-B', makeIncident());

  const results = await dao.list('tenant-A');

  // every() is vacuously true for an empty array, so a list() that returns
  // nothing would otherwise pass this test.
  expect(results.length).toBeGreaterThan(0);
  expect(results.every((r) => r.tenantId === 'tenant-A')).toBe(true);
});
|
||||
|
||||
// Verifies that a created incident is stored under a tenant-prefixed partition key.
it('partition key always includes tenantId prefix', async () => {
  const dao = new IncidentDAO(dynamoClient);
  await dao.create('tenant-X', makeIncident());

  // Read raw DynamoDB items.
  const { Items = [] } = await dynamoClient.scan({ TableName: 'dd0c-alert-main' });

  // Scan order is unspecified and the table may hold items from other tests, so
  // search all items rather than trusting Items[0].
  // The `(#|$)` boundary stops `tenant-XY` from satisfying a `tenant-X` check.
  // `toStartWith` is avoided because it is not a core Jest matcher.
  const tenantKeys = Items
    .map((item) => item.PK?.S)
    .filter((pk): pk is string => typeof pk === 'string' && /^TENANT#tenant-X(#|$)/.test(pk));
  expect(tenantKeys.length).toBeGreaterThan(0);
});
|
||||
});
|
||||
```
|
||||
|
||||
### 11.8 Slack Circuit Breaker
|
||||
|
||||
```typescript
|
||||
describe('Slack Notification Circuit Breaker', () => {
|
||||
it('opens circuit after 10 consecutive 429s from Slack', async () => {
|
||||
const slackClient = new SlackClient({ circuitBreakerThreshold: 10 });
|
||||
for (let i = 0; i < 10; i++) {
|
||||
mockSlack.respondWith(429);
|
||||
await slackClient.send(makeMessage()).catch(() => {});
|
||||
}
|
||||
expect(slackClient.circuitState).toBe('open');
|
||||
});
|
||||
|
||||
it('queues notifications while circuit is open', async () => {
|
||||
slackClient.openCircuit();
|
||||
await slackClient.send(makeMessage());
|
||||
expect(slackClient.queuedMessages).toBe(1);
|
||||
});
|
||||
|
||||
it('half-opens circuit after 60 seconds', async () => {
|
||||
slackClient.openCircuit();
|
||||
clock.advanceBy(61000);
|
||||
expect(slackClient.circuitState).toBe('half-open');
|
||||
});
|
||||
|
||||
it('drains queue on successful half-open probe', async () => {
|
||||
slackClient.openCircuit();
|
||||
slackClient.queue(makeMessage());
|
||||
slackClient.queue(makeMessage());
|
||||
clock.advanceBy(61000);
|
||||
mockSlack.respondWith(200);
|
||||
await slackClient.probe();
|
||||
expect(slackClient.circuitState).toBe('closed');
|
||||
expect(slackClient.queuedMessages).toBe(0);
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
### 11.9 Updated Test Pyramid (Post-Review)
|
||||
|
||||
| Level | Original | Revised | Rationale |
|
||||
|-------|----------|---------|-----------|
|
||||
| Unit | 70% (~140) | 65% (~180) | More tests total, but integration share grows |
|
||||
| Integration | 20% (~40) | 25% (~70) | Dashboard API, tenant isolation, trace propagation |
|
||||
| E2E | 10% (~20) | 10% (~28) | Dashboard UI (Playwright), onboarding flow |
|
||||
|
||||
*End of P3 Review Remediation Addendum*
|
||||
|
||||
Reference in New Issue
Block a user