Implement review remediation + PLG analytics SDK

- All 6 test architectures patched with Section 11 addendums - P5 (cost) fully rewritten from 232 to ~600 lines - PLG brainstorm + party mode advisory board results - Analytics SDK v2 (PostHog Cloud, Zod strict, Lambda-safe) - Analytics tests v2 (safeParse, no timestamp, no PII) - Addresses all Gemini review findings across P1-P6
2026-03-01 01:42:49 +00:00
parent 2fe0ed856e
commit 03bfe931fc
9 changed files with 2950 additions and 85 deletions
--- a/products/03-alert-intelligence/test-architecture/test-architecture.md
+++ b/products/03-alert-intelligence/test-architecture/test-architecture.md
@@ -1409,3 +1409,459 @@ Before any release, these tests must pass:
 ---

 *End of dd0c/alert Test Architecture*
+
+---
+
+## 11. Review Remediation Addendum (Post-Gemini Review)
+
+### 11.1 Missing Epic Coverage
+
+#### Epic 6: Dashboard API
+
+```typescript
+describe('Dashboard API', () => {
+  describe('Authentication', () => {
+    it('returns 401 for missing Cognito JWT', async () => {});
+    it('returns 401 for expired JWT', async () => {});
+    it('returns 401 for JWT signed by wrong issuer', async () => {});
+    it('extracts tenantId from JWT claims', async () => {});
+  });
+
+  describe('Incident Listing (GET /v1/incidents)', () => {
+    it('returns paginated incidents for authenticated tenant', async () => {});
+    it('supports cursor-based pagination', async () => {});
+    it('filters by status (open, acknowledged, resolved)', async () => {});
+    it('filters by severity (critical, warning, info)', async () => {});
+    it('filters by time range (since, until)', async () => {});
+    it('returns empty array for tenant with no incidents', async () => {});
+  });
+
+  describe('Incident Detail (GET /v1/incidents/:id)', () => {
+    it('returns full incident with correlated alerts', async () => {});
+    it('returns 404 for incident belonging to different tenant', async () => {});
+    it('includes timeline of state transitions', async () => {});
+  });
+
+  describe('Analytics (GET /v1/analytics)', () => {
+    it('returns MTTR for last 7/30/90 days', async () => {});
+    it('returns alert volume by source', async () => {});
+    it('returns noise reduction percentage', async () => {});
+    it('scopes all analytics to authenticated tenant', async () => {});
+  });
+
+  describe('Tenant Isolation', () => {
+    it('tenant A cannot read tenant B incidents via API', async () => {});
+    it('tenant A cannot read tenant B analytics', async () => {});
+    it('all DynamoDB queries include tenantId partition key', async () => {});
+  });
+});
+```
+
+#### Epic 7: Dashboard UI (Playwright)
+
+```typescript
+// tests/e2e/ui/dashboard.spec.ts
+
+test('login redirects to Cognito hosted UI', async ({ page }) => {
+  await page.goto('/dashboard');
+  await expect(page).toHaveURL(/cognito/);
+});
+
+test('incident list renders with correct severity badges', async ({ page }) => {
+  await page.goto('/dashboard/incidents');
+  await expect(page.locator('[data-testid="incident-card"]')).toHaveCount(5);
+  await expect(page.locator('.severity-critical')).toBeVisible();
+});
+
+test('incident detail shows correlated alert timeline', async ({ page }) => {
+  await page.goto('/dashboard/incidents/inc-123');
+  await expect(page.locator('[data-testid="alert-timeline"]')).toBeVisible();
+  await expect.poll(() => page.locator('.timeline-event').count()).toBeGreaterThan(1);
+});
+
+test('MTTR chart renders with real data', async ({ page }) => {
+  await page.goto('/dashboard/analytics');
+  await expect(page.locator('[data-testid="mttr-chart"]')).toBeVisible();
+});
+
+test('noise reduction percentage displays correctly', async ({ page }) => {
+  await page.goto('/dashboard/analytics');
+  const noise = page.locator('[data-testid="noise-reduction"]');
+  await expect(noise).toContainText('%');
+});
+
+test('webhook setup wizard generates correct URL', async ({ page }) => {
+  await page.goto('/dashboard/settings/integrations');
+  await page.click('[data-testid="add-datadog"]');
+  const url = await page.locator('[data-testid="webhook-url"]').textContent();
+  expect(url).toMatch(/\/v1\/webhooks\/ingest\/.+/);
+});
+```
+
+#### Epic 9: Onboarding & PLG
+
+```typescript
+describe('Free Tier Enforcement', () => {
+  it('allows up to 10,000 alerts/month on free tier', async () => {});
+  it('returns 429 with upgrade prompt at 10,001st alert', async () => {});
+  it('resets counter on first of each month', async () => {});
+  it('purges alert data older than 7 days on free tier', async () => {});
+  it('retains alert data for 90 days on pro tier', async () => {});
+});
+
+describe('OAuth Signup', () => {
+  it('creates tenant record on first Cognito login', async () => {});
+  it('assigns free tier by default', async () => {});
+  it('generates unique webhook URL per tenant', async () => {});
+});
+
+describe('Stripe Integration', () => {
+  it('creates checkout session with correct pricing', async () => {});
+  it('upgrades tenant on checkout.session.completed webhook', async () => {});
+  it('downgrades tenant on subscription.deleted webhook', async () => {});
+  it('validates Stripe webhook signature', async () => {});
+});
+```
+
+#### Epic 5.3: Slack Feedback Endpoint
+
+```typescript
+describe('Slack Interactive Actions Endpoint', () => {
+  it('validates Slack request signature (HMAC-SHA256)', async () => {});
+  it('rejects request with invalid signature', async () => {});
+  it('handles "helpful" feedback — updates incident quality score', async () => {});
+  it('handles "noise" feedback — adds to suppression training data', async () => {});
+  it('handles "escalate" action — triggers PagerDuty/OpsGenie', async () => {});
+  it('updates original Slack message after action', async () => {});
+  it('scopes action to correct tenant', async () => {});
+});
+```
+
+#### Epic 1.4: S3 Raw Payload Archival
+
+```typescript
+describe('Raw Payload Archival', () => {
+  it('saves raw webhook payload to S3 asynchronously', async () => {});
+  it('S3 key includes tenantId, source, and timestamp', async () => {});
+  it('archival failure does not block alert processing', async () => {});
+  it('archived payload is retrievable for replay', async () => {});
+  it('S3 lifecycle policy deletes after retention period', async () => {});
+});
+```
+
+### 11.2 Anti-Pattern Fixes
+
+#### Replace ioredis-mock with WindowStore Interface
+
+```typescript
+// BEFORE (anti-pattern):
+// import RedisMock from 'ioredis-mock';
+// const engine = new CorrelationEngine(new RedisMock());
+
+// AFTER (correct):
+interface WindowStore {
+  addEvent(tenantId: string, key: string, event: Alert, ttlMs: number): Promise<void>;
+  getWindow(tenantId: string, key: string): Promise<Alert[]>;
+  clearWindow(tenantId: string, key: string): Promise<void>;
+}
+
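+class InMemoryWindowStore implements WindowStore {
+  private store = new Map<string, { events: Alert[]; expiresAt: number }>();
+
+  async addEvent(tenantId: string, key: string, event: Alert, ttlMs: number) {
+    const fullKey = `${tenantId}:${key}`;
+    const now = Date.now();
+    let entry = this.store.get(fullKey);
+    // Start a fresh window when none exists or the previous one has expired;
+    // otherwise events would pile onto an entry that getWindow() never returns.
+    if (!entry || entry.expiresAt < now) {
+      entry = { events: [], expiresAt: now + ttlMs };
+      this.store.set(fullKey, entry);
+    }
+    entry.events.push(event);
+  }
+
+  async getWindow(tenantId: string, key: string): Promise<Alert[]> {
+    const fullKey = `${tenantId}:${key}`;
+    const entry = this.store.get(fullKey);
+    if (!entry || entry.expiresAt < Date.now()) return [];
+    return entry.events;
+  }
+
+  async clearWindow(tenantId: string, key: string): Promise<void> {
+    this.store.delete(`${tenantId}:${key}`);
+  }
+}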
+
+// Unit tests use InMemoryWindowStore — no Redis dependency
+// Integration tests use RedisWindowStore with Testcontainers
+```
+
+#### Replace sinon.useFakeTimers with Clock Interface
+
+```typescript
+// BEFORE (anti-pattern):
+// sinon.useFakeTimers(new Date('2026-03-01T00:00:00Z'));
+
+// AFTER (correct):
+interface Clock {
+  now(): number;
+  advanceBy(ms: number): void;
+}
+
+class FakeClock implements Clock {
+  private current: number;
+  constructor(start: Date = new Date()) { this.current = start.getTime(); }
+  now() { return this.current; }
+  advanceBy(ms: number) { this.current += ms; }
+}
+
+class SystemClock implements Clock {
+  now() { return Date.now(); }
+  advanceBy() { throw new Error('Cannot advance system clock'); }
+}
+
+// Inject into CorrelationEngine:
+const engine = new CorrelationEngine(new InMemoryWindowStore(), new FakeClock());
+```
+
+### 11.3 Trace Context Propagation Tests
+
+```typescript
+describe('Trace Context Propagation', () => {
+  it('API Gateway passes trace_id to Lambda via X-Amzn-Trace-Id', async () => {});
+  
+  it('Lambda propagates trace_id into SQS message attributes', async () => {
+    // Verify SQS message has MessageAttribute 'traceparent' with W3C format
+    const msg = await getLastSQSMessage(localstack, 'alert-queue');
+    expect(msg.MessageAttributes.traceparent).toBeDefined();
+    expect(msg.MessageAttributes.traceparent.StringValue).toMatch(
+      /^00-[0-9a-f]{32}-[0-9a-f]{16}-0[01]$/
+    );
+  });
+
+  it('ECS Correlation Engine extracts trace_id from SQS message', async () => {
+    // Verify the correlation span has the correct parent from SQS
+    const spans = inMemoryExporter.getFinishedSpans();
+    const correlationSpan = spans.find(s => s.name === 'alert.correlation');
+    const ingestSpan = spans.find(s => s.name === 'webhook.ingest');
+    expect(correlationSpan.parentSpanId).toBeDefined();
+    // Parent chain must trace back to the original ingest span
+    expect(correlationSpan.spanContext().traceId).toBe(ingestSpan.spanContext().traceId);
+  });
+
+  it('end-to-end trace spans webhook → SQS → correlation → notification', async () => {
+    // Fire a webhook, wait for Slack notification, verify all spans share trace_id
+    const traceId = await fireWebhookAndGetTraceId();
+    const spans = await getSpansByTraceId(traceId);
+    const spanNames = spans.map(s => s.name);
+    expect(spanNames).toContain('webhook.ingest');
+    expect(spanNames).toContain('alert.normalize');
+    expect(spanNames).toContain('alert.correlation');
+    expect(spanNames).toContain('notification.slack');
+  });
+});
+```
+
+### 11.4 HMAC Security Hardening
+
+```typescript
+describe('HMAC Signature Validation (Hardened)', () => {
+  it('uses crypto.timingSafeEqual, not === comparison', () => {
+    // Inspect the source to verify timing-safe comparison
+    const source = fs.readFileSync('src/ingestion/hmac.ts', 'utf8');
+    expect(source).toContain('timingSafeEqual');
+    expect(source).not.toMatch(/signature\s*===\s*/);
+  });
+
+  it('handles case-insensitive header names (dd-webhook-signature vs DD-WEBHOOK-SIGNATURE)', async () => {
+    const payload = makeAlertPayload('datadog');
+    const sig = computeHMAC(payload, DATADOG_SECRET);
+    
+    // Lowercase header
+    const resp1 = await ingest(payload, { 'dd-webhook-signature': sig });
+    expect(resp1.status).toBe(200);
+    
+    // Uppercase header
+    const resp2 = await ingest(payload, { 'DD-WEBHOOK-SIGNATURE': sig });
+    expect(resp2.status).toBe(200);
+  });
+
+  it('rejects completely missing signature header', async () => {
+    const resp = await ingest(makeAlertPayload('datadog'), {});
+    expect(resp.status).toBe(401);
+  });
+
+  it('rejects empty signature header', async () => {
+    const resp = await ingest(makeAlertPayload('datadog'), { 'dd-webhook-signature': '' });
+    expect(resp.status).toBe(401);
+  });
+});
+```
+
+### 11.5 SQS 256KB Payload Limit
+
+```typescript
+describe('Large Payload Handling', () => {
+  it('compresses payloads >200KB before sending to SQS', async () => {
+    const largePayload = makeLargeAlertPayload(300 * 1024); // 300KB
+    const resp = await ingest(largePayload);
+    expect(resp.status).toBe(200);
+
+    const msg = await getLastSQSMessage(localstack, 'alert-queue');
+    // Payload must be compressed or use S3 pointer
+    expect(Buffer.byteLength(msg.Body, 'utf8')).toBeLessThan(256 * 1024);
+  });
+
+  it('uses S3 pointer for payloads >256KB after compression', async () => {
+    const hugePayload = makeLargeAlertPayload(500 * 1024); // 500KB
+    const resp = await ingest(hugePayload);
+    expect(resp.status).toBe(200);
+
+    const msg = await getLastSQSMessage(localstack, 'alert-queue');
+    const body = JSON.parse(msg.Body);
+    expect(body.s3Pointer).toBeDefined();
+    expect(body.s3Pointer).toMatch(/^s3:\/\/dd0c-alert-overflow\//);
+  });
+
+  it('strips unnecessary fields from Datadog payload before SQS', async () => {
+    const payload = makeDatadogPayloadWithLargeTags(100); // 100 tags
+    const resp = await ingest(payload);
+    expect(resp.status).toBe(200);
+
+    const msg = await getLastSQSMessage(localstack, 'alert-queue');
+    const normalized = JSON.parse(msg.Body);
+    // Only essential fields should remain
+    expect(normalized.tags.length).toBeLessThanOrEqual(20);
+  });
+
+  it('rejects payloads >2MB at API Gateway level', async () => {
+    const massive = makeLargeAlertPayload(3 * 1024 * 1024);
+    const resp = await ingest(massive);
+    expect(resp.status).toBe(413);
+  });
+});
+```
+
+### 11.6 DLQ Backpressure & Replay
+
+```typescript
+describe('DLQ Replay with Backpressure', () => {
+  it('replays DLQ messages in batches of 100', async () => {
+    await seedDLQ(10000); // 10K messages
+    const replayer = new DLQReplayer({ batchSize: 100, delayBetweenBatchesMs: 500 });
+    await replayer.start();
+
+    // Verify batched processing
+    expect(replayer.batchesProcessed).toBeGreaterThan(0);
+    expect(replayer.maxConcurrentMessages).toBeLessThanOrEqual(100);
+  });
+
+  it('pauses replay if correlation engine error rate exceeds 10%', async () => {
+    await seedDLQ(1000);
+    const replayer = new DLQReplayer({ batchSize: 100, errorThreshold: 0.1 });
+    
+    // Simulate correlation engine returning errors
+    mockCorrelationEngine.failRate = 0.15;
+    await replayer.start();
+
+    expect(replayer.state).toBe('paused');
+    expect(replayer.pauseReason).toContain('error rate exceeded');
+  });
+
+  it('does not replay if circuit breaker is currently tripped', async () => {
+    await seedDLQ(100);
+    await tripCircuitBreaker();
+
+    const replayer = new DLQReplayer();
+    await replayer.start();
+
+    expect(replayer.messagesReplayed).toBe(0);
+    expect(replayer.state).toBe('blocked_by_circuit_breaker');
+  });
+
+  it('tracks replay progress for resumability', async () => {
+    await seedDLQ(500);
+    const replayer = new DLQReplayer({ batchSize: 50 });
+    
+    // Process 3 batches then stop
+    await replayer.processNBatches(3);
+    expect(replayer.checkpoint).toBe(150);
+
+    // Resume from checkpoint
+    const replayer2 = new DLQReplayer({ resumeFrom: replayer.checkpoint });
+    await replayer2.start();
+    expect(replayer2.startedFrom).toBe(150);
+  });
+});
+```
+
+### 11.7 Multi-Tenancy Isolation (DynamoDB)
+
+```typescript
+describe('DynamoDB Tenant Isolation', () => {
+  it('all DAO methods require tenantId parameter', () => {
+    // Compile-time check: DAO interface has tenantId as first param
+    const daoSource = fs.readFileSync('src/data/incident-dao.ts', 'utf8');
+    const methods = extractPublicMethods(daoSource);
+    for (const method of methods) {
+      expect(method.params[0].name).toBe('tenantId');
+    }
+  });
+
+  it('query for tenant A returns zero results for tenant B data', async () => {
+    const dao = new IncidentDAO(dynamoClient);
+    await dao.create('tenant-A', makeIncident());
+    await dao.create('tenant-B', makeIncident());
+
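+    const results = await dao.list('tenant-A');
+    // Guard against a vacuous pass: every() is true for an empty array
+    expect(results.length).toBeGreaterThan(0);
+    expect(results.every(r => r.tenantId === 'tenant-A')).toBe(true);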
+  });
+
+  it('partition key always includes tenantId prefix', async () => {
+    const dao = new IncidentDAO(dynamoClient);
+    await dao.create('tenant-X', makeIncident());
+
+    // Read raw DynamoDB item
+    const item = await dynamoClient.scan({ TableName: 'dd0c-alert-main' });
+    expect(item.Items[0].PK.S).toMatch(/^TENANT#tenant-X/);
+  });
+});
+```
+
+### 11.8 Slack Circuit Breaker
+
+```typescript
+describe('Slack Notification Circuit Breaker', () => {
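+  // Shared fixtures: every test below uses slackClient, and the half-open tests use clock
+  let slackClient: SlackClient;
+  let clock: FakeClock;
+
+  beforeEach(() => {
+    clock = new FakeClock();
+    // TODO confirm: assumes SlackClient accepts an injected clock
+    slackClient = new SlackClient({ circuitBreakerThreshold: 10, clock });
+  });
+
+  it('opens circuit after 10 consecutive 429s from Slack', async () => {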
+    for (let i = 0; i < 10; i++) {
+      mockSlack.respondWith(429);
+      await slackClient.send(makeMessage()).catch(() => {});
+    }
+    expect(slackClient.circuitState).toBe('open');
+  });
+
+  it('queues notifications while circuit is open', async () => {
+    slackClient.openCircuit();
+    await slackClient.send(makeMessage());
+    expect(slackClient.queuedMessages).toBe(1);
+  });
+
+  it('half-opens circuit after 60 seconds', async () => {
+    slackClient.openCircuit();
+    clock.advanceBy(61000);
+    expect(slackClient.circuitState).toBe('half-open');
+  });
+
+  it('drains queue on successful half-open probe', async () => {
+    slackClient.openCircuit();
+    slackClient.queue(makeMessage());
+    slackClient.queue(makeMessage());
+    clock.advanceBy(61000);
+    mockSlack.respondWith(200);
+    await slackClient.probe();
+    expect(slackClient.circuitState).toBe('closed');
+    expect(slackClient.queuedMessages).toBe(0);
+  });
+});
+```
+
+### 11.9 Updated Test Pyramid (Post-Review)
+
+| Level | Original | Revised | Rationale |
+|-------|----------|---------|-----------|
+| Unit | 70% (~140) | 65% (~180) | More tests total, but integration share grows |
+| Integration | 20% (~40) | 25% (~70) | Dashboard API, tenant isolation, trace propagation |
+| E2E | 10% (~20) | 10% (~28) | Dashboard UI (Playwright), onboarding flow |
+
+*End of P3 Review Remediation Addendum*