From 03bfe931fcbb1835b6146ff2eb5dc190152930e1 Mon Sep 17 00:00:00 2001 From: Max Mayfield Date: Sun, 1 Mar 2026 01:42:49 +0000 Subject: [PATCH] Implement review remediation + PLG analytics SDK - All 6 test architectures patched with Section 11 addendums - P5 (cost) fully rewritten from 232 to ~600 lines - PLG brainstorm + party mode advisory board results - Analytics SDK v2 (PostHog Cloud, Zod strict, Lambda-safe) - Analytics tests v2 (safeParse, no , no timestamp, no PII) - Addresses all Gemini review findings across P1-P6 --- .../01-llm-cost-router/src/analytics/index.ts | 138 ++++ .../test-architecture/test-architecture.md | 312 +++++++++ .../tests/analytics/analytics.spec.ts | 204 ++++++ .../test-architecture/test-architecture.md | 367 ++++++++++ .../test-architecture/test-architecture.md | 456 ++++++++++++ .../test-architecture/test-architecture.md | 158 +++++ .../test-architecture/test-architecture.md | 650 +++++++++++++++--- .../test-architecture/test-architecture.md | 524 ++++++++++++++ products/plg-instrumentation-brainstorm.md | 226 ++++++ 9 files changed, 2950 insertions(+), 85 deletions(-) create mode 100644 products/01-llm-cost-router/src/analytics/index.ts create mode 100644 products/01-llm-cost-router/tests/analytics/analytics.spec.ts create mode 100644 products/plg-instrumentation-brainstorm.md diff --git a/products/01-llm-cost-router/src/analytics/index.ts b/products/01-llm-cost-router/src/analytics/index.ts new file mode 100644 index 0000000..a1b6700 --- /dev/null +++ b/products/01-llm-cost-router/src/analytics/index.ts @@ -0,0 +1,138 @@ +import { PostHog } from 'posthog-node'; +import { z } from 'zod'; + +// --------------------------------------------------------- +// 1. 
Unified Event Taxonomy (Zod Enforced, Strictly Typed) +// --------------------------------------------------------- + +export enum EventName { + SignupCompleted = 'account.signup.completed', + FirstDollarSaved = 'routing.savings.first_dollar', + UpgradeCompleted = 'billing.upgrade.completed', +} + +// Per-event property schemas — no z.any() PII loophole +const SignupProperties = z.object({ + method: z.enum(['github_sso', 'google_sso', 'email']), +}).strict(); + +const ActivationProperties = z.object({ + savings_amount: z.number().nonnegative(), +}).strict(); + +const UpgradeProperties = z.object({ + plan: z.enum(['pro', 'business']), + mrr_increase: z.number().nonnegative(), +}).strict(); + +const PropertiesMap = { + [EventName.SignupCompleted]: SignupProperties, + [EventName.FirstDollarSaved]: ActivationProperties, + [EventName.UpgradeCompleted]: UpgradeProperties, +} as const; + +export const EventSchema = z.object({ + name: z.nativeEnum(EventName), + tenant_id: z.string().min(1, 'tenant_id is required'), + product: z.literal('route'), + properties: z.record(z.unknown()).optional().default({}), +}); + +export type AnalyticsEvent = z.infer; + +// --------------------------------------------------------- +// 2. NoOp Client for local/test environments +// --------------------------------------------------------- + +class NoOpPostHog { + capture() {} + identify() {} + async flushAsync() {} + async shutdown() {} +} + +// --------------------------------------------------------- +// 3. Analytics SDK (PostHog Cloud, Lambda-Safe) +// --------------------------------------------------------- + +export class Analytics { + private client: PostHog | NoOpPostHog; + public readonly isSessionReplayEnabled = false; + + constructor(client?: PostHog) { + if (client) { + this.client = client; + } else { + const apiKey = process.env.POSTHOG_API_KEY; + if (!apiKey) { + // No key = NoOp. Never silently send to a mock key. 
+ console.warn('[Analytics] POSTHOG_API_KEY not set — using NoOp client'); + this.client = new NoOpPostHog(); + } else { + this.client = new PostHog(apiKey, { + host: 'https://us.i.posthog.com', + flushAt: 20, // Batch up to 20 events + flushInterval: 5000, // Or flush every 5s + }); + } + } + } + + /** + * Identify a tenant once (on signup). Sets $set properties. + * Call this instead of embedding $set in every track() call. + */ + public identify(tenantId: string, properties?: Record): void { + this.client.identify({ + distinctId: tenantId, + properties: { tenant_id: tenantId, ...properties }, + }); + } + + /** + * Track an event. Uses safeParse — never crashes the caller. + * Does NOT flush. Call flush() at Lambda teardown. + */ + public track(event: AnalyticsEvent): boolean { + // 1. Base schema validation + const baseResult = EventSchema.safeParse(event); + if (!baseResult.success) { + console.error('[Analytics] Invalid event (base):', baseResult.error.format()); + return false; + } + + // 2. Per-event property validation (strict, no PII loophole) + const propSchema = PropertiesMap[baseResult.data.name]; + if (propSchema) { + const propResult = propSchema.safeParse(baseResult.data.properties); + if (!propResult.success) { + console.error('[Analytics] Invalid properties:', propResult.error.format()); + return false; + } + } + + // 3. Capture — let PostHog assign the timestamp (avoids clock skew) + this.client.capture({ + distinctId: baseResult.data.tenant_id, + event: baseResult.data.name, + properties: { + product: baseResult.data.product, + ...baseResult.data.properties, + }, + }); + + return true; + } + + /** + * Flush all queued events. Call once at Lambda teardown + * (e.g., in a Middy middleware or handler's finally block). 
+ */ + public async flush(): Promise { + await this.client.flushAsync(); + } + + public async shutdown(): Promise { + await this.client.shutdown(); + } +} diff --git a/products/01-llm-cost-router/test-architecture/test-architecture.md b/products/01-llm-cost-router/test-architecture/test-architecture.md index d347181..37b9b05 100644 --- a/products/01-llm-cost-router/test-architecture/test-architecture.md +++ b/products/01-llm-cost-router/test-architecture/test-architecture.md @@ -2239,3 +2239,315 @@ Before writing any new function, ask: *Test Architecture document generated for dd0c/route V1 MVP.* *Total estimated test count at V1 launch: ~400 tests.* *Target CI runtime: <8 minutes (unit + integration), <15 minutes (full pipeline with E2E).* + +--- + +## 11. Review Remediation Addendum (Post-Gemini Review) + +### 11.1 Replace MockKeyCache/MockKeyStore with Testcontainers + +```rust +// BEFORE (anti-pattern — mocks hide real latency): +// let cache = MockKeyCache::new(); +// let store = MockKeyStore::new(); + +// AFTER: Use Testcontainers for hot-path auth tests +#[tokio::test] +async fn auth_middleware_validates_key_under_5ms_with_real_redis() { + let redis = TestcontainersRedis::start().await; + let pg = TestcontainersPostgres::start().await; + let cache = RedisKeyCache::new(redis.connection_string()); + let store = PgKeyStore::new(pg.connection_string()); + + let start = Instant::now(); + let result = auth_middleware(&cache, &store, "sk-valid-key").await; + assert!(start.elapsed() < Duration::from_millis(5)); + assert!(result.is_ok()); +} + +#[tokio::test] +async fn auth_middleware_handles_redis_connection_pool_exhaustion() { + // Exhaust all connections, verify fallback to PG + let redis = TestcontainersRedis::start().await; + let cache = RedisKeyCache::with_pool_size(redis.connection_string(), 1); + // Hold the single connection + let _held = cache.raw_connection().await; + // Auth must still work via PG fallback + let result = auth_middleware(&cache, &pg_store, 
"sk-valid-key").await; + assert!(result.is_ok()); +} +``` + +### 11.2 Fix Encryption Test (Decrypt, Don't Just Assert Non-Plaintext) + +```rust +// BEFORE (anti-pattern — passes if stored as random garbage): +// assert_ne!(stored.encrypted_key, b"sk-plaintext-key"); + +// AFTER: Full round-trip encryption test +#[tokio::test] +async fn provider_credential_encrypts_and_decrypts_correctly() { + let kms = LocalStackKMS::start().await; + let key_id = kms.create_key().await; + let store = CredentialStore::new(pg.pool(), kms.client(), key_id); + + let original = "sk-live-abc123xyz"; + store.save_credential("org-1", "openai", original).await.unwrap(); + + // Read raw from DB — must NOT be plaintext + let raw = pg.query_raw("SELECT encrypted_key FROM credentials LIMIT 1").await; + assert!(!String::from_utf8_lossy(&raw).contains(original)); + + // Decrypt via the store — must match original + let decrypted = store.get_credential("org-1", "openai").await.unwrap(); + assert_eq!(decrypted, original); +} + +#[tokio::test] +async fn kms_key_rotation_old_deks_still_decrypt_old_credentials() { + let kms = LocalStackKMS::start().await; + let key_id = kms.create_key().await; + let store = CredentialStore::new(pg.pool(), kms.client(), key_id); + + // Save with original key + store.save_credential("org-1", "openai", "sk-old").await.unwrap(); + + // Rotate KMS key + kms.rotate_key(key_id).await; + + // Old credential must still decrypt + let decrypted = store.get_credential("org-1", "openai").await.unwrap(); + assert_eq!(decrypted, "sk-old"); + + // New credential uses new DEK + store.save_credential("org-1", "anthropic", "sk-new").await.unwrap(); + let decrypted_new = store.get_credential("org-1", "anthropic").await.unwrap(); + assert_eq!(decrypted_new, "sk-new"); +} +``` + +### 11.3 Slow Dependency Chaos Test + +```rust +#[tokio::test] +async fn chaos_slow_db_does_not_block_proxy_hot_path() { + let stack = E2EStack::start().await; + + // Inject 5-second network delay on TimescaleDB 
port via tc netem + stack.inject_latency("timescaledb", Duration::from_secs(5)).await; + + // Proxy must still route requests within SLA + let start = Instant::now(); + let resp = stack.proxy() + .post("/v1/chat/completions") + .header("Authorization", "Bearer sk-valid") + .json(&chat_request()) + .send().await; + let latency = start.elapsed(); + + assert_eq!(resp.status(), 200); + // Telemetry is dropped, but routing works + assert!(latency < Duration::from_millis(50), + "Proxy blocked by slow DB: {:?}", latency); +} + +#[tokio::test] +async fn chaos_slow_redis_falls_back_to_pg_for_auth() { + let stack = E2EStack::start().await; + stack.inject_latency("redis", Duration::from_secs(3)).await; + + let resp = stack.proxy() + .post("/v1/chat/completions") + .header("Authorization", "Bearer sk-valid") + .json(&chat_request()) + .send().await; + assert_eq!(resp.status(), 200); +} +``` + +### 11.4 IDOR / Cross-Tenant Test Suite + +```rust +// tests/integration/idor_test.rs + +#[tokio::test] +async fn idor_org_a_cannot_read_org_b_routing_rules() { + let stack = E2EStack::start().await; + let org_a_token = stack.create_org_and_token("org-a").await; + let org_b_token = stack.create_org_and_token("org-b").await; + + // Org B creates a routing rule + let rule = stack.api() + .post("/v1/routing-rules") + .bearer_auth(&org_b_token) + .json(&json!({ "name": "secret-rule", "model": "gpt-4" })) + .send().await.json::().await; + + // Org A tries to read it + let resp = stack.api() + .get(&format!("/v1/routing-rules/{}", rule.id)) + .bearer_auth(&org_a_token) + .send().await; + assert_eq!(resp.status(), 404); // Not 403 — don't leak existence +} + +#[tokio::test] +async fn idor_org_a_cannot_read_org_b_api_keys() { + // Same pattern — create key as org B, attempt read as org A +} + +#[tokio::test] +async fn idor_org_a_cannot_read_org_b_telemetry() {} + +#[tokio::test] +async fn idor_org_a_cannot_mutate_org_b_routing_rules() {} +``` + +### 11.5 SSE Connection Drop / Billing Leak Test + 
+```rust +#[tokio::test] +async fn sse_client_disconnect_aborts_upstream_provider_request() { + let stack = E2EStack::start().await; + let mock_provider = stack.mock_provider(); + + // Configure provider to stream slowly (1 token/sec for 60 tokens) + mock_provider.configure_slow_stream(60, Duration::from_secs(1)); + + // Start streaming request + let mut stream = stack.proxy() + .post("/v1/chat/completions") + .json(&json!({ "stream": true, "model": "gpt-4" })) + .send().await + .bytes_stream(); + + // Read 5 tokens then drop the connection + for _ in 0..5 { + stream.next().await; + } + drop(stream); + + // Wait briefly for cleanup + tokio::time::sleep(Duration::from_millis(500)).await; + + // Provider connection must be aborted — not still streaming + assert_eq!(mock_provider.active_connections(), 0); + + // Billing: customer should only be charged for 5 tokens, not 60 + let usage = stack.get_last_usage_record().await; + assert!(usage.completion_tokens <= 10); // Some buffer for in-flight +} +``` + +### 11.6 Concurrent Circuit Breaker Race Condition + +```rust +#[tokio::test] +async fn circuit_breaker_handles_50_concurrent_failures_cleanly() { + let redis = TestcontainersRedis::start().await; + let breaker = RedisCircuitBreaker::new(redis.connection_string(), "openai", 10); + + let mut handles = vec![]; + for _ in 0..50 { + let b = breaker.clone(); + handles.push(tokio::spawn(async move { + b.record_failure().await; + })); + } + futures::future::join_all(handles).await; + + // Breaker must be open — no race condition leaving it closed + assert_eq!(breaker.state().await, CircuitState::Open); + // Failure count must be exactly 50 (atomic increments) + assert_eq!(breaker.failure_count().await, 50); +} +``` + +### 11.7 Trace Context Propagation + +```rust +#[tokio::test] +async fn otel_trace_propagates_from_client_through_proxy_to_provider() { + let stack = E2EStack::start().await; + let tracer = stack.in_memory_tracer(); + + let resp = stack.proxy() + 
.post("/v1/chat/completions") + .header("traceparent", "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01") + .json(&chat_request()) + .send().await; + + let spans = tracer.finished_spans(); + let proxy_span = spans.iter().find(|s| s.name == "proxy.route").unwrap(); + + // Proxy span must be child of the incoming trace + assert_eq!(proxy_span.trace_id, "4bf92f3577b34da6a3ce929d0e0e4736"); + + // Provider request must carry the same trace_id + let provider_req = stack.mock_provider().last_request(); + assert!(provider_req.headers["traceparent"].contains("4bf92f3577b34da6a3ce929d0e0e4736")); +} +``` + +### 11.8 Flag Provider Fallback Test + +```rust +#[test] +fn flag_provider_unreachable_falls_back_to_safe_default() { + // Simulate missing/corrupt flag config file + let provider = JsonFileProvider::new("/nonexistent/flags.json"); + let result = provider.evaluate("enable_new_router", false); + // Must return the safe default (false), not panic or error + assert_eq!(result, false); +} + +#[test] +fn flag_provider_malformed_json_falls_back_to_safe_default() { + let provider = JsonFileProvider::from_string("{ invalid json }}}"); + let result = provider.evaluate("enable_new_router", false); + assert_eq!(result, false); +} +``` + +### 11.9 24-Hour Soak Test Spec + +```rust +// tests/soak/long_running_latency.rs +// Run manually: cargo test --test soak -- --ignored + +#[tokio::test] +#[ignore] // Only run in nightly CI +async fn soak_24h_proxy_latency_stays_under_5ms_p99() { + // k6 config: 10 RPS sustained for 24 hours + // Assert: p99 < 5ms, no memory growth > 50MB, no connection leaks + // This catches memory fragmentation and connection pool exhaustion +} +``` + +### 11.10 Panic Mode Authorization + +```rust +#[tokio::test] +async fn panic_mode_requires_owner_role() { + let stack = E2EStack::start().await; + let viewer_token = stack.create_token_with_role("org-1", Role::Viewer).await; + + let resp = stack.api() + .post("/admin/panic") + .bearer_auth(&viewer_token) 
+ .send().await; + assert_eq!(resp.status(), 403); +} + +#[tokio::test] +async fn panic_mode_allowed_for_owner_role() { + let owner_token = stack.create_token_with_role("org-1", Role::Owner).await; + let resp = stack.api() + .post("/admin/panic") + .bearer_auth(&owner_token) + .send().await; + assert_eq!(resp.status(), 200); +} +``` + +*End of P1 Review Remediation Addendum* diff --git a/products/01-llm-cost-router/tests/analytics/analytics.spec.ts b/products/01-llm-cost-router/tests/analytics/analytics.spec.ts new file mode 100644 index 0000000..798fdb0 --- /dev/null +++ b/products/01-llm-cost-router/tests/analytics/analytics.spec.ts @@ -0,0 +1,204 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { Analytics, EventSchema, EventName } from '../../src/analytics'; +import { PostHog } from 'posthog-node'; + +vi.mock('posthog-node'); + +describe('Analytics SDK (PostHog Cloud — v2 Post-Review)', () => { + let analytics: Analytics; + let mockPostHog: vi.Mocked; + + beforeEach(() => { + vi.clearAllMocks(); + mockPostHog = new PostHog('phc_test_key', { host: 'https://us.i.posthog.com' }) as any; + analytics = new Analytics(mockPostHog); + }); + + // ── Schema Validation (Zod) ────────────────────────────── + + describe('Event Taxonomy Validation', () => { + it('accepts valid account.signup.completed event', () => { + const event = { + name: EventName.SignupCompleted, + tenant_id: 'tenant-123', + product: 'route' as const, + properties: { method: 'github_sso' }, + }; + expect(() => EventSchema.parse(event)).not.toThrow(); + }); + + it('rejects events missing tenant_id', () => { + const event = { + name: EventName.SignupCompleted, + product: 'route', + properties: { method: 'email' }, + }; + expect(() => EventSchema.parse(event as any)).toThrow(/tenant_id/); + }); + + it('accepts valid activation event', () => { + const event = { + name: EventName.FirstDollarSaved, + tenant_id: 'tenant-123', + product: 'route' as const, + properties: { 
savings_amount: 1.50 }, + }; + expect(() => EventSchema.parse(event)).not.toThrow(); + }); + + it('accepts valid upgrade event', () => { + const event = { + name: EventName.UpgradeCompleted, + tenant_id: 'tenant-123', + product: 'route' as const, + properties: { plan: 'pro', mrr_increase: 49 }, + }; + expect(() => EventSchema.parse(event)).not.toThrow(); + }); + }); + + // ── track() Behavior ───────────────────────────────────── + + describe('track()', () => { + it('captures valid events via PostHog client', () => { + const result = analytics.track({ + name: EventName.SignupCompleted, + tenant_id: 'tenant-123', + product: 'route', + properties: { method: 'email' }, + }); + + expect(result).toBe(true); + expect(mockPostHog.capture).toHaveBeenCalledWith( + expect.objectContaining({ + distinctId: 'tenant-123', + event: 'account.signup.completed', + properties: expect.objectContaining({ + product: 'route', + method: 'email', + }), + }) + ); + }); + + it('does NOT include $set in track calls (use identify instead)', () => { + analytics.track({ + name: EventName.SignupCompleted, + tenant_id: 'tenant-123', + product: 'route', + properties: { method: 'github_sso' }, + }); + + const captureCall = mockPostHog.capture.mock.calls[0][0]; + expect(captureCall.properties).not.toHaveProperty('$set'); + }); + + it('does NOT pass timestamp (let PostHog handle it to avoid clock skew)', () => { + analytics.track({ + name: EventName.SignupCompleted, + tenant_id: 'tenant-123', + product: 'route', + properties: { method: 'email' }, + }); + + const captureCall = mockPostHog.capture.mock.calls[0][0]; + expect(captureCall).not.toHaveProperty('timestamp'); + }); + + it('returns false and does NOT call PostHog if base validation fails', () => { + const result = analytics.track({ + name: 'invalid.event' as any, + tenant_id: 'tenant-123', + product: 'route', + }); + + expect(result).toBe(false); + expect(mockPostHog.capture).not.toHaveBeenCalled(); + }); + + it('returns false if per-event 
property validation fails (strict schema)', () => { + const result = analytics.track({ + name: EventName.SignupCompleted, + tenant_id: 'tenant-123', + product: 'route', + properties: { method: 'invalid_method' }, // Not in enum + }); + + expect(result).toBe(false); + expect(mockPostHog.capture).not.toHaveBeenCalled(); + }); + + it('rejects unknown properties (strict mode — no PII loophole)', () => { + const result = analytics.track({ + name: EventName.SignupCompleted, + tenant_id: 'tenant-123', + product: 'route', + properties: { method: 'email', email: 'user@example.com' }, // PII leak attempt + }); + + expect(result).toBe(false); + expect(mockPostHog.capture).not.toHaveBeenCalled(); + }); + + it('does NOT flush after each track call (Lambda batching)', () => { + analytics.track({ + name: EventName.SignupCompleted, + tenant_id: 'tenant-123', + product: 'route', + properties: { method: 'email' }, + }); + + expect(mockPostHog.flushAsync).not.toHaveBeenCalled(); + }); + }); + + // ── identify() ─────────────────────────────────────────── + + describe('identify()', () => { + it('calls PostHog identify with tenant_id as distinctId', () => { + analytics.identify('tenant-123', { company: 'Acme' }); + + expect(mockPostHog.identify).toHaveBeenCalledWith( + expect.objectContaining({ + distinctId: 'tenant-123', + properties: expect.objectContaining({ + tenant_id: 'tenant-123', + company: 'Acme', + }), + }) + ); + }); + }); + + // ── flush() ────────────────────────────────────────────── + + describe('flush()', () => { + it('calls flushAsync on the PostHog client', async () => { + await analytics.flush(); + expect(mockPostHog.flushAsync).toHaveBeenCalledTimes(1); + }); + }); + + // ── NoOp Client ────────────────────────────────────────── + + describe('NoOp Client (missing API key)', () => { + it('does not throw when tracking without API key', () => { + const noopAnalytics = new Analytics(); // No client, no env var + const result = noopAnalytics.track({ + name: 
EventName.SignupCompleted, + tenant_id: 'tenant-123', + product: 'route', + properties: { method: 'email' }, + }); + expect(result).toBe(true); // NoOp accepts everything silently + }); + }); + + // ── Session Replay ─────────────────────────────────────── + + describe('Security', () => { + it('session replay is disabled', () => { + expect(analytics.isSessionReplayEnabled).toBe(false); + }); + }); +}); diff --git a/products/02-iac-drift-detection/test-architecture/test-architecture.md b/products/02-iac-drift-detection/test-architecture/test-architecture.md index 28e58f5..113245d 100644 --- a/products/02-iac-drift-detection/test-architecture/test-architecture.md +++ b/products/02-iac-drift-detection/test-architecture/test-architecture.md @@ -1727,3 +1727,370 @@ Before any code ships to production, these tests must be green: --- *Document complete. Total estimated test count at V1 launch: ~500 tests. Target by month 3: ~1,000 tests.* + +--- + +## 11. Review Remediation Addendum (Post-Gemini Review) + +### 11.1 Missing Epic Coverage + +#### Epic 6: Dashboard UI (React Testing Library + Playwright) + +```typescript +// tests/ui/components/DiffViewer.test.tsx +describe('DiffViewer Component', () => { + it('renders added lines in green', () => {}); + it('renders removed lines in red', () => {}); + it('renders unchanged lines in default color', () => {}); + it('collapses large diffs with "Show more" toggle', () => {}); + it('highlights HCL syntax in diff blocks', () => {}); + it('shows resource type icon next to each drift item', () => {}); +}); + +describe('StackOverview Component', () => { + it('renders drift count badge per stack', () => {}); + it('sorts stacks by drift severity (critical first)', () => {}); + it('shows last scan timestamp', () => {}); + it('shows agent health indicator (green/yellow/red)', () => {}); +}); + +// tests/e2e/ui/dashboard.spec.ts (Playwright) +test('OAuth login redirects to Cognito and back', async ({ page }) => { + await 
page.goto('/dashboard'); + await expect(page).toHaveURL(/cognito/); +}); + +test('stack list renders with drift counts', async ({ page }) => { + await page.goto('/dashboard/stacks'); + await expect(page.locator('[data-testid="stack-card"]')).toHaveCountGreaterThan(0); +}); + +test('diff viewer renders inline diff for Terraform resource', async ({ page }) => { + await page.goto('/dashboard/stacks/stack-1/drifts/drift-1'); + await expect(page.locator('[data-testid="diff-viewer"]')).toBeVisible(); + await expect(page.locator('.diff-added')).toHaveCountGreaterThan(0); +}); + +test('revert button triggers confirmation modal', async ({ page }) => { + await page.goto('/dashboard/stacks/stack-1/drifts/drift-1'); + await page.click('[data-testid="revert-btn"]'); + await expect(page.locator('[data-testid="confirm-modal"]')).toBeVisible(); +}); +``` + +#### Epic 9: Onboarding & PLG (Stripe + drift init) + +```go +// pkg/onboarding/stripe_test.go + +func TestStripeWebhookCheckoutCompleted_UpgradesTenant(t *testing.T) {} +func TestStripeWebhookSubscriptionDeleted_DowngradesTenant(t *testing.T) {} +func TestStripeWebhookInvalidSignature_Returns401(t *testing.T) {} +func TestStripeWebhookReplayedEvent_IsIdempotent(t *testing.T) {} + +// pkg/agent/init_test.go + +func TestDriftInit_DetectsTerraformInCurrentDir(t *testing.T) {} +func TestDriftInit_DetectsCloudFormationInCurrentDir(t *testing.T) {} +func TestDriftInit_DetectsPulumiInCurrentDir(t *testing.T) {} +func TestDriftInit_GeneratesValidYAMLConfig(t *testing.T) {} +func TestDriftInit_HandlesWindowsPaths(t *testing.T) {} +func TestDriftInit_HandlesMacPaths(t *testing.T) {} +func TestDriftInit_HandlesLinuxPaths(t *testing.T) {} +func TestDriftInit_FailsGracefullyOnEmptyDir(t *testing.T) {} +``` + +#### Epic 8: Infrastructure (Terratest) + +```go +// tests/infra/terraform_test.go + +func TestTerraformPlan_CreatesExpectedResources(t *testing.T) { + terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ + 
TerraformDir: "../../infra/terraform", + }) + defer terraform.Destroy(t, terraformOptions) + terraform.InitAndPlan(t, terraformOptions) +} + +func TestTerraformApply_SQSFIFOQueueCreated(t *testing.T) {} +func TestTerraformApply_RDSInstanceCreated(t *testing.T) {} +func TestTerraformApply_IAMRolesHaveLeastPrivilege(t *testing.T) { + // Verify no IAM policy has Action: "*" +} +func TestTerraformApply_VPCSecurityGroupsRestrictIngress(t *testing.T) {} +``` + +#### Epic 2: mTLS Certificate Lifecycle + +```go +// pkg/agent/mtls_test.go + +func TestMTLS_CertificateGeneration_ValidX509(t *testing.T) {} +func TestMTLS_CertificateExpiration_AgentRejectsExpiredCert(t *testing.T) {} +func TestMTLS_CertificateRotation_NewCertAcceptedMidConnection(t *testing.T) {} +func TestMTLS_CertificateRevocation_RevokedCertRejected(t *testing.T) {} +func TestMTLS_SelfSignedCert_RejectedBySaaS(t *testing.T) {} +func TestMTLS_CertificateChain_IntermediateCAValidated(t *testing.T) {} +``` + +### 11.2 Add t.Parallel() to Table-Driven Tests + +```go +// BEFORE (sequential — wastes CI time): +func TestSecretScrubber(t *testing.T) { + tests := []struct{ name, input, expected string }{...} + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // runs sequentially + }) + } +} + +// AFTER (parallel): +func TestSecretScrubber(t *testing.T) { + t.Parallel() + tests := []struct{ name, input, expected string }{...} + for _, tt := range tests { + tt := tt // capture range variable + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + // runs in parallel + }) + } +} +``` + +### 11.3 Dynamic Resource Naming for LocalStack + +```go +// BEFORE (shared state — flaky): +// bucket := "drift-reports" + +// AFTER (per-test isolation): +func uniqueBucket(t *testing.T) string { + return fmt.Sprintf("drift-reports-%s-%d", t.Name(), time.Now().UnixNano()) +} + +func TestDriftReportUpload(t *testing.T) { + t.Parallel() + bucket := uniqueBucket(t) + s3Client.CreateBucket(ctx, 
&s3.CreateBucketInput{Bucket: &bucket}) + // Test uses isolated bucket — no cross-test contamination +} +``` + +### 11.4 Distributed Tracing Cross-Boundary Tests + +```go +// tests/integration/trace_propagation_test.go + +func TestTraceContext_AgentToSaaS_SpanParentChain(t *testing.T) { + // Agent generates drift_scan span with trace_id + // POST /v1/drift-reports carries traceparent header + // SaaS Event Processor creates child span + // Verify parent-child relationship across HTTP boundary + + exporter := tracetest.NewInMemoryExporter() + + // Fire drift report with traceparent + traceID := "4bf92f3577b34da6a3ce929d0e0e4736" + resp := postDriftReport(t, stack, traceID) + assert.Equal(t, 200, resp.StatusCode) + + spans := exporter.GetSpans() + eventProcessorSpan := findSpan(spans, "drift_report.process") + assert.Equal(t, traceID, eventProcessorSpan.SpanContext().TraceID().String()) +} + +func TestTraceContext_SQSBoundary_PreservesTraceID(t *testing.T) { + // Verify SQS message attributes contain traceparent + // Verify consumer extracts and continues the trace +} + +func TestTraceContext_AgentScan_CreatesParentSpan(t *testing.T) { + // Verify agent drift_scan span has correct attributes: + // drift.stack_id, drift.resource_count, drift.duration_ms +} +``` + +### 11.5 Backward Compatibility Serialization (Elastic Schema) + +```go +// tests/schema/backward_compat_test.go + +func TestOldAgent_ParsesNewDynamoDBItem_WithV2Attributes(t *testing.T) { + // Simulate V2 DynamoDB item with new _v2 fields + item := map[string]types.AttributeValue{ + "PK": &types.AttributeValueMemberS{Value: "STACK#123"}, + "drift_score": &types.AttributeValueMemberN{Value: "85"}, + "drift_score_v2": &types.AttributeValueMemberN{Value: "92"}, // New field + "remediation_v2": &types.AttributeValueMemberS{Value: "auto"}, // New field + } + + // V1 parser must ignore unknown fields + result, err := ParseDriftItem(item) + assert.NoError(t, err) + assert.Equal(t, 85, result.DriftScore) // Uses V1 
field +} + +func TestV1Code_ReadsV2Writes_DuringMigrationWindow(t *testing.T) { + // V2 writes both drift_score and drift_score_v2 + // V1 reads drift_score (ignores _v2) + // Verify no data loss +} +``` + +### 11.6 Security: RBAC Forgery & Replay Attacks + +```go +// tests/integration/security_test.go + +func TestAgentCannotForgeStackID(t *testing.T) { + // Agent with API key for org-A sends drift report claiming stack belongs to org-B + orgAKey := createAPIKey(t, "org-a") + report := makeDriftReport("org-b-stack-id") // Wrong org + + resp := postDriftReportWithKey(t, report, orgAKey) + assert.Equal(t, 403, resp.StatusCode) +} + +func TestReplayAttack_DuplicateReportID_Rejected(t *testing.T) { + report := makeDriftReport("stack-1") + resp1 := postDriftReport(t, report) + assert.Equal(t, 200, resp1.StatusCode) + + // Replay exact same report + resp2 := postDriftReport(t, report) + assert.Equal(t, 409, resp2.StatusCode) // Conflict — already processed +} + +func TestReplayAttack_OldTimestamp_Rejected(t *testing.T) { + report := makeDriftReport("stack-1") + report.Timestamp = time.Now().Add(-10 * time.Minute) // 10 min old + + resp := postDriftReport(t, report) + assert.Equal(t, 400, resp.StatusCode) // Stale report +} +``` + +### 11.7 Noisy Neighbor & Fair-Share Processing + +```go +// tests/integration/fair_share_test.go + +func TestNoisyNeighbor_LargeOrgDoesNotStarveSmallOrg(t *testing.T) { + // Org A: 10,000 drifted resources + // Org B: 10 drifted resources + // Both submit reports simultaneously + + seedDriftReports(t, "org-a", 10000) + seedDriftReports(t, "org-b", 10) + + // Org B's reports must be processed within 30 seconds + // (not queued behind all 10K of Org A's) + start := time.Now() + waitForProcessed(t, "org-b", 10, 30*time.Second) + assert.Less(t, time.Since(start), 30*time.Second) +} +``` + +### 11.8 Panic Mode Mid-Remediation Race Condition + +```go +// tests/integration/panic_remediation_test.go + +func TestPanicMode_AbortsInFlightRemediation(t 
*testing.T) { + // Start a remediation (terraform apply) + execID := startRemediation(t, "stack-1", "drift-1") + waitForState(t, execID, "applying") + + // Trigger panic mode + triggerPanicMode(t) + + // Remediation must be aborted, not completed + state := waitForState(t, execID, "aborted") + assert.Equal(t, "aborted", state) + + // Verify terraform state is not corrupted + // (agent should have run terraform state pull to verify) +} + +func TestPanicMode_DoesNotAbortReadOnlyScans(t *testing.T) { + // Drift scans (read-only) should continue during panic + // Only write operations (remediation) are halted + scanID := startDriftScan(t, "stack-1") + triggerPanicMode(t) + + state := waitForState(t, scanID, "completed") + assert.Equal(t, "completed", state) // Scan finishes normally +} +``` + +### 11.9 Remediation vs. Concurrent Scan Race Condition + +```go +func TestConcurrentScanDuringRemediation_DoesNotReportHalfAppliedState(t *testing.T) { + // Start remediation (terraform apply — takes ~30s) + execID := startRemediation(t, "stack-1", "drift-1") + waitForState(t, execID, "applying") + + // Trigger a drift scan while remediation is in progress + scanID := startDriftScan(t, "stack-1") + + // Scan must either: + // a) Wait for remediation to complete, OR + // b) Skip the stack with "remediation in progress" status + scanResult := waitForScanComplete(t, scanID) + assert.NotEqual(t, "half-applied", scanResult.Status) + // Must be either "skipped_remediation_in_progress" or show post-remediation state +} +``` + +### 11.10 SaaS API Memory Profiling + +```go +// tests/load/memory_profile_test.go + +func TestEventProcessor_DoesNotOOM_On1MB_DriftReport(t *testing.T) { + // Generate a 1MB drift report (1000 resources with large diffs) + report := makeLargeDriftReport(1000) + assert.Greater(t, len(report), 1024*1024) + + var memBefore, memAfter runtime.MemStats + runtime.ReadMemStats(&memBefore) + + processReport(t, report) + + runtime.ReadMemStats(&memAfter) + growth := 
memAfter.Alloc - memBefore.Alloc + assert.Less(t, growth, uint64(50*1024*1024)) // <50MB growth +} +``` + +### 11.11 Trim E2E to Smoke Tier + +Per review recommendation, cap E2E at 10 critical paths. Remaining 40 tests pushed to integration: + +| E2E (Keep — 10 max) | Demoted to Integration | +|---------------------|----------------------| +| Onboarding: init → connect → first scan | Agent heartbeat variations | +| First drift detected → Slack alert | Individual parser format tests | +| Revert flow: Slack → agent apply → verify | Secret scrubber edge cases | +| Panic mode halts remediation | DynamoDB access pattern tests | +| Cross-tenant isolation | Individual webhook format tests | +| OAuth login → dashboard → view diff | Notification batching | +| Free tier limit enforcement | Agent config reload | +| Agent disconnect → reconnect → resume | Baseline score calculations | +| mTLS cert rotation mid-scan | Individual API endpoint tests | +| Stripe upgrade → unlock features | Cache invalidation patterns | + +### 11.12 Updated Test Pyramid (Post-Review) + +| Level | Original | Revised | Rationale | +|-------|----------|---------|-----------| +| Unit | 70% (~350) | 65% (~350) | Add t.Parallel(), keep count but add UI component tests | +| Integration | 20% (~100) | 28% (~150) | Terratest, mTLS, trace propagation, fair-share, security | +| E2E/Smoke | 10% (~50) | 7% (~35) | Capped at 10 true E2E + 25 Playwright UI tests | + +*End of P2 Review Remediation Addendum* diff --git a/products/03-alert-intelligence/test-architecture/test-architecture.md b/products/03-alert-intelligence/test-architecture/test-architecture.md index 1744ed4..4949f86 100644 --- a/products/03-alert-intelligence/test-architecture/test-architecture.md +++ b/products/03-alert-intelligence/test-architecture/test-architecture.md @@ -1409,3 +1409,459 @@ Before any release, these tests must pass: --- *End of dd0c/alert Test Architecture* + +--- + +## 11. 
Review Remediation Addendum (Post-Gemini Review) + +### 11.1 Missing Epic Coverage + +#### Epic 6: Dashboard API + +```typescript +describe('Dashboard API', () => { + describe('Authentication', () => { + it('returns 401 for missing Cognito JWT', async () => {}); + it('returns 401 for expired JWT', async () => {}); + it('returns 401 for JWT signed by wrong issuer', async () => {}); + it('extracts tenantId from JWT claims', async () => {}); + }); + + describe('Incident Listing (GET /v1/incidents)', () => { + it('returns paginated incidents for authenticated tenant', async () => {}); + it('supports cursor-based pagination', async () => {}); + it('filters by status (open, acknowledged, resolved)', async () => {}); + it('filters by severity (critical, warning, info)', async () => {}); + it('filters by time range (since, until)', async () => {}); + it('returns empty array for tenant with no incidents', async () => {}); + }); + + describe('Incident Detail (GET /v1/incidents/:id)', () => { + it('returns full incident with correlated alerts', async () => {}); + it('returns 404 for incident belonging to different tenant', async () => {}); + it('includes timeline of state transitions', async () => {}); + }); + + describe('Analytics (GET /v1/analytics)', () => { + it('returns MTTR for last 7/30/90 days', async () => {}); + it('returns alert volume by source', async () => {}); + it('returns noise reduction percentage', async () => {}); + it('scopes all analytics to authenticated tenant', async () => {}); + }); + + describe('Tenant Isolation', () => { + it('tenant A cannot read tenant B incidents via API', async () => {}); + it('tenant A cannot read tenant B analytics', async () => {}); + it('all DynamoDB queries include tenantId partition key', async () => {}); + }); +}); +``` + +#### Epic 7: Dashboard UI (Playwright) + +```typescript +// tests/e2e/ui/dashboard.spec.ts + +test('login redirects to Cognito hosted UI', async ({ page }) => { + await page.goto('/dashboard'); + await 
expect(page).toHaveURL(/cognito/); +}); + +test('incident list renders with correct severity badges', async ({ page }) => { + await page.goto('/dashboard/incidents'); + await expect(page.locator('[data-testid="incident-card"]')).toHaveCount(5); + await expect(page.locator('.severity-critical')).toBeVisible(); +}); + +test('incident detail shows correlated alert timeline', async ({ page }) => { + await page.goto('/dashboard/incidents/inc-123'); + await expect(page.locator('[data-testid="alert-timeline"]')).toBeVisible(); + await expect(page.locator('.timeline-event')).toHaveCountGreaterThan(1); +}); + +test('MTTR chart renders with real data', async ({ page }) => { + await page.goto('/dashboard/analytics'); + await expect(page.locator('[data-testid="mttr-chart"]')).toBeVisible(); +}); + +test('noise reduction percentage displays correctly', async ({ page }) => { + await page.goto('/dashboard/analytics'); + const noise = page.locator('[data-testid="noise-reduction"]'); + await expect(noise).toContainText('%'); +}); + +test('webhook setup wizard generates correct URL', async ({ page }) => { + await page.goto('/dashboard/settings/integrations'); + await page.click('[data-testid="add-datadog"]'); + const url = await page.locator('[data-testid="webhook-url"]').textContent(); + expect(url).toMatch(/\/v1\/webhooks\/ingest\/.+/); +}); +``` + +#### Epic 9: Onboarding & PLG + +```typescript +describe('Free Tier Enforcement', () => { + it('allows up to 10,000 alerts/month on free tier', async () => {}); + it('returns 429 with upgrade prompt at 10,001st alert', async () => {}); + it('resets counter on first of each month', async () => {}); + it('purges alert data older than 7 days on free tier', async () => {}); + it('retains alert data for 90 days on pro tier', async () => {}); +}); + +describe('OAuth Signup', () => { + it('creates tenant record on first Cognito login', async () => {}); + it('assigns free tier by default', async () => {}); + it('generates unique webhook URL 
per tenant', async () => {}); +}); + +describe('Stripe Integration', () => { + it('creates checkout session with correct pricing', async () => {}); + it('upgrades tenant on checkout.session.completed webhook', async () => {}); + it('downgrades tenant on subscription.deleted webhook', async () => {}); + it('validates Stripe webhook signature', async () => {}); +}); +``` + +#### Epic 5.3: Slack Feedback Endpoint + +```typescript +describe('Slack Interactive Actions Endpoint', () => { + it('validates Slack request signature (HMAC-SHA256)', async () => {}); + it('rejects request with invalid signature', async () => {}); + it('handles "helpful" feedback — updates incident quality score', async () => {}); + it('handles "noise" feedback — adds to suppression training data', async () => {}); + it('handles "escalate" action — triggers PagerDuty/OpsGenie', async () => {}); + it('updates original Slack message after action', async () => {}); + it('scopes action to correct tenant', async () => {}); +}); +``` + +#### Epic 1.4: S3 Raw Payload Archival + +```typescript +describe('Raw Payload Archival', () => { + it('saves raw webhook payload to S3 asynchronously', async () => {}); + it('S3 key includes tenantId, source, and timestamp', async () => {}); + it('archival failure does not block alert processing', async () => {}); + it('archived payload is retrievable for replay', async () => {}); + it('S3 lifecycle policy deletes after retention period', async () => {}); +}); +``` + +### 11.2 Anti-Pattern Fixes + +#### Replace ioredis-mock with WindowStore Interface + +```typescript +// BEFORE (anti-pattern): +// import RedisMock from 'ioredis-mock'; +// const engine = new CorrelationEngine(new RedisMock()); + +// AFTER (correct): +interface WindowStore { + addEvent(tenantId: string, key: string, event: Alert, ttlMs: number): Promise; + getWindow(tenantId: string, key: string): Promise; + clearWindow(tenantId: string, key: string): Promise; +} + +class InMemoryWindowStore implements 
WindowStore { + private store = new Map(); + + async addEvent(tenantId: string, key: string, event: Alert, ttlMs: number) { + const fullKey = `${tenantId}:${key}`; + const existing = this.store.get(fullKey) || { events: [], expiresAt: Date.now() + ttlMs }; + existing.events.push(event); + this.store.set(fullKey, existing); + } + + async getWindow(tenantId: string, key: string): Promise { + const fullKey = `${tenantId}:${key}`; + const entry = this.store.get(fullKey); + if (!entry || entry.expiresAt < Date.now()) return []; + return entry.events; + } +} + +// Unit tests use InMemoryWindowStore — no Redis dependency +// Integration tests use RedisWindowStore with Testcontainers +``` + +#### Replace sinon.useFakeTimers with Clock Interface + +```typescript +// BEFORE (anti-pattern): +// sinon.useFakeTimers(new Date('2026-03-01T00:00:00Z')); + +// AFTER (correct): +interface Clock { + now(): number; + advanceBy(ms: number): void; +} + +class FakeClock implements Clock { + private current: number; + constructor(start: Date = new Date()) { this.current = start.getTime(); } + now() { return this.current; } + advanceBy(ms: number) { this.current += ms; } +} + +class SystemClock implements Clock { + now() { return Date.now(); } + advanceBy() { throw new Error('Cannot advance system clock'); } +} + +// Inject into CorrelationEngine: +const engine = new CorrelationEngine(new InMemoryWindowStore(), new FakeClock()); +``` + +### 11.3 Trace Context Propagation Tests + +```typescript +describe('Trace Context Propagation', () => { + it('API Gateway passes trace_id to Lambda via X-Amzn-Trace-Id', async () => {}); + + it('Lambda propagates trace_id into SQS message attributes', async () => { + // Verify SQS message has MessageAttribute 'traceparent' with W3C format + const msg = await getLastSQSMessage(localstack, 'alert-queue'); + expect(msg.MessageAttributes.traceparent).toBeDefined(); + expect(msg.MessageAttributes.traceparent.StringValue).toMatch( + 
/^00-[0-9a-f]{32}-[0-9a-f]{16}-0[01]$/ + ); + }); + + it('ECS Correlation Engine extracts trace_id from SQS message', async () => { + // Verify the correlation span has the correct parent from SQS + const spans = inMemoryExporter.getFinishedSpans(); + const correlationSpan = spans.find(s => s.name === 'alert.correlation'); + const ingestSpan = spans.find(s => s.name === 'webhook.ingest'); + expect(correlationSpan.parentSpanId).toBeDefined(); + // Parent chain must trace back to the original ingest span + }); + + it('end-to-end trace spans webhook → SQS → correlation → notification', async () => { + // Fire a webhook, wait for Slack notification, verify all spans share trace_id + const traceId = await fireWebhookAndGetTraceId(); + const spans = await getSpansByTraceId(traceId); + const spanNames = spans.map(s => s.name); + expect(spanNames).toContain('webhook.ingest'); + expect(spanNames).toContain('alert.normalize'); + expect(spanNames).toContain('alert.correlation'); + expect(spanNames).toContain('notification.slack'); + }); +}); +``` + +### 11.4 HMAC Security Hardening + +```typescript +describe('HMAC Signature Validation (Hardened)', () => { + it('uses crypto.timingSafeEqual, not === comparison', () => { + // Inspect the source to verify timing-safe comparison + const source = fs.readFileSync('src/ingestion/hmac.ts', 'utf8'); + expect(source).toContain('timingSafeEqual'); + expect(source).not.toMatch(/signature\s*===\s*/); + }); + + it('handles case-insensitive header names (dd-webhook-signature vs DD-WEBHOOK-SIGNATURE)', async () => { + const payload = makeAlertPayload('datadog'); + const sig = computeHMAC(payload, DATADOG_SECRET); + + // Lowercase header + const resp1 = await ingest(payload, { 'dd-webhook-signature': sig }); + expect(resp1.status).toBe(200); + + // Uppercase header + const resp2 = await ingest(payload, { 'DD-WEBHOOK-SIGNATURE': sig }); + expect(resp2.status).toBe(200); + }); + + it('rejects completely missing signature header', async () => { + 
const resp = await ingest(makeAlertPayload('datadog'), {}); + expect(resp.status).toBe(401); + }); + + it('rejects empty signature header', async () => { + const resp = await ingest(makeAlertPayload('datadog'), { 'dd-webhook-signature': '' }); + expect(resp.status).toBe(401); + }); +}); +``` + +### 11.5 SQS 256KB Payload Limit + +```typescript +describe('Large Payload Handling', () => { + it('compresses payloads >200KB before sending to SQS', async () => { + const largePayload = makeLargeAlertPayload(300 * 1024); // 300KB + const resp = await ingest(largePayload); + expect(resp.status).toBe(200); + + const msg = await getLastSQSMessage(localstack, 'alert-queue'); + // Payload must be compressed or use S3 pointer + expect(msg.Body.length).toBeLessThan(256 * 1024); + }); + + it('uses S3 pointer for payloads >256KB after compression', async () => { + const hugePayload = makeLargeAlertPayload(500 * 1024); // 500KB + const resp = await ingest(hugePayload); + expect(resp.status).toBe(200); + + const msg = await getLastSQSMessage(localstack, 'alert-queue'); + const body = JSON.parse(msg.Body); + expect(body.s3Pointer).toBeDefined(); + expect(body.s3Pointer).toMatch(/^s3:\/\/dd0c-alert-overflow\//); + }); + + it('strips unnecessary fields from Datadog payload before SQS', async () => { + const payload = makeDatadogPayloadWithLargeTags(100); // 100 tags + const resp = await ingest(payload); + expect(resp.status).toBe(200); + + const msg = await getLastSQSMessage(localstack, 'alert-queue'); + const normalized = JSON.parse(msg.Body); + // Only essential fields should remain + expect(normalized.tags.length).toBeLessThanOrEqual(20); + }); + + it('rejects payloads >2MB at API Gateway level', async () => { + const massive = makeLargeAlertPayload(3 * 1024 * 1024); + const resp = await ingest(massive); + expect(resp.status).toBe(413); + }); +}); +``` + +### 11.6 DLQ Backpressure & Replay + +```typescript +describe('DLQ Replay with Backpressure', () => { + it('replays DLQ messages 
in batches of 100', async () => { + await seedDLQ(10000); // 10K messages + const replayer = new DLQReplayer({ batchSize: 100, delayBetweenBatchesMs: 500 }); + await replayer.start(); + + // Verify batched processing + expect(replayer.batchesProcessed).toBeGreaterThan(0); + expect(replayer.maxConcurrentMessages).toBeLessThanOrEqual(100); + }); + + it('pauses replay if correlation engine error rate exceeds 10%', async () => { + await seedDLQ(1000); + const replayer = new DLQReplayer({ batchSize: 100, errorThreshold: 0.1 }); + + // Simulate correlation engine returning errors + mockCorrelationEngine.failRate = 0.15; + await replayer.start(); + + expect(replayer.state).toBe('paused'); + expect(replayer.pauseReason).toContain('error rate exceeded'); + }); + + it('does not replay if circuit breaker is currently tripped', async () => { + await seedDLQ(100); + await tripCircuitBreaker(); + + const replayer = new DLQReplayer(); + await replayer.start(); + + expect(replayer.messagesReplayed).toBe(0); + expect(replayer.state).toBe('blocked_by_circuit_breaker'); + }); + + it('tracks replay progress for resumability', async () => { + await seedDLQ(500); + const replayer = new DLQReplayer({ batchSize: 50 }); + + // Process 3 batches then stop + await replayer.processNBatches(3); + expect(replayer.checkpoint).toBe(150); + + // Resume from checkpoint + const replayer2 = new DLQReplayer({ resumeFrom: replayer.checkpoint }); + await replayer2.start(); + expect(replayer2.startedFrom).toBe(150); + }); +}); +``` + +### 11.7 Multi-Tenancy Isolation (DynamoDB) + +```typescript +describe('DynamoDB Tenant Isolation', () => { + it('all DAO methods require tenantId parameter', () => { + // Compile-time check: DAO interface has tenantId as first param + const daoSource = fs.readFileSync('src/data/incident-dao.ts', 'utf8'); + const methods = extractPublicMethods(daoSource); + for (const method of methods) { + expect(method.params[0].name).toBe('tenantId'); + } + }); + + it('query for tenant A 
returns zero results for tenant B data', async () => { + const dao = new IncidentDAO(dynamoClient); + await dao.create('tenant-A', makeIncident()); + await dao.create('tenant-B', makeIncident()); + + const results = await dao.list('tenant-A'); + expect(results.every(r => r.tenantId === 'tenant-A')).toBe(true); + }); + + it('partition key always includes tenantId prefix', async () => { + const dao = new IncidentDAO(dynamoClient); + await dao.create('tenant-X', makeIncident()); + + // Read raw DynamoDB item + const item = await dynamoClient.scan({ TableName: 'dd0c-alert-main' }); + expect(item.Items[0].PK.S).toStartWith('TENANT#tenant-X'); + }); +}); +``` + +### 11.8 Slack Circuit Breaker + +```typescript +describe('Slack Notification Circuit Breaker', () => { + it('opens circuit after 10 consecutive 429s from Slack', async () => { + const slackClient = new SlackClient({ circuitBreakerThreshold: 10 }); + for (let i = 0; i < 10; i++) { + mockSlack.respondWith(429); + await slackClient.send(makeMessage()).catch(() => {}); + } + expect(slackClient.circuitState).toBe('open'); + }); + + it('queues notifications while circuit is open', async () => { + slackClient.openCircuit(); + await slackClient.send(makeMessage()); + expect(slackClient.queuedMessages).toBe(1); + }); + + it('half-opens circuit after 60 seconds', async () => { + slackClient.openCircuit(); + clock.advanceBy(61000); + expect(slackClient.circuitState).toBe('half-open'); + }); + + it('drains queue on successful half-open probe', async () => { + slackClient.openCircuit(); + slackClient.queue(makeMessage()); + slackClient.queue(makeMessage()); + clock.advanceBy(61000); + mockSlack.respondWith(200); + await slackClient.probe(); + expect(slackClient.circuitState).toBe('closed'); + expect(slackClient.queuedMessages).toBe(0); + }); +}); +``` + +### 11.9 Updated Test Pyramid (Post-Review) + +| Level | Original | Revised | Rationale | +|-------|----------|---------|-----------| +| Unit | 70% (~140) | 65% (~180) | 
More tests total, but integration share grows | +| Integration | 20% (~40) | 25% (~70) | Dashboard API, tenant isolation, trace propagation | +| E2E | 10% (~20) | 10% (~28) | Dashboard UI (Playwright), onboarding flow | + +*End of P3 Review Remediation Addendum* diff --git a/products/04-lightweight-idp/test-architecture/test-architecture.md b/products/04-lightweight-idp/test-architecture/test-architecture.md index da06895..9fdbc24 100644 --- a/products/04-lightweight-idp/test-architecture/test-architecture.md +++ b/products/04-lightweight-idp/test-architecture/test-architecture.md @@ -1107,3 +1107,161 @@ Phase 7: E2E Validation --- *End of dd0c/portal Test Architecture* + +--- + +## 11. Review Remediation Addendum (Post-Gemini Review) + +### 11.1 Resolve Database Misalignment (PostgreSQL vs DynamoDB) + +Epic 10.2 specified DynamoDB Single-Table, but the Architecture and Test Architecture are fundamentally built around PostgreSQL (Aurora Serverless v2) with pgvector. +**Resolution:** The IDP requires relational joins and vector search. PostgreSQL is the definitive catalog database. DynamoDB references are removed. 
+ +```rust +// tests/schema/migration_validation_test.rs + +#[tokio::test] +async fn elastic_schema_postgres_migration_is_additive_only() { + let migrations = read_sql_migrations("./migrations"); + for migration in migrations { + assert!(!migration.contains("DROP COLUMN"), "Destructive schema change detected"); + assert!(!migration.contains("ALTER COLUMN"), "Type modification detected"); + assert!(!migration.contains("RENAME COLUMN"), "Column rename detected"); + } +} + +#[tokio::test] +async fn migration_does_not_hold_exclusive_locks_on_reads() { + // Concurrent index creation tests + assert!(migration_contains("CREATE INDEX CONCURRENTLY"), + "Indexes must be created concurrently to avoid locking the catalog"); +} +``` + +### 11.2 Invert the Test Pyramid (Integration Honeycomb) + +Shift from 70% Unit (with heavy moto/responses mocking) to 30/60/10 with VCR and LocalStack. + +```python +# tests/integration/scanners/test_aws_scanner.py + +@pytest.mark.vcr() +def test_aws_scanner_discovers_ecs_services_and_api_gateways(vcr_cassette): + # Uses real recorded AWS API responses, not moto mocks + # Validates actual boto3 parsing against real-world AWS shapes + scanner = AWSDiscoveryScanner(account_id="123456789012", region="us-east-1") + services = scanner.scan() + assert len(services) > 0 + assert any(s.type == "ecs_service" for s in services) + +@pytest.mark.vcr() +def test_github_scanner_handles_graphql_pagination(vcr_cassette): + # Validates real GitHub GraphQL paginated responses + scanner = GitHubDiscoveryScanner(org_name="dd0c") + repos = scanner.scan() + assert len(repos) > 100 # Proves pagination logic works +``` + +### 11.3 Missing Epic Coverage + +#### Epic 3.4: PagerDuty & OpsGenie Integrations +```python +# tests/integration/test_pagerduty_sync.py + +@pytest.mark.vcr() +def test_pagerduty_sync_maps_schedules_to_catalog_teams(): + sync = PagerDutySyncer(api_key="sk-test-key") + teams = sync.fetch_oncall_schedules() + assert teams[0].oncall_email is not None + 
+def test_pagerduty_credentials_are_encrypted_at_rest(): + # Verify KMS envelope encryption for 3rd party API keys + pass +``` + +#### Epic 4.3: Redis Prefix Caching for Cmd+K +```python +# tests/integration/test_search_cache.py + +def test_cmd_k_search_hits_redis_cache_before_postgres(): + redis_client.set("search:auth", json.dumps([{"name": "auth-service"}])) + # Must return < 5ms from Redis, skipping DB + result = search_api.query("auth") + assert result[0]['name'] == "auth-service" + +def test_catalog_update_invalidates_search_cache(): + # Create new service + catalog_api.create_service("billing-api") + # Prefix cache must be purged + assert redis_client.keys("search:*") == [] +``` + +#### Epics 5 & 6: UI and Dashboards (Playwright) +```typescript +// tests/e2e/ui/catalog.spec.ts + +test('service catalog renders progressive disclosure UI', async ({ page }) => { + await page.goto('/catalog'); + // Click expands details instead of navigating away + await page.click('[data-testid="service-row-auth-api"]'); + await expect(page.locator('[data-testid="service-drawer"]')).toBeVisible(); +}); + +test('dashboard KPI aggregation shows total services and ownership coverage', async ({ page }) => { + await page.goto('/dashboard'); + await expect(page.locator('[data-testid="kpi-total-services"]')).toHaveText("150"); + await expect(page.locator('[data-testid="kpi-ownership"]')).toHaveText("85%"); +}); +``` + +#### Epic 9: Onboarding & Stripe +```python +# tests/integration/test_stripe_webhooks.py + +def test_stripe_checkout_completed_upgrades_tenant_tier(): + payload = load_fixture("stripe_checkout_completed.json") + signature = generate_stripe_signature(payload, secret) + + response = api_client.post("/webhooks/stripe", data=payload, headers={"Stripe-Signature": signature}) + assert response.status_code == 200 + + tenant = db.get_tenant("t-123") + assert tenant.tier == "pro" + +def test_websocket_streams_discovery_progress_during_onboarding(): + # Connect WS client, trigger 
discovery, assert WS receives "discovering AWS...", "found 50 resources..." + pass +``` + +### 11.4 Scaled Performance Benchmarks +```python +# tests/performance/test_discovery_scale.py + +def test_discovery_pipeline_handles_10000_aws_resources_without_step_functions_payload_limit(): + # Simulate an AWS environment with 10k resources + # Must chunk state machine transitions to stay under 256KB Step Functions limit + pass + +def test_discovery_pipeline_handles_1000_github_repos(): + # Verify GraphQL batching and rate limit backoff + pass +``` + +### 11.5 Edge Case Resilience +```python +def test_github_graphql_concurrent_rate_limiting(): + # If 5 tenants scan concurrently, respect Retry-After headers across workers + pass + +def test_partial_discovery_scan_does_not_corrupt_catalog(): + # If GitHub scan times out halfway, existing services must NOT be marked stale + pass + +def test_ownership_conflict_resolution(): + # If two discovery sources claim the same repo, prioritize Explicit (Config) over Implicit (Tags) + pass + +def test_meilisearch_index_rebuild_does_not_drop_search(): + # Verify zero-downtime index swapping during mapping updates + pass +``` diff --git a/products/05-aws-cost-anomaly/test-architecture/test-architecture.md b/products/05-aws-cost-anomaly/test-architecture/test-architecture.md index 00b7784..c379f1c 100644 --- a/products/05-aws-cost-anomaly/test-architecture/test-architecture.md +++ b/products/05-aws-cost-anomaly/test-architecture/test-architecture.md @@ -1,8 +1,8 @@ # dd0c/cost — Test Architecture & TDD Strategy **Product:** dd0c/cost — AWS Cost Anomaly Detective -**Author:** Test Architecture Phase -**Date:** February 28, 2026 +**Author:** Test Architecture Phase (v2 — Post-Review Rewrite) +**Date:** March 1, 2026 **Status:** V1 MVP — Solo Founder Scope --- @@ -13,7 +13,9 @@ dd0c/cost sits at the intersection of **money and infrastructure**. A false negative means a customer loses thousands of dollars. 
A false positive means alert fatigue and churn. The test suite's primary job is to mathematically prove the anomaly scoring engine works across edge cases. -Guiding principle: **Test the math first, test the infrastructure second.** The Z-score and novelty algorithms must be exhaustively unit-tested with synthetic data before any AWS APIs are mocked. +Guiding principle: **Test the math first, test the infrastructure second.** The Z-score and novelty algorithms must be exhaustively tested with property-based testing before any AWS APIs are mocked. + +Second principle: **Every dollar matters.** Cost calculations involve floating-point arithmetic on money. Rounding errors, precision loss, and currency handling must be tested with the same rigor as a financial system. ### 1.2 Red-Green-Refactor Adapted to dd0c/cost @@ -28,33 +30,52 @@ REFACTOR → Optimize the baseline lookup, extract novelty checks, ``` **When to write tests first (strict TDD):** -- Anomaly scoring engine (Z-scores, novelty checks, composite severity) -- Cold-start heuristics (fast-path for >$5/hr resources) -- Baseline calculation (moving averages, standard deviation) -- Governance policy (strict vs. audit mode, 14-day promotion) +- All anomaly scoring (Z-scores, novelty checks, composite severity) +- All cold-start heuristics (fast-path for >$5/hr resources) +- All baseline calculation (Welford algorithm, maturity transitions) +- All governance policy (strict vs. 
audit mode, 14-day auto-promotion, panic mode) +- All Slack signature validation (security-critical) +- All cost calculations (pricing lookup, hourly cost estimation) +- All feature flag circuit breakers **When integration tests lead:** - CloudTrail ingestion (implement against LocalStack EventBridge, then lock in) - DynamoDB Single-Table schema (build access patterns, then integration test) +- Cross-account STS role assumption (test against LocalStack) **When E2E tests lead:** -- The Slack alert interaction (format block kit, test the "Snooze/Terminate" buttons) +- Slack alert interaction (format block kit, test "Snooze/Terminate" buttons) +- Onboarding wizard (CloudFormation quick-create → role validation → first alert) ### 1.3 Test Naming Conventions ```typescript +// Unit tests describe('AnomalyScorer', () => { - it('assigns critical severity when Z-score > 3 and hourly cost > $1', () => {}); - it('flags actor novelty when IAM role has never launched this service', () => {}); + it('assigns critical severity when Z-score exceeds 3 and hourly cost exceeds $1', () => {}); + it('flags actor novelty when IAM role has never launched this service type', () => {}); it('bypasses baseline and triggers fast-path critical for $10/hr instance', () => {}); }); -describe('CloudTrailNormalizer', () => { - it('extracts instance type and region from RunInstances event', () => {}); - it('looks up correct on-demand pricing for us-east-1 r6g.xlarge', () => {}); +describe('BaselineCalculator', () => { + it('updates running mean using Welford online algorithm', () => {}); + it('handles zero standard deviation without division by zero', () => {}); +}); + +// Property-based tests +describe('AnomalyScorer (property-based)', () => { + it('always returns severity between 0 and 100 for any valid input', () => {}); + it('monotonically increases score as Z-score increases', () => {}); + it('never assigns critical to events below $0.50/hr regardless of Z-score', () => {}); }); ``` +**Rules:** 
+- Describe the observable outcome, not the implementation +- Use present tense +- If you need "and" in the name, split into two tests +- Property-based tests explicitly state the invariant + --- ## Section 2: Test Pyramid @@ -63,93 +84,441 @@ describe('CloudTrailNormalizer', () => { | Level | Target | Count (V1) | Runtime | |-------|--------|------------|---------| -| Unit | 70% | ~250 tests | <20s | -| Integration | 20% | ~80 tests | <3min | -| E2E/Smoke | 10% | ~15 tests | <5min | +| Unit | 80% | ~350 tests | <25s | +| Integration | 15% | ~65 tests | <4min | +| E2E/Smoke | 5% | ~15 tests | <8min | + +Higher unit ratio than other dd0c products because the core value is pure math (scoring, baselines, Z-scores). ### 2.2 Unit Test Targets | Component | Key Behaviors | Est. Tests | |-----------|--------------|------------| -| Event Normalizer | CloudTrail parsing, pricing lookup, deduplication | 40 | -| Baseline Engine | Running mean/stddev calculation, maturity checks | 35 | -| Anomaly Scorer | Z-score math, novelty detection, composite scoring | 50 | -| Remediation Handler | Stop/Terminate payload parsing, IAM role assumption logic | 20 | -| Notification Engine | Slack formatting, daily digest aggregation | 30 | -| Governance Policy | Mode enforcement, 14-day auto-promotion | 25 | -| Feature Flags | Circuit breaker on alert volume, flag metadata | 15 | +| CloudTrail Normalizer | Event parsing, pricing lookup, dedup, field extraction | 40 | +| Baseline Engine | Welford algorithm, maturity transitions, feedback loop | 45 | +| Anomaly Scorer | Z-score, novelty, composite scoring, cold-start fast-path | 60 | +| Zombie Hunter | Idle resource detection, cost estimation, age calculation | 25 | +| Notification Formatter | Slack Block Kit, daily digest, CLI command generation | 30 | +| Slack Bot | Command parsing, signature validation, action handling | 25 | +| Remediation Handler | Stop/Terminate logic, IAM role assumption, snooze/dismiss | 20 | +| Dashboard API | CRUD, 
tenant isolation, pagination, filtering | 25 | +| Governance Policy | Mode enforcement, 14-day promotion, panic mode | 30 | +| Feature Flags | Circuit breaker, flag lifecycle, local evaluation | 15 | +| Onboarding | CFN template validation, role validation, free tier enforcement | 20 | +| Cost Calculations | Pricing precision, rounding, fallback pricing, currency | 15 | + +### 2.3 Integration Test Boundaries + +| Boundary | What's Tested | Infrastructure | +|----------|--------------|----------------| +| EventBridge → SQS FIFO | Cross-account event routing, dedup, ordering | LocalStack | +| SQS → Event Processor Lambda | Batch processing, error handling, DLQ routing | LocalStack | +| Event Processor → DynamoDB | CostEvent writes, baseline updates, transactions | Testcontainers DynamoDB Local | +| Anomaly Scorer → DynamoDB | Baseline reads, anomaly record writes | Testcontainers DynamoDB Local | +| Notifier → Slack API | Block Kit delivery, rate limiting, message updates | WireMock | +| API Gateway → Lambda | Auth (Cognito JWT), routing, throttling | LocalStack | +| STS → Customer Account | Cross-account role assumption, ExternalId validation | LocalStack | +| CDK Synth | Infrastructure snapshot, resource policy validation | CDK assertions | + +### 2.4 E2E/Smoke Scenarios + +1. **Real-Time Anomaly Detection**: CloudTrail event → scoring → Slack alert (<30s) +2. **Interactive Remediation**: Slack button click → StopInstances → message update +3. **Onboarding Flow**: Signup → CFN deploy → role validation → first alert +4. **14-Day Auto-Promotion**: Simulate 14 days → verify strict→audit transition +5. **Zombie Hunter**: Daily scan → detect idle EC2 → Slack digest +6. 
**Panic Mode**: Enable panic → all alerting stops → anomalies still logged --- ## Section 3: Unit Test Strategy -### 3.1 Cost Ingestion & Normalization +### 3.1 CloudTrail Normalizer ```typescript describe('CloudTrailNormalizer', () => { - it('normalizes EC2 RunInstances event to CostEvent schema', () => {}); - it('normalizes RDS CreateDBInstance event to CostEvent schema', () => {}); - it('extracts assumed role ARN as actor instead of base STS role', () => {}); - it('applies fallback pricing when instance type is not in static table', () => {}); - it('ignores non-cost-generating events (e.g., DescribeInstances)', () => {}); + describe('Event Parsing', () => { + it('normalizes EC2 RunInstances to CostEvent schema', () => {}); + it('normalizes RDS CreateDBInstance to CostEvent schema', () => {}); + it('normalizes Lambda CreateFunction to CostEvent schema', () => {}); + it('extracts assumed role ARN as actor (not base STS role)', () => {}); + it('extracts instance type, region, and AZ from event detail', () => {}); + it('handles batched RunInstances (multiple instances in one call)', () => {}); + it('ignores non-cost-generating events (DescribeInstances, ListBuckets)', () => {}); + it('handles malformed CloudTrail JSON without crashing', () => {}); + it('handles missing optional fields gracefully', () => {}); + }); + + describe('Pricing Lookup', () => { + it('looks up correct on-demand price for us-east-1 m5.xlarge', () => {}); + it('looks up correct on-demand price for us-west-2 r6g.2xlarge', () => {}); + it('applies fallback pricing when instance type not in static table', () => {}); + it('returns $0 for instance types with no pricing data and logs warning', () => {}); + it('handles GPU instances (p4d, g5) with correct pricing', () => {}); + }); + + describe('Deduplication', () => { + it('generates deterministic fingerprint from eventID', () => {}); + it('detects duplicate CloudTrail events by eventID', () => {}); + it('allows same resource type from different 
events', () => {}); + }); + + describe('Cost Precision', () => { + it('calculates hourly cost with 4 decimal places', () => {}); + it('rounds consistently (banker rounding) to avoid accumulation errors', () => {}); + it('handles sub-cent costs for Lambda invocations', () => {}); + }); }); ``` -### 3.2 Anomaly Engine (The Math) +### 3.2 Anomaly Scorer + +The most critical component. Uses property-based testing via `fast-check`. ```typescript describe('AnomalyScorer', () => { - describe('Statistical Scoring (Z-Score)', () => { - it('returns score=0 when event cost exactly matches baseline mean', () => {}); + describe('Z-Score Calculation', () => { + it('returns 0 when event cost exactly matches baseline mean', () => {}); it('returns proportional score for Z-scores between 1.0 and 3.0', () => {}); - it('caps Z-score contribution at max threshold', () => {}); + it('caps Z-score contribution at configurable max threshold', () => {}); + it('handles zero standard deviation without division by zero', () => {}); + it('handles single data point baseline (stddev undefined)', () => {}); + it('handles extremely large values without float overflow', () => {}); + it('handles negative cost delta (cost decrease) as non-anomalous', () => {}); }); describe('Novelty Scoring', () => { - it('adds novelty penalty when instance type is first seen for account', () => {}); - it('adds novelty penalty when IAM user has never provisioned this service', () => {}); + it('adds instance novelty penalty when type first seen for account', () => {}); + it('adds actor novelty penalty when IAM role is new', () => {}); + it('does not penalize known instance type + known actor', () => {}); + it('weights instance novelty higher than actor novelty', () => {}); + }); + + describe('Composite Scoring', () => { + it('combines Z-score + novelty into composite severity', () => {}); + it('classifies composite < 30 as info', () => {}); + it('classifies composite 30-60 as warning', () => {}); + it('classifies 
composite > 60 as critical', () => {}); + it('never assigns critical to events below $0.50/hr', () => {}); }); describe('Cold-Start Fast Path', () => { it('flags $5/hr instance as warning when baseline < 14 days', () => {}); it('flags $25/hr instance as critical immediately, bypassing baseline', () => {}); - it('ignores $0.10/hr instances during cold-start learning period', () => {}); + it('ignores $0.10/hr instances during cold-start learning', () => {}); + it('fast-path is always on — not behind a feature flag', () => {}); + it('transitions from fast-path to statistical scoring at maturity', () => {}); + }); + + describe('Feedback Loop', () => { + it('reduces score for resources marked as expected', () => {}); + it('adds actor to expected list after mark-as-expected', () => {}); + it('still flags expected actor if cost is 10x above baseline', () => {}); + }); + + describe('Property-Based Tests (fast-check)', () => { + it('score is always between 0 and 100 for any valid input', () => { + // fc.assert(fc.property( + // fc.record({ cost: fc.float({min: 0}), mean: fc.float({min: 0}), stddev: fc.float({min: 0}) }), + // (input) => { const score = scorer.score(input); return score >= 0 && score <= 100; } + // )) + }); + it('score monotonically increases as cost increases (baseline fixed)', () => {}); + it('score monotonically increases as Z-score increases', () => {}); + it('cold-start fast-path always triggers for cost > $25/hr', () => {}); + it('mature baseline never uses fast-path thresholds', () => {}); }); }); ``` -### 3.3 Baseline Learning +### 3.3 Baseline Engine ```typescript describe('BaselineCalculator', () => { - it('updates running mean and stddev using Welford algorithm', () => {}); - it('adds new actor to observed_actors set', () => {}); - it('marks baseline as mature when event_count > 20 and age_days > 14', () => {}); + describe('Welford Online Algorithm', () => { + it('updates running mean correctly after each observation', () => {}); + it('updates 
running variance correctly after each observation', () => {}); + it('produces correct stddev after 100 observations', () => {}); + it('handles first observation (count=1, stddev=0)', () => {}); + it('handles identical observations (stddev=0)', () => {}); + it('handles catastrophic cancellation with large values', () => { + // Welford is numerically stable — verify this property + }); + }); + + describe('Maturity Transitions', () => { + it('starts in cold-start state', () => {}); + it('transitions to learning after 5 events', () => {}); + it('transitions to mature after 20 events AND 14 days', () => {}); + it('does not mature with 100 events but only 3 days', () => {}); + it('does not mature with 14 days but only 5 events', () => {}); + }); + + describe('Actor & Instance Tracking', () => { + it('adds new actor to observed_actors set', () => {}); + it('adds new instance type to observed_types set', () => {}); + it('does not duplicate existing actors', () => {}); + }); + + describe('Property-Based Tests', () => { + it('mean converges to true mean as observations increase', () => {}); + it('variance is always non-negative', () => {}); + it('stddev equals sqrt(variance) within float tolerance', () => {}); + }); +}); +``` + +### 3.4 Zombie Hunter + +```typescript +describe('ZombieHunter', () => { + it('detects EC2 instance running >7 days with <5% CPU utilization', () => {}); + it('detects RDS instance with 0 connections for >3 days', () => {}); + it('detects unattached EBS volumes older than 7 days', () => {}); + it('calculates cumulative waste cost for each zombie', () => {}); + it('excludes instances tagged dd0c:ignore', () => {}); + it('handles API pagination for accounts with 500+ instances', () => {}); + it('respects read-only IAM permissions (never modifies resources)', () => {}); +}); +``` + +### 3.5 Notification Formatter + +```typescript +describe('NotificationFormatter', () => { + describe('Slack Block Kit', () => { + it('formats EC2 anomaly with resource 
type, region, cost, actor', () => {}); + it('formats RDS anomaly with engine, storage, multi-AZ status', () => {}); + it('includes "Why this alert" section with anomaly signals', () => {}); + it('includes suggested CLI commands for remediation', () => {}); + it('includes Snooze/Mark Expected/Stop Instance buttons', () => {}); + it('generates correct aws ec2 stop-instances command', () => {}); + it('generates correct aws rds stop-db-instance command', () => {}); + }); + + describe('Daily Digest', () => { + it('aggregates 24h of anomalies into summary stats', () => {}); + it('includes total estimated spend across all accounts', () => {}); + it('highlights top 3 costliest anomalies', () => {}); + it('includes zombie resource count and waste estimate', () => {}); + it('shows baseline learning progress for new accounts', () => {}); + }); +}); +``` + +### 3.6 Slack Bot + +```typescript +describe('SlackBot', () => { + describe('Signature Validation', () => { + it('validates correct Slack request signature (HMAC-SHA256)', () => {}); + it('rejects request with invalid signature', () => {}); + it('rejects request with missing X-Slack-Signature header', () => {}); + it('rejects request with expired timestamp (>5 min)', () => {}); + it('uses timing-safe comparison to prevent timing attacks', () => {}); + }); + + describe('Command Parsing', () => { + it('routes /dd0c status to status handler', () => {}); + it('routes /dd0c anomalies to anomaly list handler', () => {}); + it('routes /dd0c digest to digest handler', () => {}); + it('returns help text for unknown commands', () => {}); + it('responds within 3 seconds or defers with 200 OK', () => {}); + }); + + describe('Interactive Actions', () => { + it('validates interactive payload signature', () => {}); + it('handles mark_expected action and updates baseline', () => {}); + it('handles snooze_1h action and sets snoozeUntil', () => {}); + it('handles snooze_24h action', () => {}); + it('updates original Slack message after 
action', () => {}); + it('rejects action from user not in authorized workspace', () => {}); + }); +}); +``` + +### 3.7 Governance Policy Engine + +```typescript +describe('GovernancePolicy', () => { + describe('Mode Enforcement', () => { + it('strict mode: logs anomaly but does not send Slack alert', () => {}); + it('audit mode: sends Slack alert with full logging', () => {}); + it('defaults new accounts to strict mode', () => {}); + }); + + describe('14-Day Auto-Promotion', () => { + it('does not promote account with <14 days of baseline', () => {}); + it('does not promote account with >10% false-positive rate', () => {}); + it('promotes account on day 15 if FP rate <10%', () => {}); + it('calculates false-positive rate from mark-as-expected actions', () => {}); + it('auto-promotion check runs daily via cron', () => {}); + }); + + describe('Panic Mode', () => { + it('stops all alerting when panic=true', () => {}); + it('continues scoring and logging during panic', () => {}); + it('activates in <1 second via Redis key', () => {}); + it('activatable via POST /admin/panic', () => {}); + it('dashboard API returns "alerting paused" header during panic', () => {}); + }); + + describe('Per-Account Override', () => { + it('account can set stricter mode than system default', () => {}); + it('account cannot downgrade from system strict to audit', () => {}); + it('merge logic: max_restrictive(system, account)', () => {}); + }); + + describe('Policy Decision Logging', () => { + it('logs "suppressed by strict mode" with anomaly context', () => {}); + it('logs "auto-promoted to audit mode" with baseline stats', () => {}); + it('logs "panic mode active — alerting paused"', () => {}); + }); +}); +``` + +### 3.8 Dashboard API + +```typescript +describe('DashboardAPI', () => { + describe('Account Management', () => { + it('GET /v1/accounts returns connected accounts for tenant', () => {}); + it('DELETE /v1/accounts/:id marks account as disconnecting', () => {}); + it('returns 401 
without valid Cognito JWT', () => {}); + it('scopes all queries to authenticated tenantId', () => {}); + }); + + describe('Anomaly Listing', () => { + it('GET /v1/anomalies returns recent anomalies', () => {}); + it('supports since, status, severity filters', () => {}); + it('implements cursor-based pagination', () => {}); + it('includes slackMessageUrl when alert was sent', () => {}); + }); + + describe('Baseline Overrides', () => { + it('PATCH /v1/accounts/:id/baselines/:service/:type updates sensitivity', () => {}); + it('rejects invalid sensitivity values', () => {}); + }); + + describe('Tenant Isolation', () => { + it('never returns anomalies from another tenant', () => {}); + it('never returns accounts from another tenant', () => {}); + it('enforces tenantId on all DynamoDB queries', () => {}); + }); +}); +``` + +### 3.9 Onboarding & PLG + +```typescript +describe('Onboarding', () => { + describe('CloudFormation Template', () => { + it('generates valid CFN YAML with correct IAM permissions', () => {}); + it('includes ExternalId parameter', () => {}); + it('includes EventBridge rule for cost-relevant CloudTrail events', () => {}); + it('quick-create URL contains correct template URL and parameters', () => {}); + }); + + describe('Role Validation', () => { + it('successfully assumes role with correct ExternalId', () => {}); + it('returns clear error on role not found', () => {}); + it('returns clear error on ExternalId mismatch', () => {}); + it('triggers zombie scan on successful connection', () => {}); + }); + + describe('Free Tier Enforcement', () => { + it('allows first account connection on free tier', () => {}); + it('rejects second account with 403 and upgrade prompt', () => {}); + it('allows multiple accounts on pro tier', () => {}); + }); + + describe('Stripe Integration', () => { + it('creates Stripe Checkout session with correct pricing', () => {}); + it('handles checkout.session.completed webhook', () => {}); + it('handles 
customer.subscription.deleted webhook', () => {}); + it('validates Stripe webhook signature', () => {}); + it('updates tenant tier to pro on successful payment', () => {}); + it('downgrades tenant on subscription cancellation', () => {}); + }); +}); +``` + +### 3.10 Feature Flag Circuit Breaker + +```typescript +describe('AlertVolumeCircuitBreaker', () => { + it('allows alerting when volume is within 3x baseline', () => {}); + it('trips breaker when alerts exceed 3x baseline over 1 hour', () => {}); + it('auto-disables the scoring flag when breaker trips', () => {}); + it('buffers suppressed alerts in DLQ for review', () => {}); + it('tracks alert-per-account rate in Redis sliding window', () => {}); + it('resets breaker after manual flag re-enable', () => {}); + it('fast-path alerts are exempt from circuit breaker', () => {}); }); ``` --- - ## Section 4: Integration Test Strategy ### 4.1 DynamoDB Data Layer (Testcontainers) ```typescript -describe('DynamoDB Single-Table Patterns', () => { - it('writes CostEvent and updates Baseline in single transaction', async () => {}); - it('queries all anomalies for tenant within time range', async () => {}); - it('fetches tenant config and Slack tokens securely', async () => {}); +describe('DynamoDB Integrations', () => { + let dynamodb: StartedTestContainer; + + beforeAll(async () => { + dynamodb = await new GenericContainer('amazon/dynamodb-local:latest') + .withExposedPorts(8000).start(); + // Create dd0c-cost-main table with GSIs + }); + + describe('Transactional Writes', () => { + it('writes CostEvent and updates Baseline in single TransactWriteItem', async () => {}); + it('fails gracefully if TransactWriteItem encounters ConditionalCheckFailed', async () => {}); + it('handles partial failure recovery when Baseline update conflicts', async () => {}); + }); + + describe('Access Patterns', () => { + it('queries all anomalies for tenant within time range (GSI3)', async () => {}); + it('fetches tenant config and Slack tokens 
securely', async () => {}); + it('retrieves accurate Baseline snapshot by resource type', async () => {}); + }); }); ``` -### 4.2 AWS API Contract Tests +### 4.2 Cross-Account STS & AWS APIs (LocalStack) ```typescript -describe('AWS Cross-Account Actions', () => { - // Uses LocalStack to simulate target account - it('assumes target account remediation role successfully', async () => {}); - it('executes ec2:StopInstances when remediation approved', async () => {}); - it('executes rds:DeleteDBInstance with skip-final-snapshot', async () => {}); +describe('AWS Cross-Account Integrations', () => { + let localstack: StartedTestContainer; + + beforeAll(async () => { + localstack = await new GenericContainer('localstack/localstack:3') + .withEnv('SERVICES', 'sts,ec2,rds') + .withExposedPorts(4566).start(); + }); + + describe('Role Assumption', () => { + it('successfully assumes target account remediation role via STS', async () => {}); + it('fails when ExternalId does not match (Security)', async () => {}); + it('handles STS credential expiration gracefully', async () => {}); + }); + + describe('Remediation Actions', () => { + it('executes ec2:StopInstances when remediation approved', async () => {}); + it('executes rds:StopDBInstance when remediation approved', async () => {}); + it('fails safely when target IAM role lacks StopInstances permission', async () => {}); + }); +}); +``` + +### 4.3 Slack API Contract (WireMock) + +```typescript +describe('Slack Integration', () => { + it('formats and delivers Block Kit message successfully', async () => {}); + it('handles 429 Rate Limit by throwing retryable error for SQS visibility timeout', async () => {}); + it('updates existing Slack message when anomaly is snoozed', async () => {}); }); ``` @@ -159,24 +528,65 @@ describe('AWS Cross-Account Actions', () => { ### 5.1 Critical User Journeys -**Journey 1: Real-Time Anomaly Detection** -1. Send synthetic `RunInstances` event to EventBridge (p9.16xlarge, $40/hr). -2. 
Verify system processes event and triggers fast-path (no baseline). -3. Verify Slack alert is generated with correct cost estimate. +**Journey 1: Real-Time Anomaly Detection (The Golden Path)** +```typescript +describe('E2E: Anomaly Detection', () => { + it('detects anomaly and alerts Slack within 30 seconds', async () => { + // 1. Inject synthetic CloudTrail `RunInstances` event (p4d.24xlarge) into SQS Ingestion Queue + // 2. Poll DynamoDB to ensure CostEvent was recorded + // 3. Poll DynamoDB to ensure AnomalyRecord was created (fast-path triggered) + // 4. Assert WireMock received the Slack chat.postMessage call with Block Kit + }); +}); +``` **Journey 2: Interactive Remediation** -1. Send webhook simulating user clicking "Stop Instance" in Slack. -2. Verify API Gateway → Lambda executes `StopInstances` against LocalStack. -3. Verify Slack message updates to "Remediation Successful". +```typescript +describe('E2E: Interactive Remediation', () => { + it('stops EC2 instance when user clicks Stop in Slack', async () => { + // 1. Simulate Slack sending interactive webhook payload for "Stop Instance" + // 2. Validate HMAC signature in API Gateway lambda + // 3. Verify LocalStack EC2 mock receives StopInstances call + // 4. Verify Slack message is updated to "Remediation Successful" + }); +}); +``` + +**Journey 3: Onboarding & First Scan** +```typescript +describe('E2E: Onboarding', () => { + it('validates IAM role and triggers initial zombie scan', async () => { + // 1. Trigger POST /v1/accounts with new role ARN + // 2. Verify account marked active + // 3. 
Verify EventBridge Scheduler creates cron for Zombie Hunter + }); +}); +``` --- ## Section 6: Performance & Load Testing +### 6.1 Ingestion & Scoring Throughput ```typescript -describe('Ingestion Throughput', () => { - it('processes 500 CloudTrail events/second via SQS FIFO', async () => {}); - it('DynamoDB baseline updates complete in <20ms p95', async () => {}); +describe('Performance: Alert Storm', () => { + it('processes 1000 CloudTrail events/sec without SQS DLQ overflow', async () => { + // k6 load test hitting SQS directly + }); + + it('DynamoDB baseline updates complete in <20ms p95 under load', async () => { + // Ensure Single-Table schema does not create hot partitions + }); + + it('Anomaly Scorer Lambda consumes <256MB memory during burst', async () => {}); +}); +``` + +### 6.2 Data Scale Tests +```typescript +describe('Performance: Baseline Scale', () => { + it('calculates Z-score in <5ms even when observed_actors set exceeds 1000', async () => {}); + it('handles accounts with 100,000+ daily CostEvents without throttling DynamoDB (On-Demand scaling)', async () => {}); }); ``` @@ -184,49 +594,119 @@ describe('Ingestion Throughput', () => { ## Section 7: CI/CD Pipeline Integration -- **PR Gate:** Unit tests (<2min), Coverage >85% (Scoring engine >95%). -- **Merge:** Integration tests with LocalStack & Testcontainers DynamoDB. -- **Staging:** E2E journeys against isolated staging AWS account. 
+### 7.1 Pipeline Stages +``` +┌─────────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ +│ Pre-Commit │───▶│ PR Gate │───▶│ Merge │───▶│ Staging │───▶│ Prod │ +│ (local) │ │ (CI) │ │ (CI) │ │ (CD) │ │ (CD) │ +└─────────────┘ └──────────┘ └──────────┘ └──────────┘ └──────────┘ + lint + type unit tests integration E2E + perf canary + <10s math prop Testcontainers LocalStack <5 mins + tests <1m <4 mins <10 mins +``` + +### 7.2 Coverage Gates +| Component | Threshold | +|-----------|-----------| +| Anomaly Scorer (Math) | 100% | +| CloudTrail Normalizer | 95% | +| Governance Policy | 95% | +| Slack Signature Auth | 100% | +| Overall Pipeline | 85% | --- ## Section 8: Transparent Factory Tenet Testing -### 8.1 Atomic Flagging (Circuit Breaker) +### 8.1 Atomic Flagging ```typescript -it('auto-disables scoring rule if it generates >10 alerts/hour for single tenant', () => {}); +describe('Atomic Flagging', () => { + it('auto-disables scoring rule flag if alert volume exceeds 3x baseline in 1hr', () => {}); + it('buffers suppressed anomalies in SQS DLQ while flag is off', () => {}); + it('fails CI if any flag TTL exceeds 14 days', () => {}); + it('evaluates flags strictly locally (in-memory provider)', () => {}); +}); ``` -### 8.2 Configurable Autonomy (14-Day Auto-Promotion) +### 8.2 Elastic Schema ```typescript -it('keeps new tenant in strict mode (log-only) for first 14 days', () => {}); -it('auto-promotes to audit mode (auto-alert) on day 15 if false-positive rate < 10%', () => {}); +describe('Elastic Schema', () => { + it('rejects DynamoDB table definition modifications that alter key schemas', () => {}); + it('requires all DynamoDB item updates to use ADD/SET (additive only)', () => {}); + it('ignores unknown attributes (V2 fields) in V1 CostEvent decoders', () => {}); +}); +``` + +### 8.3 Cognitive Durability +```typescript +describe('Cognitive Durability', () => { + it('requires decision_log.json for any PR modifying Z-score thresholds or weights', () => 
{}); + it('enforces cyclomatic complexity < 10 for all AnomalyScorer math functions', () => {}); +}); +``` + +### 8.4 Semantic Observability +```typescript +describe('Semantic Observability', () => { + it('emits OTEL span for every Anomaly Scoring decision', () => {}); + it('includes attributes: cost.z_score, cost.anomaly_score, cost.baseline_days', () => {}); + it('includes cost.fast_path_triggered flag when baseline is bypassed', () => {}); + it('hashes AWS Account ID in spans to protect PII/tenant identity', () => {}); +}); +``` + +### 8.5 Configurable Autonomy +```typescript +describe('Configurable Autonomy', () => { + it('keeps new tenant in Strict Mode (log-only) for first 14 days', () => {}); + it('auto-promotes to Audit Mode on day 15 if false-positive rate < 10%', () => {}); + it('Panic Mode halts ALL Slack alerts in <1 second via Redis check', () => {}); + it('Panic Mode does NOT halt baseline recording (read-only tracking continues)', () => {}); +}); ``` --- ## Section 9: Test Data & Fixtures -``` -fixtures/ - cloudtrail/ - ec2-runinstances.json - rds-create-db.json - lambda-create-function.json - baselines/ - mature-steady-spend.json - volatile-dev-account.json - cold-start.json +### 9.1 Data Factories +```typescript +export const makeCloudTrailEvent = (overrides) => ({ + eventVersion: '1.08', + userIdentity: { type: 'AssumedRole', arn: 'arn:aws:sts::123:assumed-role/user' }, + eventTime: new Date().toISOString(), + eventSource: 'ec2.amazonaws.com', + eventName: 'RunInstances', + requestParameters: { instanceType: 'm5.large' }, + ...overrides +}); + +export const makeBaseline = (overrides) => ({ + meanHourlyCost: 1.25, + stdDev: 0.15, + eventCount: 45, + ageDays: 16, + observedActors: ['arn:aws:iam::123:role/ci'], + observedInstanceTypes: ['t3.medium', 'm5.large'], + ...overrides +}); ``` --- ## Section 10: TDD Implementation Order -1. **Phase 1:** Anomaly math + Unit tests (Strict TDD). -2. **Phase 2:** CloudTrail normalizer + Pricing tables. -3. 
**Phase 3:** DynamoDB single-table implementation (Integration led). -4. **Phase 4:** Slack formatting + Remediation Lambda. -5. **Phase 5:** Governance policies (14-day promotion logic). +1. **Phase 1: Math & Core Logic (Strict TDD)** + - Welford algorithm, Z-score math, Novelty scoring, `fast-check` property tests. +2. **Phase 2: Ingestion & Normalization** + - CloudTrail parsers, pricing static tables, event deduplication. +3. **Phase 3: Data Persistence (Integration Led)** + - DynamoDB Single-Table setup, TransactWriteItems, Testcontainers tests. +4. **Phase 4: Notifications & Slack Actions** + - Block Kit formatting, Slack signature validation, API Gateway endpoints. +5. **Phase 5: Governance & Tenets** + - 14-day promotion logic, Panic mode, OTEL tracing. +6. **Phase 6: E2E Pipeline** + - CDK definitions, LocalStack event injection, wire everything together. -*End of dd0c/cost Test Architecture* +*End of dd0c/cost Test Architecture (v2)* diff --git a/products/06-runbook-automation/test-architecture/test-architecture.md b/products/06-runbook-automation/test-architecture/test-architecture.md index b35ef45..bf176b6 100644 --- a/products/06-runbook-automation/test-architecture/test-architecture.md +++ b/products/06-runbook-automation/test-architecture/test-architecture.md @@ -1760,3 +1760,527 @@ Before writing the `impl ExecutionEngine { pub async fn execute(...) }` function 5. `engine_pauses_in_flight_execution_when_panic_mode_set` Only once these tests are defined can the state machine be implemented to make them pass (Green phase). This ensures no execution path can bypass the Trust Gradient. + +--- + +## 11. Review Remediation Addendum (Post-Gemini Review) + +The following sections address all gaps identified in the TDD review. These are net-new test specifications that must be integrated into the relevant sections above during implementation. 
+ +### 11.1 Missing Epic Coverage + +#### Epic 3.4: Divergence Analysis + +```rust +// pkg/executor/divergence/tests.rs + +#[test] fn divergence_detects_extra_command_not_in_runbook() {} +#[test] fn divergence_detects_modified_command_vs_prescribed() {} +#[test] fn divergence_detects_skipped_step_not_marked_as_skipped() {} +#[test] fn divergence_report_includes_diff_of_prescribed_vs_actual() {} +#[test] fn divergence_flags_env_var_changes_made_during_execution() {} +#[test] fn divergence_ignores_whitespace_differences_in_commands() {} +#[test] fn divergence_analysis_runs_automatically_after_execution_completes() {} +#[test] fn divergence_report_written_to_audit_trail() {} + +#[tokio::test] +async fn integration_divergence_analysis_detects_agent_side_extra_commands() { + // Agent executes an extra `whoami` not in the runbook + // Divergence analyzer must flag it +} +``` + +#### Epic 5.3: Compliance Export + +```rust +// pkg/audit/export/tests.rs + +#[tokio::test] async fn export_generates_valid_csv_for_date_range() {} +#[tokio::test] async fn export_generates_valid_pdf_with_execution_summary() {} +#[tokio::test] async fn export_uploads_to_s3_and_returns_presigned_url() {} +#[tokio::test] async fn export_presigned_url_expires_after_24_hours() {} +#[tokio::test] async fn export_scoped_to_tenant_via_rls() {} +#[tokio::test] async fn export_includes_hash_chain_verification_status() {} +#[tokio::test] async fn export_redacts_command_output_but_includes_hashes() {} +``` + +#### Epic 6.4: Classification Query API Rate Limiting + +```rust +// tests/integration/api_rate_limit_test.rs + +#[tokio::test] +async fn api_rate_limit_30_requests_per_minute_per_tenant() { + let stack = E2EStack::start().await; + for i in 0..30 { + let resp = stack.api().get("/v1/run/classifications").send().await; + assert_eq!(resp.status(), 200); + } + // 31st request must be rate-limited + let resp = stack.api().get("/v1/run/classifications").send().await; + assert_eq!(resp.status(), 429); +} + 
+#[tokio::test] +async fn api_rate_limit_resets_after_60_seconds() {} + +#[tokio::test] +async fn api_rate_limit_is_per_tenant_not_global() { + // Tenant A hitting limit must not affect Tenant B +} + +#[tokio::test] +async fn api_rate_limit_returns_retry_after_header() {} +``` + +#### Epic 7: Dashboard UI (Playwright) + +```typescript +// tests/e2e/ui/dashboard.spec.ts + +test('parse preview renders within 5 seconds of paste', async ({ page }) => { + await page.goto('/dashboard/runbooks/new'); + await page.fill('[data-testid="runbook-input"]', FIXTURE_RUNBOOK); + const preview = page.locator('[data-testid="parse-preview"]'); + await expect(preview).toBeVisible({ timeout: 5000 }); + await expect(preview.locator('.step-card')).toHaveCount(4); +}); + +test('trust level visualization shows correct colors per step', async ({ page }) => { + // 🟢 safe = green, 🟡 caution = yellow, 🔴 dangerous = red +}); + +test('MTTR dashboard loads and displays chart', async ({ page }) => { + await page.goto('/dashboard/analytics'); + await expect(page.locator('[data-testid="mttr-chart"]')).toBeVisible(); +}); + +test('execution timeline shows real-time step progress', async ({ page }) => {}); +test('approval modal requires typed confirmation for dangerous steps', async ({ page }) => {}); +test('panic mode banner appears when panic is active', async ({ page }) => {}); +``` + +#### Epic 9: Onboarding & PLG + +```rust +// pkg/onboarding/tests.rs + +#[test] fn free_tier_allows_5_runbooks() {} +#[test] fn free_tier_allows_50_executions_per_month() {} +#[test] fn free_tier_rejects_6th_runbook_with_upgrade_prompt() {} +#[test] fn free_tier_rejects_51st_execution_with_upgrade_prompt() {} +#[test] fn free_tier_counter_resets_monthly() {} + +#[test] fn agent_install_snippet_includes_correct_api_key() {} +#[test] fn agent_install_snippet_includes_correct_gateway_url() {} +#[test] fn agent_install_snippet_is_valid_bash() {} + +#[tokio::test] async fn 
stripe_checkout_creates_session_with_correct_pricing() {} +#[tokio::test] async fn stripe_webhook_checkout_completed_upgrades_tenant() {} +#[tokio::test] async fn stripe_webhook_subscription_deleted_downgrades_tenant() {} +#[tokio::test] async fn stripe_webhook_validates_signature() {} +``` + +### 11.2 Agent-Side Security Tests (Zero-Trust Environment) + +The Agent runs in customer VPCs — untrusted territory. These tests prove the Agent defends itself independently of the SaaS backend. + +```rust +// pkg/agent/security/tests.rs + +// Agent-side deterministic blocking (mirrors SaaS scanner) +#[test] fn agent_scanner_blocks_rm_rf_independently_of_saas() {} +#[test] fn agent_scanner_blocks_kubectl_delete_namespace_independently() {} +#[test] fn agent_scanner_blocks_drop_table_independently() {} +#[test] fn agent_scanner_rejects_command_even_if_saas_says_safe() { + // Simulates compromised SaaS sending a "safe" classification for rm -rf + let saas_classification = Classification { risk: RiskLevel::Safe, .. 
}; + let agent_result = agent_scanner.classify("rm -rf /"); + assert_eq!(agent_result.risk, RiskLevel::Dangerous); + // Agent MUST override SaaS classification +} + +// Binary integrity +#[test] fn agent_validates_binary_checksum_on_startup() {} +#[test] fn agent_refuses_to_start_if_checksum_mismatch() {} + +// Payload tampering +#[tokio::test] async fn agent_rejects_grpc_payload_with_invalid_hmac() {} +#[tokio::test] async fn agent_rejects_grpc_payload_with_expired_timestamp() {} +#[tokio::test] async fn agent_rejects_grpc_payload_with_mismatched_execution_id() {} + +// Local fallback when SaaS is unreachable +#[tokio::test] async fn agent_falls_back_to_scanner_only_when_saas_disconnected() {} +#[tokio::test] async fn agent_in_fallback_mode_treats_all_unknowns_as_caution() {} +#[tokio::test] async fn agent_reconnects_automatically_when_saas_returns() {} +``` + +### 11.3 Realistic Sandbox Matrix + +Replace Alpine-only sandbox with a matrix of realistic execution targets. + +```rust +// tests/integration/sandbox_matrix_test.rs + +#[rstest] +#[case("ubuntu:22.04")] +#[case("amazonlinux:2023")] +#[case("alpine:3.19")] +async fn sandbox_safe_command_executes_on_all_targets(#[case] image: &str) { + let sandbox = SandboxContainer::start(image).await; + let agent = TestAgent::connect_to(sandbox.socket_path()).await; + let result = agent.execute("ls /tmp").await.unwrap(); + assert_eq!(result.exit_code, 0); +} + +#[rstest] +#[case("ubuntu:22.04")] +#[case("amazonlinux:2023")] +async fn sandbox_dangerous_command_blocked_on_all_targets(#[case] image: &str) { + let sandbox = SandboxContainer::start(image).await; + let agent = TestAgent::connect_to(sandbox.socket_path()).await; + let result = agent.execute("rm -rf /").await; + assert!(result.is_err()); +} + +// Non-root execution +#[tokio::test] +async fn sandbox_agent_runs_as_non_root_user() { + let sandbox = SandboxContainer::start_as_user("ubuntu:22.04", "dd0c-agent").await; + let agent = 
TestAgent::connect_to(sandbox.socket_path()).await; + let result = agent.execute("whoami").await.unwrap(); + assert_eq!(result.stdout.trim(), "dd0c-agent"); +} + +#[tokio::test] +async fn sandbox_non_root_agent_cannot_escalate_to_root() { + let sandbox = SandboxContainer::start_as_user("ubuntu:22.04", "dd0c-agent").await; + let agent = TestAgent::connect_to(sandbox.socket_path()).await; + let result = agent.execute("sudo cat /etc/shadow").await; + assert!(result.is_err() || result.unwrap().exit_code != 0); +} + +// RBAC-restricted K3s +#[tokio::test] +async fn sandbox_k3s_rbac_denies_kubectl_delete_namespace() { + let k3s = K3sContainer::start_with_rbac("read-only-role").await; + let agent = TestAgent::with_kubeconfig(k3s.kubeconfig()).await; + let result = agent.execute("kubectl delete namespace default").await; + // Should be blocked by BOTH scanner AND K8s RBAC + assert!(result.is_err()); +} +``` + +### 11.4 Advanced Command Injection Tests + +```rust +// pkg/classifier/scanner/injection_tests.rs + +// Semicolon injection +#[test] fn scanner_semicolon_rm_rf_is_dangerous() { + assert_dangerous("echo hello; rm -rf /"); +} + +// Pipe injection +#[test] fn scanner_pipe_to_rm_is_dangerous() { + assert_dangerous("find / -name '*.log' | xargs rm -rf"); +} + +// Backtick injection +#[test] fn scanner_backtick_rm_is_dangerous() { + assert_dangerous("echo `rm -rf /`"); +} + +// $() substitution (already tested, but more variants) +#[test] fn scanner_nested_substitution_is_dangerous() { + assert_dangerous("echo $(echo $(rm -rf /))"); +} + +// Newline injection +#[test] fn scanner_newline_injection_is_dangerous() { + assert_dangerous("echo safe\nrm -rf /"); +} + +// Null byte injection +#[test] fn scanner_null_byte_injection_is_dangerous() { + assert_dangerous("echo safe\0rm -rf /"); +} + +// Unicode homoglyph attack +#[test] fn scanner_unicode_homoglyph_rm_is_dangerous() { + // Using Cyrillic 'р' and 'м' that look like 'r' and 'm' + assert_dangerous("rм -rf /"); // Should 
still catch this +} + +// Base64 encoded payload +#[test] fn scanner_base64_decode_pipe_bash_is_dangerous() { + assert_dangerous("echo cm0gLXJmIC8= | base64 -d | bash"); +} + +// Heredoc injection +#[test] fn scanner_heredoc_with_destructive_is_dangerous() { + assert_dangerous("cat << EOF | bash\nrm -rf /\nEOF"); +} + +// Environment variable expansion +#[test] fn scanner_env_var_expansion_to_rm_is_dangerous() { + assert_dangerous("$CMD"); // Unknown variable expansion = unknown, not safe +} +``` + +### 11.5 Privilege Escalation Tests + +```rust +// pkg/classifier/scanner/escalation_tests.rs + +#[test] fn scanner_sudo_anything_is_at_least_caution() { + assert_at_least_caution("sudo systemctl restart nginx"); +} + +#[test] fn scanner_sudo_rm_is_dangerous() { + assert_dangerous("sudo rm -rf /var/log"); +} + +#[test] fn scanner_su_root_is_dangerous() { + assert_dangerous("su - root -c 'rm -rf /'"); +} + +#[test] fn scanner_chmod_suid_is_dangerous() { + assert_dangerous("chmod u+s /usr/bin/find"); +} + +#[test] fn scanner_chown_root_is_caution() { + assert_at_least_caution("chown root:root /tmp/exploit"); +} + +#[test] fn scanner_nsenter_is_dangerous() { + assert_dangerous("nsenter --target 1 --mount --uts --ipc --net --pid"); +} + +#[test] fn scanner_docker_run_privileged_is_dangerous() { + assert_dangerous("docker run --privileged -v /:/host ubuntu"); +} + +#[test] fn scanner_kubectl_exec_as_root_is_caution() { + assert_at_least_caution("kubectl exec -it pod -- /bin/bash"); +} +``` + +### 11.6 Rollback Failure & Nested Failure Tests + +```rust +// pkg/executor/rollback/tests.rs + +#[test] fn rollback_failure_transitions_to_manual_intervention() { + let mut engine = ExecutionEngine::new(); + engine.transition(State::RollingBack); + engine.report_rollback_failure("rollback command timed out"); + assert_eq!(engine.state(), State::ManualIntervention); +} + +#[test] fn rollback_failure_does_not_retry_automatically() { + // Rollback failures are terminal — no auto-retry +} 
+
+#[test] fn rollback_timeout_kills_rollback_process_after_300s() {}
+
+#[tokio::test(start_paused = true)]
+async fn rollback_hanging_indefinitely_triggers_manual_intervention_after_timeout() {
+    let mut engine = ExecutionEngine::with_rollback_timeout(Duration::from_secs(5));
+    engine.transition(State::RollingBack);
+    // Simulate rollback that never completes
+    tokio::time::advance(Duration::from_secs(6)).await;
+    assert_eq!(engine.state(), State::ManualIntervention);
+}
+
+#[test] fn manual_intervention_state_sends_slack_alert_to_oncall() {}
+#[test] fn manual_intervention_state_logs_full_context_to_audit() {}
+```
+
+### 11.7 Double Execution & Network Partition Tests
+
+```rust
+// pkg/executor/idempotency/tests.rs
+
+#[tokio::test]
+async fn agent_reconnect_after_partition_resyncs_already_executed_step() {
+    let stack = E2EStack::start().await;
+    let execution = stack.start_execution().await;
+
+    // Agent executes step successfully
+    stack.wait_for_step_state(&execution.id, &step_id, "executing").await;
+
+    // Network partition AFTER execution but BEFORE ACK
+    stack.partition_agent().await;
+
+    // Agent reconnects
+    stack.heal_partition().await;
+
+    // Engine must recognize step was already executed — no double execution
+    let step = stack.get_step(&execution.id, &step_id).await;
+    assert_eq!(step.execution_count, 1); // Exactly once
+}
+
+#[tokio::test]
+async fn engine_does_not_re_send_command_after_agent_reconnect_if_step_completed() {}
+
+#[tokio::test]
+async fn engine_re_sends_command_if_agent_never_started_execution_before_partition() {}
+```
+
+### 11.8 Slack Payload Forgery Tests
+
+```rust
+// tests/integration/slack_security_test.rs
+
+#[tokio::test]
+async fn slack_approval_webhook_rejects_missing_signature() {
+    let resp = stack.api()
+        .post("/v1/run/slack/actions")
+        .json(&fixture_approval_payload())
+        // No X-Slack-Signature header
+        .send().await;
+    assert_eq!(resp.status(), 401);
+}
+
+#[tokio::test]
+async fn slack_approval_webhook_rejects_invalid_signature() {
+    let resp = 
stack.api() + .post("/v1/run/slack/actions") + .header("X-Slack-Signature", "v0=invalid_hmac") + .header("X-Slack-Request-Timestamp", &now_timestamp()) + .json(&fixture_approval_payload()) + .send().await; + assert_eq!(resp.status(), 401); +} + +#[tokio::test] +async fn slack_approval_webhook_rejects_replayed_timestamp() { + // Timestamp older than 5 minutes + let resp = stack.api() + .post("/v1/run/slack/actions") + .header("X-Slack-Signature", &valid_signature_for_old_timestamp()) + .header("X-Slack-Request-Timestamp", &five_minutes_ago()) + .json(&fixture_approval_payload()) + .send().await; + assert_eq!(resp.status(), 401); +} + +#[tokio::test] +async fn slack_approval_webhook_rejects_cross_tenant_approval() { + // Tenant A's user trying to approve Tenant B's execution +} +``` + +### 11.9 Audit Log Encryption Tests + +```rust +// tests/integration/audit_encryption_test.rs + +#[tokio::test] +async fn audit_log_command_field_is_encrypted_at_rest() { + let db = TestDb::start().await; + // Insert an audit event with a command + insert_audit_event(&db, "kubectl get pods").await; + + // Read raw bytes from PostgreSQL — must NOT contain plaintext command + let raw = db.query_raw_bytes("SELECT command FROM audit_events LIMIT 1").await; + assert!(!String::from_utf8_lossy(&raw).contains("kubectl get pods"), + "Command stored in plaintext — must be encrypted"); +} + +#[tokio::test] +async fn audit_log_output_field_is_encrypted_at_rest() { + let db = TestDb::start().await; + insert_audit_event_with_output(&db, "sensitive output data").await; + + let raw = db.query_raw_bytes("SELECT output FROM audit_events LIMIT 1").await; + assert!(!String::from_utf8_lossy(&raw).contains("sensitive output data")); +} + +#[tokio::test] +async fn audit_log_decryption_requires_kms_key() { + // Verify the app role can decrypt using the KMS key + let db = TestDb::start().await; + insert_audit_event(&db, "kubectl get pods").await; + + let decrypted = db.as_app_role() + .query("SELECT 
decrypt_command(command) FROM audit_events LIMIT 1").await; + assert_eq!(decrypted, "kubectl get pods"); +} +``` + +### 11.10 gRPC Output Buffer Limits + +```rust +// pkg/agent/streaming/tests.rs + +#[tokio::test] +async fn agent_truncates_stdout_at_10mb() { + let sandbox = SandboxContainer::start("ubuntu:22.04").await; + let agent = TestAgent::connect_to(sandbox.socket_path()).await; + + // Generate 50MB of output + let result = agent.execute("dd if=/dev/urandom bs=1M count=50 | base64").await.unwrap(); + + // Agent must truncate, not OOM + assert!(result.stdout.len() <= 10 * 1024 * 1024); + assert!(result.truncated); +} + +#[tokio::test] +async fn agent_streams_output_in_chunks_not_buffered() { + // Verify output arrives incrementally, not all at once after completion +} + +#[tokio::test] +async fn agent_memory_stays_under_256mb_during_large_output() { + // Memory profiling test — agent must not OOM on `cat /dev/urandom` +} + +#[tokio::test] +async fn engine_handles_truncated_output_gracefully() { + // Engine receives truncated flag and logs warning +} +``` + +### 11.11 Parse SLA End-to-End Benchmark + +```rust +// benches/parse_sla_bench.rs + +#[tokio::test] +async fn parse_plus_classify_pipeline_under_5s_p95() { + let stack = E2EStack::start().await; + let mut latencies = vec![]; + + for _ in 0..100 { + let start = Instant::now(); + stack.api() + .post("/v1/run/runbooks/parse-preview") + .json(&json!({ "raw_text": FIXTURE_RUNBOOK_10_STEPS })) + .send().await; + latencies.push(start.elapsed()); + } + + let p95 = percentile(&latencies, 95); + assert!(p95 < Duration::from_secs(5), + "Parse+Classify p95 latency: {:?} — exceeds 5s SLA", p95); +} +``` + +### 11.12 Updated Test Pyramid (Post-Review) + +The Execution Engine ratio shifts from 80/15/5 to 60/30/10 per review recommendation: + +| Component | Unit | Integration | E2E | +|-----------|------|-------------|-----| +| Safety Scanner | 80% | 15% | 5% | +| Merge Engine | 90% | 10% | 0% | +| Execution Engine | 
**60%** | **30%** | **10%** |
+| Parser | 50% | 40% | 10% |
+| Approval Workflow | 70% | 20% | 10% |
+| Audit Trail | 60% | 35% | 5% |
+| Agent | 50% | 35% | 15% |
+| Dashboard API | 40% | 50% | 10% |
+
+*End of Review Remediation Addendum*
diff --git a/products/plg-instrumentation-brainstorm.md b/products/plg-instrumentation-brainstorm.md
new file mode 100644
index 0000000..b5fa02b
--- /dev/null
+++ b/products/plg-instrumentation-brainstorm.md
@@ -0,0 +1,226 @@
+# dd0c Platform — PLG Instrumentation Brainstorm
+
+**Session:** Carson (Brainstorming Coach) — Cross-Product PLG Analytics
+**Date:** March 1, 2026
+**Scope:** All 6 dd0c products
+
+---
+
+## The Problem
+
+We built 6 products with onboarding flows, free tiers, and Stripe billing — but zero product analytics. We can't answer:
+
+- How many users hit "aha moment" vs. bounce?
+- Where in the funnel do free users drop off before upgrading?
+- Which features drive retention vs. which are ignored?
+- Are users churning because of alert fatigue, false positives, or just not getting value?
+- What's our time-to-first-value per product?
+
+Without instrumentation, PLG iteration is guesswork.
+
+---
+
+## Brainstorm: What to Instrument
+
+### 1. Unified Event Taxonomy
+
+Every dd0c product shares a common event naming convention:
+
+```
+<domain>.<object>.<action>
+
+Examples:
+  account.signup.completed
+  account.aws.connected
+  anomaly.alert.sent
+  anomaly.alert.snoozed
+  slack.bot.installed
+  billing.checkout.started
+  billing.upgrade.completed
+  feature.flag.evaluated
+```
+
+**Rules:**
+- Past tense for completed actions (`completed`, `sent`, `clicked`)
+- Present tense for state changes (`active`, `learning`, `paused`)
+- Always include `tenant_id`, `timestamp`, `product` (route/drift/alert/portal/cost/run)
+- Never include PII — hash emails, account IDs
+
+### 2. 
Per-Product Activation Metrics
+
+The "aha moment" is different for each product:
+
+| Product | Aha Moment | Metric | Target |
+|---------|-----------|--------|--------|
+| dd0c/route | First dollar saved by model routing | `routing.savings.first_dollar` | <24hr from signup |
+| dd0c/drift | First drift detected in real stack | `drift.detection.first_found` | <1hr from agent install |
+| dd0c/alert | First alert correlated (not just forwarded) | `alert.correlation.first_match` | <60sec from first alert |
+| dd0c/portal | First service auto-discovered | `portal.discovery.first_service` | <5min from install |
+| dd0c/cost | First anomaly detected in real account | `cost.anomaly.first_detected` | <24hr from AWS connect |
+| dd0c/run | First runbook executed successfully | `run.execution.first_success` | <10min from setup |
+
+### 3. Conversion Funnel (Universal)
+
+Every product shares this funnel shape:
+
+```
+Signup → Connect (AWS/Slack/Git) → First Value → Habit → Upgrade
+```
+
+Events per stage:
+
+**Stage 1: Signup**
+- `account.signup.started` — landed on signup page
+- `account.signup.completed` — account created
+- `account.signup.method` — github_sso / google_sso / email
+
+**Stage 2: Connect**
+- `account.integration.started` — began connecting external service
+- `account.integration.completed` — connection verified
+- `account.integration.failed` — connection failed (include `error_type`)
+- Product-specific: `account.aws.connected`, `account.slack.installed`, `account.git.connected`
+
+**Stage 3: First Value**
+- Product-specific aha moment event (see table above)
+- `onboarding.wizard.step_completed` — which step, how long
+- `onboarding.wizard.abandoned` — which step they quit on
+
+**Stage 4: Habit**
+- `session.daily.active` — DAU ping
+- `session.weekly.active` — WAU ping
+- `feature.<name>.used` — per-feature usage
+- `notification.digest.opened` — are they reading digests? 
+- `slack.command.used` — which slash commands, how often + +**Stage 5: Upgrade** +- `billing.checkout.started` +- `billing.checkout.completed` +- `billing.checkout.abandoned` +- `billing.plan.changed` — upgrade/downgrade +- `billing.churn.detected` — subscription cancelled + +### 4. Feature Usage Events (Per Product) + +**dd0c/route (LLM Cost Router)** +- `routing.request.processed` — model selected, latency, cost +- `routing.override.manual` — user forced a specific model +- `routing.savings.calculated` — weekly savings digest generated +- `routing.shadow.audit.run` — shadow mode comparison completed +- `dashboard.cost.viewed` — opened cost dashboard + +**dd0c/drift (IaC Drift Detection)** +- `drift.scan.completed` — scan finished, drifts found count +- `drift.remediation.clicked` — user clicked "fix drift" +- `drift.remediation.applied` — drift actually fixed +- `drift.false_positive.marked` — user dismissed a drift +- `drift.agent.heartbeat` — agent is alive and scanning + +**dd0c/alert (Alert Intelligence)** +- `alert.ingested` — raw alert received +- `alert.correlated` — alerts grouped into incident +- `alert.suppressed` — duplicate/noise suppressed +- `alert.escalated` — sent to on-call +- `alert.feedback.helpful` / `alert.feedback.noise` — user feedback +- `alert.mttr.measured` — time from alert to resolution + +**dd0c/portal (Lightweight IDP)** +- `portal.service.discovered` — auto-discovery found a service +- `portal.service.claimed` — team claimed ownership +- `portal.scorecard.viewed` — someone checked service health +- `portal.scorecard.action_taken` — acted on a recommendation +- `portal.search.performed` — searched the catalog + +**dd0c/cost (AWS Cost Anomaly)** +- `cost.event.ingested` — CloudTrail event processed +- `cost.anomaly.scored` — anomaly scoring completed +- `cost.anomaly.alerted` — Slack alert sent +- `cost.anomaly.snoozed` — user snoozed alert +- `cost.anomaly.expected` — user marked as expected +- `cost.remediation.clicked` — user 
clicked Stop/Terminate +- `cost.remediation.executed` — remediation completed +- `cost.zombie.detected` — idle resource found +- `cost.digest.sent` — daily digest delivered + +**dd0c/run (Runbook Automation)** +- `run.runbook.created` — new runbook authored +- `run.execution.started` — runbook execution began +- `run.execution.completed` — execution finished (include `success`/`failed`) +- `run.execution.approval_requested` — human approval needed +- `run.execution.approval_granted` — human approved +- `run.execution.rolled_back` — rollback triggered +- `run.sandbox.test.run` — dry-run in sandbox + +### 5. Health Scoring (Churn Prediction) + +Composite health score per tenant, updated daily: + +``` +health_score = ( + 0.3 * activation_complete + // did they hit aha moment? + 0.2 * weekly_active_days + // how many days active this week? + 0.2 * feature_breadth + // how many features used? + 0.15 * integration_depth + // how many integrations connected? + 0.15 * feedback_sentiment // positive vs negative actions +) +``` + +Thresholds: +- `health > 0.7` → Healthy (green) +- `health 0.4-0.7` → At Risk (yellow) → trigger re-engagement email +- `health < 0.4` → Churning (red) → trigger founder outreach + +### 6. 
Analytics Stack Recommendation + +**PostHog** (self-hosted on AWS): +- Open source, self-hostable → no vendor lock-in +- Free tier: unlimited events self-hosted +- Built-in: funnels, retention, feature flags, session replay +- Supports custom events via REST API or JS/Python SDK +- Can run on a single t3.medium for V1 traffic + +**Why not Segment/Amplitude/Mixpanel:** +- Segment: $120/mo minimum, overkill for solo founder +- Amplitude: free tier is generous but cloud-only, data leaves your infra +- Mixpanel: same cloud-only concern +- PostHog self-hosted: $0/mo, data stays in your AWS account, GDPR-friendly + +**Integration pattern:** +``` +Lambda/API → PostHog REST API (async, fire-and-forget) +Next.js UI → PostHog JS SDK (auto-captures pageviews, clicks) +Slack Bot → PostHog Python SDK (command usage, action clicks) +``` + +### 7. Cross-Product Flywheel Metrics + +dd0c is a platform — users on one product should discover others: + +- `platform.cross_sell.impression` — "Try dd0c/alert" banner shown +- `platform.cross_sell.clicked` — user clicked cross-sell +- `platform.cross_sell.activated` — user activated second product +- `platform.products.active_count` — how many dd0c products per tenant + +**Flywheel hypothesis:** Users who activate 2+ dd0c products have 3x lower churn than single-product users. We need data to prove/disprove this. + +--- + +## Epic 11 Proposal: PLG Instrumentation + +### Scope +Cross-cutting epic added to all 6 products. Shared analytics SDK, per-product event implementations, funnel dashboards, health scoring. + +### Stories (Draft) +1. **PostHog Infrastructure** — CDK stack for self-hosted PostHog on ECS Fargate +2. **Analytics SDK** — Shared TypeScript/Python wrapper with standard event schema +3. **Funnel Dashboard** — PostHog dashboard template per product +4. **Activation Tracking** — Per-product aha moment detection and logging +5. **Health Scoring Engine** — Daily cron that computes tenant health scores +6. 
**Cross-Sell Instrumentation** — Platform-level cross-product discovery events +7. **Churn Alert Pipeline** — Health score → Slack alert to founder when tenant goes red + +### Estimate +~25 story points across all products (shared infrastructure + per-product event wiring) + +--- + +*This brainstorm establishes the "what" and "why." Party Mode advisory board should stress-test: Is PostHog the right choice? Is the event taxonomy too granular? Should health scoring be V1 or V2? Is 25 points realistic?*