Implement review remediation + PLG analytics SDK

- All 6 test architectures patched with Section 11 addendums - P5 (cost) fully rewritten from 232 to ~600 lines - PLG brainstorm + party mode advisory board results - Analytics SDK v2 (PostHog Cloud, Zod strict, Lambda-safe) - Analytics tests v2 (safeParse, no , no timestamp, no PII) - Addresses all Gemini review findings across P1-P6
2026-03-01 01:42:49 +00:00
parent 2fe0ed856e
commit 03bfe931fc
9 changed files with 2950 additions and 85 deletions
--- a/products/02-iac-drift-detection/test-architecture/test-architecture.md
+++ b/products/02-iac-drift-detection/test-architecture/test-architecture.md
@@ -1727,3 +1727,370 @@ Before any code ships to production, these tests must be green:
 ---

 *Document complete. Total estimated test count at V1 launch: ~500 tests. Target by month 3: ~1,000 tests.*
+
+---
+
+## 11. Review Remediation Addendum (Post-Gemini Review)
+
+### 11.1 Missing Epic Coverage
+
+#### Epic 6: Dashboard UI (React Testing Library + Playwright)
+
+```typescript
+// tests/ui/components/DiffViewer.test.tsx
+describe('DiffViewer Component', () => {
+  it('renders added lines in green', () => {});
+  it('renders removed lines in red', () => {});
+  it('renders unchanged lines in default color', () => {});
+  it('collapses large diffs with "Show more" toggle', () => {});
+  it('highlights HCL syntax in diff blocks', () => {});
+  it('shows resource type icon next to each drift item', () => {});
+});
+
+describe('StackOverview Component', () => {
+  it('renders drift count badge per stack', () => {});
+  it('sorts stacks by drift severity (critical first)', () => {});
+  it('shows last scan timestamp', () => {});
+  it('shows agent health indicator (green/yellow/red)', () => {});
+});
+
+// tests/e2e/ui/dashboard.spec.ts (Playwright)
+test('OAuth login redirects to Cognito and back', async ({ page }) => {
+  await page.goto('/dashboard');
+  await expect(page).toHaveURL(/cognito/);
+});
+
+test('stack list renders with drift counts', async ({ page }) => {
+  await page.goto('/dashboard/stacks');
+  await expect(page.locator('[data-testid="stack-card"]').first()).toBeVisible();
+});
+
+test('diff viewer renders inline diff for Terraform resource', async ({ page }) => {
+  await page.goto('/dashboard/stacks/stack-1/drifts/drift-1');
+  await expect(page.locator('[data-testid="diff-viewer"]')).toBeVisible();
+  await expect(page.locator('.diff-added').first()).toBeVisible();
+});
+
+test('revert button triggers confirmation modal', async ({ page }) => {
+  await page.goto('/dashboard/stacks/stack-1/drifts/drift-1');
+  await page.click('[data-testid="revert-btn"]');
+  await expect(page.locator('[data-testid="confirm-modal"]')).toBeVisible();
+});
+```
+
+#### Epic 9: Onboarding & PLG (Stripe + drift init)
+
+```go
+// pkg/onboarding/stripe_test.go
+
+func TestStripeWebhookCheckoutCompleted_UpgradesTenant(t *testing.T) {}
+func TestStripeWebhookSubscriptionDeleted_DowngradesTenant(t *testing.T) {}
+func TestStripeWebhookInvalidSignature_Returns401(t *testing.T) {}
+func TestStripeWebhookReplayedEvent_IsIdempotent(t *testing.T) {}
+
+// pkg/agent/init_test.go
+
+func TestDriftInit_DetectsTerraformInCurrentDir(t *testing.T) {}
+func TestDriftInit_DetectsCloudFormationInCurrentDir(t *testing.T) {}
+func TestDriftInit_DetectsPulumiInCurrentDir(t *testing.T) {}
+func TestDriftInit_GeneratesValidYAMLConfig(t *testing.T) {}
+func TestDriftInit_HandlesWindowsPaths(t *testing.T) {}
+func TestDriftInit_HandlesMacPaths(t *testing.T) {}
+func TestDriftInit_HandlesLinuxPaths(t *testing.T) {}
+func TestDriftInit_FailsGracefullyOnEmptyDir(t *testing.T) {}
+```
+
+#### Epic 8: Infrastructure (Terratest)
+
+```go
+// tests/infra/terraform_test.go
+
+func TestTerraformPlan_CreatesExpectedResources(t *testing.T) {
+    terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
+        TerraformDir: "../../infra/terraform",
+    })
+    defer terraform.Destroy(t, terraformOptions)
+    terraform.InitAndPlan(t, terraformOptions)
+}
+
+func TestTerraformApply_SQSFIFOQueueCreated(t *testing.T) {}
+func TestTerraformApply_RDSInstanceCreated(t *testing.T) {}
+func TestTerraformApply_IAMRolesHaveLeastPrivilege(t *testing.T) {
+    // Verify no IAM policy has Action: "*"
+}
+func TestTerraformApply_VPCSecurityGroupsRestrictIngress(t *testing.T) {}
+```
+
+#### Epic 2: mTLS Certificate Lifecycle
+
+```go
+// pkg/agent/mtls_test.go
+
+func TestMTLS_CertificateGeneration_ValidX509(t *testing.T) {}
+func TestMTLS_CertificateExpiration_AgentRejectsExpiredCert(t *testing.T) {}
+func TestMTLS_CertificateRotation_NewCertAcceptedMidConnection(t *testing.T) {}
+func TestMTLS_CertificateRevocation_RevokedCertRejected(t *testing.T) {}
+func TestMTLS_SelfSignedCert_RejectedBySaaS(t *testing.T) {}
+func TestMTLS_CertificateChain_IntermediateCAValidated(t *testing.T) {}
+```
+
+### 11.2 Add t.Parallel() to Table-Driven Tests
+
+```go
+// BEFORE (sequential — wastes CI time):
+func TestSecretScrubber(t *testing.T) {
+    tests := []struct{ name, input, expected string }{...}
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            // runs sequentially
+        })
+    }
+}
+
+// AFTER (parallel):
+func TestSecretScrubber(t *testing.T) {
+    t.Parallel()
+    tests := []struct{ name, input, expected string }{...}
+    for _, tt := range tests {
+        tt := tt // capture range variable
+        t.Run(tt.name, func(t *testing.T) {
+            t.Parallel()
+            // runs in parallel
+        })
+    }
+}
+```
+
+### 11.3 Dynamic Resource Naming for LocalStack
+
+```go
+// BEFORE (shared state — flaky):
+// bucket := "drift-reports"
+
+// AFTER (per-test isolation):
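+func uniqueBucket(t *testing.T) string {
+    // S3 bucket names must be 3-63 chars of lowercase letters, digits, and hyphens,
+    // so normalize t.Name() (mixed case; subtests add "/" and "_") and cap its length.
+    name := strings.NewReplacer("/", "-", "_", "-").Replace(strings.ToLower(t.Name()))
+    return fmt.Sprintf("drift-reports-%.28s-%d", name, time.Now().UnixNano())
+}
+
+func TestDriftReportUpload(t *testing.T) {
+    t.Parallel()
+    bucket := uniqueBucket(t)
+    if _, err := s3Client.CreateBucket(ctx, &s3.CreateBucketInput{Bucket: &bucket}); err != nil {
+        t.Fatalf("create bucket %q: %v", bucket, err)
+    }
+    // Test uses isolated bucket — no cross-test contamination
+}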
+```
+
+### 11.4 Distributed Tracing Cross-Boundary Tests
+
+```go
+// tests/integration/trace_propagation_test.go
+
+func TestTraceContext_AgentToSaaS_SpanParentChain(t *testing.T) {
+    // Agent generates drift_scan span with trace_id
+    // POST /v1/drift-reports carries traceparent header
+    // SaaS Event Processor creates child span
+    // Verify parent-child relationship across HTTP boundary
+    
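+    exporter := tracetest.NewInMemoryExporter()
+    // Register the exporter (assumes the Event Processor runs in-process);
+    // without a TracerProvider wired to it, GetSpans() is always empty.
+    tp := sdktrace.NewTracerProvider(sdktrace.WithSyncer(exporter))
+    otel.SetTracerProvider(tp)
+
+    // Fire drift report with a W3C traceparent: version-traceid-parentid-flags
+    traceID := "4bf92f3577b34da6a3ce929d0e0e4736"
+    traceparent := "00-" + traceID + "-00f067aa0ba902b7-01"
+    report := makeDriftReport("stack-1")
+    resp := postDriftReportWithTraceparent(t, report, traceparent)
+    assert.Equal(t, 200, resp.StatusCode)
+
+    spans := exporter.GetSpans()
+    eventProcessorSpan := findSpan(spans, "drift_report.process")
+    assert.Equal(t, traceID, eventProcessorSpan.SpanContext().TraceID().String())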
+}
+
+func TestTraceContext_SQSBoundary_PreservesTraceID(t *testing.T) {
+    // Verify SQS message attributes contain traceparent
+    // Verify consumer extracts and continues the trace
+}
+
+func TestTraceContext_AgentScan_CreatesParentSpan(t *testing.T) {
+    // Verify agent drift_scan span has correct attributes:
+    // drift.stack_id, drift.resource_count, drift.duration_ms
+}
+```
+
+### 11.5 Backward Compatibility Serialization (Elastic Schema)
+
+```go
+// tests/schema/backward_compat_test.go
+
+func TestOldAgent_ParsesNewDynamoDBItem_WithV2Attributes(t *testing.T) {
+    // Simulate V2 DynamoDB item with new _v2 fields
+    item := map[string]types.AttributeValue{
+        "PK":              &types.AttributeValueMemberS{Value: "STACK#123"},
+        "drift_score":     &types.AttributeValueMemberN{Value: "85"},
+        "drift_score_v2":  &types.AttributeValueMemberN{Value: "92"}, // New field
+        "remediation_v2":  &types.AttributeValueMemberS{Value: "auto"}, // New field
+    }
+    
+    // V1 parser must ignore unknown fields
+    result, err := ParseDriftItem(item)
+    assert.NoError(t, err)
+    assert.Equal(t, 85, result.DriftScore) // Uses V1 field
+}
+
+func TestV1Code_ReadsV2Writes_DuringMigrationWindow(t *testing.T) {
+    // V2 writes both drift_score and drift_score_v2
+    // V1 reads drift_score (ignores _v2)
+    // Verify no data loss
+}
+```
+
+### 11.6 Security: RBAC Forgery & Replay Attacks
+
+```go
+// tests/integration/security_test.go
+
+func TestAgentCannotForgeStackID(t *testing.T) {
+    // Agent with API key for org-A sends drift report claiming stack belongs to org-B
+    orgAKey := createAPIKey(t, "org-a")
+    report := makeDriftReport("org-b-stack-id") // Wrong org
+    
+    resp := postDriftReportWithKey(t, report, orgAKey)
+    assert.Equal(t, 403, resp.StatusCode)
+}
+
+func TestReplayAttack_DuplicateReportID_Rejected(t *testing.T) {
+    report := makeDriftReport("stack-1")
+    resp1 := postDriftReport(t, report)
+    assert.Equal(t, 200, resp1.StatusCode)
+    
+    // Replay exact same report
+    resp2 := postDriftReport(t, report)
+    assert.Equal(t, 409, resp2.StatusCode) // Conflict — already processed
+}
+
+func TestReplayAttack_OldTimestamp_Rejected(t *testing.T) {
+    report := makeDriftReport("stack-1")
+    report.Timestamp = time.Now().Add(-10 * time.Minute) // 10 min old
+    
+    resp := postDriftReport(t, report)
+    assert.Equal(t, 400, resp.StatusCode) // Stale report
+}
+```
+
+### 11.7 Noisy Neighbor & Fair-Share Processing
+
+```go
+// tests/integration/fair_share_test.go
+
+func TestNoisyNeighbor_LargeOrgDoesNotStarveSmallOrg(t *testing.T) {
+    // Org A: 10,000 drifted resources
+    // Org B: 10 drifted resources
+    // Both submit reports simultaneously
+    
+    seedDriftReports(t, "org-a", 10000)
+    seedDriftReports(t, "org-b", 10)
+    
+    // Org B's reports must be processed within 30 seconds
+    // (not queued behind all 10K of Org A's)
+    start := time.Now()
+    waitForProcessed(t, "org-b", 10, 30*time.Second)
+    assert.Less(t, time.Since(start), 30*time.Second)
+}
+```
+
+### 11.8 Panic Mode Mid-Remediation Race Condition
+
+```go
+// tests/integration/panic_remediation_test.go
+
+func TestPanicMode_AbortsInFlightRemediation(t *testing.T) {
+    // Start a remediation (terraform apply)
+    execID := startRemediation(t, "stack-1", "drift-1")
+    waitForState(t, execID, "applying")
+    
+    // Trigger panic mode
+    triggerPanicMode(t)
+    
+    // Remediation must be aborted, not completed
+    state := waitForState(t, execID, "aborted")
+    assert.Equal(t, "aborted", state)
+    
+    // Verify terraform state is not corrupted
+    // (agent should have run terraform state pull to verify)
+}
+
+func TestPanicMode_DoesNotAbortReadOnlyScans(t *testing.T) {
+    // Drift scans (read-only) should continue during panic
+    // Only write operations (remediation) are halted
+    scanID := startDriftScan(t, "stack-1")
+    triggerPanicMode(t)
+    
+    state := waitForState(t, scanID, "completed")
+    assert.Equal(t, "completed", state) // Scan finishes normally
+}
+```
+
+### 11.9 Remediation vs. Concurrent Scan Race Condition
+
+```go
+func TestConcurrentScanDuringRemediation_DoesNotReportHalfAppliedState(t *testing.T) {
+    // Start remediation (terraform apply — takes ~30s)
+    execID := startRemediation(t, "stack-1", "drift-1")
+    waitForState(t, execID, "applying")
+    
+    // Trigger a drift scan while remediation is in progress
+    scanID := startDriftScan(t, "stack-1")
+    
+    // Scan must either:
+    // a) Wait for remediation to complete, OR
+    // b) Skip the stack with "remediation in progress" status
+    scanResult := waitForScanComplete(t, scanID)
+    assert.NotEqual(t, "half-applied", scanResult.Status)
+    // Must be either "skipped_remediation_in_progress" or show post-remediation state
+}
+```
+
+### 11.10 SaaS API Memory Profiling
+
+```go
+// tests/load/memory_profile_test.go
+
+func TestEventProcessor_DoesNotOOM_On1MB_DriftReport(t *testing.T) {
+    // Generate a 1MB drift report (1000 resources with large diffs)
+    report := makeLargeDriftReport(1000)
+    assert.Greater(t, len(report), 1024*1024)
+    
+    var memBefore, memAfter runtime.MemStats
+    runtime.ReadMemStats(&memBefore)
+    
+    processReport(t, report)
+    
+    runtime.ReadMemStats(&memAfter)
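+    // Signed delta: Alloc can shrink if the GC runs mid-test, and a uint64
+    // subtraction would wrap around to a huge value and fail spuriously.
+    growth := int64(memAfter.Alloc) - int64(memBefore.Alloc)
+    assert.Less(t, growth, int64(50*1024*1024)) // <50MB growth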
+}
+```
+
+### 11.11 Trim E2E to Smoke Tier
+
+Per review recommendation, cap E2E at 10 critical paths. Remaining 40 tests pushed to integration:
+
+| E2E (Keep — 10 max) | Demoted to Integration |
+|---------------------|----------------------|
+| Onboarding: init → connect → first scan | Agent heartbeat variations |
+| First drift detected → Slack alert | Individual parser format tests |
+| Revert flow: Slack → agent apply → verify | Secret scrubber edge cases |
+| Panic mode halts remediation | DynamoDB access pattern tests |
+| Cross-tenant isolation | Individual webhook format tests |
+| OAuth login → dashboard → view diff | Notification batching |
+| Free tier limit enforcement | Agent config reload |
+| Agent disconnect → reconnect → resume | Baseline score calculations |
+| mTLS cert rotation mid-scan | Individual API endpoint tests |
+| Stripe upgrade → unlock features | Cache invalidation patterns |
+
+### 11.12 Updated Test Pyramid (Post-Review)
+
+| Level | Original | Revised | Rationale |
+|-------|----------|---------|-----------|
+| Unit | 70% (~350) | 65% (~350) | Add t.Parallel(), keep count but add UI component tests |
+| Integration | 20% (~100) | 28% (~150) | Terratest, mTLS, trace propagation, fair-share, security |
+| E2E/Smoke | 10% (~50) | 7% (~35) | Capped at 10 true E2E + 25 Playwright UI tests |
+
+*End of P2 Review Remediation Addendum*