Implement BMad Must-Have Before Launch fixes for all 6 products

P1: API key redaction, SSE billing leak, token math edge cases, CI runner config
P2: mTLS revocation lockout, terraform state lock recovery, RLS pool leak, entropy scrubber, pgmq visibility
P3: HMAC replay prevention, cross-tenant negative tests, correlation window edge cases, SQS claim-check, free tier
P4: Discovery partial failure recovery, ownership conflict integration test, VCR freshness CI, Meilisearch rebuild, Cmd+K latency
P5: Concurrent baseline conflicts, remediation RBAC, Clock interface for governance, 10K property-based runs, Redis panic fallback
P6: Cryptographic agent update signatures, streaming audit logs with WAL, shell AST parsing (mvdan/sh), intervention deadlock TTL, canary suite CI gate
This commit is contained in:
2026-03-01 02:14:04 +00:00
parent b24cfa7c0d
commit d038cd9c5c
6 changed files with 1305 additions and 0 deletions

View File

@@ -2094,3 +2094,208 @@ Per review recommendation, cap E2E at 10 critical paths. Remaining 40 tests push
| E2E/Smoke | 10% (~50) | 7% (~35) | Capped at 10 true E2E + 25 Playwright UI tests |
*End of P2 Review Remediation Addendum*
---
## 12. BMad Review Implementation (Must-Have Before Launch)
### 12.1 mTLS Revocation — Instant Lockout
```go
// tests/integration/mtls_revocation_test.go
// Verifies that revoking a certificate tears down an agent's already-established
// mTLS session, not just future handshakes: a cached TLS session surviving a
// CRL refresh would let a revoked agent keep streaming data.
func TestRevokedCert_ExistingConnectionDropped(t *testing.T) {
	// Establish a healthy connection first.
	conn := connectAgentWithCert(t, validCert)
	assert.True(t, conn.IsConnected())

	// Revoke the certificate and push the CRL update out to the SaaS side
	// (refresh is expected to land in under 30 seconds).
	revokeCert(t, validCert.SerialNumber)
	triggerCRLRefresh(t)

	// Give the server a moment to enforce the new CRL; the existing
	// session must then be gone.
	time.Sleep(5 * time.Second)
	assert.False(t, conn.IsConnected(),
		"Revoked agent still has active mTLS connection — cached session not cleared")
}
// Verifies the simpler case: after a CRL refresh, a brand-new handshake with
// the revoked certificate is refused outright.
func TestRevokedCert_NewConnectionRejected(t *testing.T) {
	revokeCert(t, validCert.SerialNumber)
	triggerCRLRefresh(t)

	_, dialErr := connectAgentWithCert(t, validCert)
	assert.Error(t, dialErr)
	assert.Contains(t, dialErr.Error(), "certificate revoked")
}
// TestPayloadReplay_RejectedByNonce checks that a previously submitted drift
// report cannot be accepted a second time (nonce-based replay prevention).
// NOTE(review): this assumes captureValidDriftReport returns a report whose
// nonce the server has ALREADY recorded (i.e. the original submission happens
// inside the helper) — confirm; otherwise this is the first submission and a
// 409 would be the wrong expectation.
func TestPayloadReplay_RejectedByNonce(t *testing.T) {
// Capture a legitimate drift report
report := captureValidDriftReport(t)
// Replay it
resp := postDriftReport(t, report)
assert.Equal(t, 409, resp.StatusCode) // Conflict — nonce already used
}
```
### 12.2 Terraform State Lock Recovery on Panic
```go
// tests/integration/remediation_panic_test.go
// Verifies that panic mode aborts an in-flight `terraform apply` cleanly:
// the state lock taken for the apply must be released on abort, otherwise
// every later operation on the stack is blocked behind a dead lock holder.
func TestPanicMode_ReleasesTerraformStateLock(t *testing.T) {
	// Kick off a remediation and wait until terraform is actually applying.
	execID := startRemediation(t, "stack-1", "drift-1")
	waitForState(t, execID, "applying")

	// Sanity check: the apply should be holding the state lock right now.
	held := getTerraformStateLock(t, "stack-1")
	assert.NotNil(t, held, "State lock should be held during apply")

	// Panic mode must make the agent abort the run...
	triggerPanicMode(t)
	waitForState(t, execID, "aborted")

	// ...and the abort path must drop the lock on the way out.
	held = getTerraformStateLock(t, "stack-1")
	assert.Nil(t, held,
		"Terraform state lock not released after panic — infrastructure is now in zombie state")
}
// TestPanicMode_AgentRunsTerraformForceUnlock verifies the fallback path:
// when the state lock cannot be released normally during a panic-mode abort,
// the agent must escalate to `terraform force-unlock`.
func TestPanicMode_AgentRunsTerraformForceUnlock(t *testing.T) {
	// BUG FIX: the execution ID was discarded, so the waitForState call below
	// referenced an undefined `execID` and the file did not compile.
	execID := startRemediation(t, "stack-1", "drift-1")
	waitForState(t, execID, "applying")

	// Simulate a lock that can't be released normally.
	corruptStateLock(t, "stack-1")
	triggerPanicMode(t)

	// The agent's recovery path should have attempted a force-unlock.
	logs := getAgentLogs(t)
	assert.Contains(t, logs, "terraform force-unlock")
}
```
### 12.3 RLS Connection Pool Leak Test
```go
// tests/integration/rls_pool_leak_test.go
// TestPgBouncer_ClearsTenantContext_BetweenRequests verifies that the
// per-request `SET LOCAL app.tenant_id` does not bleed across pooled
// connections: a request from tenant B made right after tenant A must never
// see tenant A's rows.
// NOTE(review): with transaction pooling there is no hard guarantee both
// requests land on the same backend connection — the 100-tenant stress test
// below covers the probabilistic case. Confirm pool settings in CI.
func TestPgBouncer_ClearsTenantContext_BetweenRequests(t *testing.T) {
	t.Parallel()

	// 1. Request as Tenant A — sets SET LOCAL app.tenant_id = 'tenant-a'
	respA := apiRequest(t, tenantAToken, "GET", "/v1/stacks")
	assert.Equal(t, 200, respA.StatusCode)
	stacksA := parseStacks(respA)
	// BUG FIX: stacksA was declared and never used — a compile error in Go.
	// Assert the precondition instead: if tenant A owns no stacks, the leak
	// check below is vacuous and the test proves nothing.
	assert.NotEmpty(t, stacksA,
		"Tenant A must own at least one stack for the leak check to be meaningful")

	// 2. Immediately request as Tenant B on same PgBouncer connection
	respB := apiRequest(t, tenantBToken, "GET", "/v1/stacks")
	assert.Equal(t, 200, respB.StatusCode)
	stacksB := parseStacks(respB)

	// 3. Tenant B must NOT see Tenant A's stacks
	for _, stack := range stacksB {
		assert.NotEqual(t, "tenant-a", stack.TenantID,
			"CRITICAL: Tenant B received Tenant A's data — PgBouncer leaked context")
	}
}
// TestRLS_100ConcurrentTenants_NoLeakage stress-tests row-level security
// under concurrency: 100 tenants query simultaneously and none may receive
// another tenant's rows.
func TestRLS_100ConcurrentTenants_NoLeakage(t *testing.T) {
	const tenants = 100
	var wg sync.WaitGroup
	// BUG FIX: the channel was sized at one violation per tenant, but a single
	// leaked response can contain many foreign stacks; the surplus sends would
	// block forever and deadlock the test (wg.Wait never returns). Use a
	// non-blocking send — once the buffer is full the failure is already known.
	violations := make(chan string, tenants)
	// NOTE(review): helpers such as createTenantToken receive t from these
	// goroutines; per the testing package they must not call t.Fatal/FailNow
	// off the test goroutine — confirm they only use t.Error/t.Log.
	for i := 0; i < tenants; i++ {
		wg.Add(1)
		go func(tenantID string) {
			defer wg.Done()
			token := createTenantToken(t, tenantID)
			resp := apiRequest(t, token, "GET", "/v1/stacks")
			for _, s := range parseStacks(resp) {
				if s.TenantID != tenantID {
					select {
					case violations <- fmt.Sprintf("Tenant %s saw data from %s", tenantID, s.TenantID):
					default: // buffer full — enough evidence recorded already
					}
				}
			}
		}(fmt.Sprintf("tenant-%d", i))
	}
	wg.Wait()
	close(violations)
	// BUG FIX: t.Fatal stopped at the first violation; report every leak so a
	// failing run shows the full blast radius.
	for v := range violations {
		t.Error("CROSS-TENANT LEAK:", v)
	}
}
```
### 12.4 Secret Scrubber Entropy Scanning
```go
// pkg/agent/scrubber/entropy_test.go
// TestEntropyScan_DetectsBase64EncodedAWSKey verifies that base64-wrapping a
// secret does not defeat the scrubber.
func TestEntropyScan_DetectsBase64EncodedAWSKey(t *testing.T) {
	// AWS key base64-encoded inside a JSON block.
	const awsKey = "AKIAIOSFODNN7EXAMPLE"
	encoded := base64.StdEncoding.EncodeToString([]byte(awsKey))
	input := `{"config": "` + encoded + `"}`

	result := scrubber.Scrub(input)
	// BUG FIX: the original asserted the PLAINTEXT key was absent, but the
	// input never contained the plaintext — the test was vacuously green.
	// The meaningful property is that the encoded form is redacted.
	assert.NotContains(t, result, encoded,
		"base64-encoded AWS key survived scrubbing")
	assert.NotContains(t, result, awsKey)
}
// Multi-line PEM material must be redacted as a unit, not line-by-line.
func TestEntropyScan_DetectsMultiLineRSAKey(t *testing.T) {
	pem := `-----BEGIN RSA PRIVATE KEY-----
MIIEpAIBAAKCAQEA0Z3VS5JJcds3xfn/ygWyF8PbnGy5AhJPnUfGqlTlGa...
-----END RSA PRIVATE KEY-----`

	scrubbed := scrubber.Scrub(pem)
	assert.Contains(t, scrubbed, "[REDACTED RSA KEY]")
}
func TestEntropyScan_DetectsHighEntropyCustomToken(t *testing.T) {
// 40-char hex string that looks like a custom API token
token := "a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0"
input := fmt.Sprintf(`{"api_token": "%s"}`, token)
result := scrubber.Scrub(input)
// Shannon entropy > 3.5 bits/char should trigger redaction
assert.NotContains(t, result, token)
}
func TestEntropyScan_DoesNotRedactNormalText(t *testing.T) {
input := `{"message": "Hello world, this is a normal log message"}`
result := scrubber.Scrub(input)
assert.Equal(t, input, result) // No false positives
}
```
### 12.5 pgmq Visibility Timeout for Long Scans
```go
// tests/integration/pgmq_visibility_test.go
// A scan that runs longer than the visibility timeout must keep its message
// invisible to other consumers by periodically extending the timeout —
// otherwise pgmq would redeliver and a second worker would process it twice.
func TestPgmq_LongScan_DoesNotTriggerDuplicateProcessing(t *testing.T) {
	// Simulate a scan that takes ~5 minutes.
	queue := setupPgmqQueue(t)
	queue.Send(t, makeDriftReport("stack-1"))

	// First consumer claims the message with a 2-minute visibility window.
	const visibilitySecs = 120
	claimed := queue.Read(t, visibilitySecs)
	assert.NotNil(t, claimed)

	// Heartbeat the visibility timeout before each window can expire
	// (90s of work per iteration, well inside the 120s window).
	for attempt := 0; attempt < 3; attempt++ {
		time.Sleep(90 * time.Second)
		queue.ExtendVisibility(t, claimed.ID, visibilitySecs)
	}

	// While the first worker still holds the message, a second read must
	// come back empty.
	duplicate := queue.Read(t, visibilitySecs)
	assert.Nil(t, duplicate, "pgmq handed job to second worker while first was still processing")
}
```
*End of P2 BMad Implementation*