Implement BMad Must-Have Before Launch fixes for all 6 products

P1: API key redaction, SSE billing leak, token math edge cases, CI runner config
P2: mTLS revocation lockout, terraform state lock recovery, RLS pool leak, entropy scrubber, pgmq visibility
P3: HMAC replay prevention, cross-tenant negative tests, correlation window edge cases, SQS claim-check, free tier
P4: Discovery partial failure recovery, ownership conflict integration test, VCR freshness CI, Meilisearch rebuild, Cmd+K latency
P5: Concurrent baseline conflicts, remediation RBAC, Clock interface for governance, 10K property-based runs, Redis panic fallback
P6: Cryptographic agent update signatures, streaming audit logs with WAL, shell AST parsing (mvdan/sh), intervention deadlock TTL, canary suite CI gate
This commit is contained in:
2026-03-01 02:14:04 +00:00
parent b24cfa7c0d
commit d038cd9c5c
6 changed files with 1305 additions and 0 deletions

View File

@@ -2094,3 +2094,208 @@ Per review recommendation, cap E2E at 10 critical paths. Remaining 40 tests push
| E2E/Smoke | 10% (~50) | 7% (~35) | Capped at 10 true E2E + 25 Playwright UI tests |
*End of P2 Review Remediation Addendum*
---
## 12. BMad Review Implementation (Must-Have Before Launch)
### 12.1 mTLS Revocation — Instant Lockout
```go
// tests/integration/mtls_revocation_test.go
// Verifies that revoking a certificate tears down an agent's already-established
// mTLS session, not just future handshakes: a cached TLS session surviving a
// CRL refresh would let a revoked agent keep streaming data.
func TestRevokedCert_ExistingConnectionDropped(t *testing.T) {
	// Establish a healthy connection first.
	conn := connectAgentWithCert(t, validCert)
	assert.True(t, conn.IsConnected())

	// Revoke the certificate and push the CRL update out to the SaaS side
	// (refresh is expected to land in under 30 seconds).
	revokeCert(t, validCert.SerialNumber)
	triggerCRLRefresh(t)

	// Give the server a moment to enforce the new CRL; the existing
	// session must then be gone.
	time.Sleep(5 * time.Second)
	assert.False(t, conn.IsConnected(),
		"Revoked agent still has active mTLS connection — cached session not cleared")
}
// Verifies the simpler case: after a CRL refresh, a brand-new handshake with
// the revoked certificate is refused outright.
func TestRevokedCert_NewConnectionRejected(t *testing.T) {
	revokeCert(t, validCert.SerialNumber)
	triggerCRLRefresh(t)

	_, dialErr := connectAgentWithCert(t, validCert)
	assert.Error(t, dialErr)
	assert.Contains(t, dialErr.Error(), "certificate revoked")
}
// TestPayloadReplay_RejectedByNonce checks that a previously submitted drift
// report cannot be accepted a second time (nonce-based replay prevention).
// NOTE(review): this assumes captureValidDriftReport returns a report whose
// nonce the server has ALREADY recorded (i.e. the original submission happens
// inside the helper) — confirm; otherwise this is the first submission and a
// 409 would be the wrong expectation.
func TestPayloadReplay_RejectedByNonce(t *testing.T) {
// Capture a legitimate drift report
report := captureValidDriftReport(t)
// Replay it
resp := postDriftReport(t, report)
assert.Equal(t, 409, resp.StatusCode) // Conflict — nonce already used
}
```
### 12.2 Terraform State Lock Recovery on Panic
```go
// tests/integration/remediation_panic_test.go
// Verifies that panic mode aborts an in-flight `terraform apply` cleanly:
// the state lock taken for the apply must be released on abort, otherwise
// every later operation on the stack is blocked behind a dead lock holder.
func TestPanicMode_ReleasesTerraformStateLock(t *testing.T) {
	// Kick off a remediation and wait until terraform is actually applying.
	execID := startRemediation(t, "stack-1", "drift-1")
	waitForState(t, execID, "applying")

	// Sanity check: the apply should be holding the state lock right now.
	held := getTerraformStateLock(t, "stack-1")
	assert.NotNil(t, held, "State lock should be held during apply")

	// Panic mode must make the agent abort the run...
	triggerPanicMode(t)
	waitForState(t, execID, "aborted")

	// ...and the abort path must drop the lock on the way out.
	held = getTerraformStateLock(t, "stack-1")
	assert.Nil(t, held,
		"Terraform state lock not released after panic — infrastructure is now in zombie state")
}
// TestPanicMode_AgentRunsTerraformForceUnlock verifies the fallback path:
// when the state lock cannot be released normally during a panic-mode abort,
// the agent must escalate to `terraform force-unlock`.
func TestPanicMode_AgentRunsTerraformForceUnlock(t *testing.T) {
	// BUG FIX: the execution ID was discarded, so the waitForState call below
	// referenced an undefined `execID` and the file did not compile.
	execID := startRemediation(t, "stack-1", "drift-1")
	waitForState(t, execID, "applying")

	// Simulate a lock that can't be released normally.
	corruptStateLock(t, "stack-1")
	triggerPanicMode(t)

	// The agent's recovery path should have attempted a force-unlock.
	logs := getAgentLogs(t)
	assert.Contains(t, logs, "terraform force-unlock")
}
```
### 12.3 RLS Connection Pool Leak Test
```go
// tests/integration/rls_pool_leak_test.go
// TestPgBouncer_ClearsTenantContext_BetweenRequests verifies that the
// per-request `SET LOCAL app.tenant_id` does not bleed across pooled
// connections: a request from tenant B made right after tenant A must never
// see tenant A's rows.
// NOTE(review): with transaction pooling there is no hard guarantee both
// requests land on the same backend connection — the 100-tenant stress test
// below covers the probabilistic case. Confirm pool settings in CI.
func TestPgBouncer_ClearsTenantContext_BetweenRequests(t *testing.T) {
	t.Parallel()

	// 1. Request as Tenant A — sets SET LOCAL app.tenant_id = 'tenant-a'
	respA := apiRequest(t, tenantAToken, "GET", "/v1/stacks")
	assert.Equal(t, 200, respA.StatusCode)
	stacksA := parseStacks(respA)
	// BUG FIX: stacksA was declared and never used — a compile error in Go.
	// Assert the precondition instead: if tenant A owns no stacks, the leak
	// check below is vacuous and the test proves nothing.
	assert.NotEmpty(t, stacksA,
		"Tenant A must own at least one stack for the leak check to be meaningful")

	// 2. Immediately request as Tenant B on same PgBouncer connection
	respB := apiRequest(t, tenantBToken, "GET", "/v1/stacks")
	assert.Equal(t, 200, respB.StatusCode)
	stacksB := parseStacks(respB)

	// 3. Tenant B must NOT see Tenant A's stacks
	for _, stack := range stacksB {
		assert.NotEqual(t, "tenant-a", stack.TenantID,
			"CRITICAL: Tenant B received Tenant A's data — PgBouncer leaked context")
	}
}
// TestRLS_100ConcurrentTenants_NoLeakage stress-tests row-level security
// under concurrency: 100 tenants query simultaneously and none may receive
// another tenant's rows.
func TestRLS_100ConcurrentTenants_NoLeakage(t *testing.T) {
	const tenants = 100
	var wg sync.WaitGroup
	// BUG FIX: the channel was sized at one violation per tenant, but a single
	// leaked response can contain many foreign stacks; the surplus sends would
	// block forever and deadlock the test (wg.Wait never returns). Use a
	// non-blocking send — once the buffer is full the failure is already known.
	violations := make(chan string, tenants)
	// NOTE(review): helpers such as createTenantToken receive t from these
	// goroutines; per the testing package they must not call t.Fatal/FailNow
	// off the test goroutine — confirm they only use t.Error/t.Log.
	for i := 0; i < tenants; i++ {
		wg.Add(1)
		go func(tenantID string) {
			defer wg.Done()
			token := createTenantToken(t, tenantID)
			resp := apiRequest(t, token, "GET", "/v1/stacks")
			for _, s := range parseStacks(resp) {
				if s.TenantID != tenantID {
					select {
					case violations <- fmt.Sprintf("Tenant %s saw data from %s", tenantID, s.TenantID):
					default: // buffer full — enough evidence recorded already
					}
				}
			}
		}(fmt.Sprintf("tenant-%d", i))
	}
	wg.Wait()
	close(violations)
	// BUG FIX: t.Fatal stopped at the first violation; report every leak so a
	// failing run shows the full blast radius.
	for v := range violations {
		t.Error("CROSS-TENANT LEAK:", v)
	}
}
```
### 12.4 Secret Scrubber Entropy Scanning
```go
// pkg/agent/scrubber/entropy_test.go
// TestEntropyScan_DetectsBase64EncodedAWSKey verifies that base64-wrapping a
// secret does not defeat the scrubber.
func TestEntropyScan_DetectsBase64EncodedAWSKey(t *testing.T) {
	// AWS key base64-encoded inside a JSON block.
	const awsKey = "AKIAIOSFODNN7EXAMPLE"
	encoded := base64.StdEncoding.EncodeToString([]byte(awsKey))
	input := `{"config": "` + encoded + `"}`

	result := scrubber.Scrub(input)
	// BUG FIX: the original asserted the PLAINTEXT key was absent, but the
	// input never contained the plaintext — the test was vacuously green.
	// The meaningful property is that the encoded form is redacted.
	assert.NotContains(t, result, encoded,
		"base64-encoded AWS key survived scrubbing")
	assert.NotContains(t, result, awsKey)
}
// Multi-line PEM material must be redacted as a unit, not line-by-line.
func TestEntropyScan_DetectsMultiLineRSAKey(t *testing.T) {
	pem := `-----BEGIN RSA PRIVATE KEY-----
MIIEpAIBAAKCAQEA0Z3VS5JJcds3xfn/ygWyF8PbnGy5AhJPnUfGqlTlGa...
-----END RSA PRIVATE KEY-----`

	scrubbed := scrubber.Scrub(pem)
	assert.Contains(t, scrubbed, "[REDACTED RSA KEY]")
}
func TestEntropyScan_DetectsHighEntropyCustomToken(t *testing.T) {
// 40-char hex string that looks like a custom API token
token := "a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0"
input := fmt.Sprintf(`{"api_token": "%s"}`, token)
result := scrubber.Scrub(input)
// Shannon entropy > 3.5 bits/char should trigger redaction
assert.NotContains(t, result, token)
}
func TestEntropyScan_DoesNotRedactNormalText(t *testing.T) {
input := `{"message": "Hello world, this is a normal log message"}`
result := scrubber.Scrub(input)
assert.Equal(t, input, result) // No false positives
}
```
### 12.5 pgmq Visibility Timeout for Long Scans
```go
// tests/integration/pgmq_visibility_test.go
// A scan that runs longer than the visibility timeout must keep its message
// invisible to other consumers by periodically extending the timeout —
// otherwise pgmq would redeliver and a second worker would process it twice.
func TestPgmq_LongScan_DoesNotTriggerDuplicateProcessing(t *testing.T) {
	// Simulate a scan that takes ~5 minutes.
	queue := setupPgmqQueue(t)
	queue.Send(t, makeDriftReport("stack-1"))

	// First consumer claims the message with a 2-minute visibility window.
	const visibilitySecs = 120
	claimed := queue.Read(t, visibilitySecs)
	assert.NotNil(t, claimed)

	// Heartbeat the visibility timeout before each window can expire
	// (90s of work per iteration, well inside the 120s window).
	for attempt := 0; attempt < 3; attempt++ {
		time.Sleep(90 * time.Second)
		queue.ExtendVisibility(t, claimed.ID, visibilitySecs)
	}

	// While the first worker still holds the message, a second read must
	// come back empty.
	duplicate := queue.Read(t, visibilitySecs)
	assert.Nil(t, duplicate, "pgmq handed job to second worker while first was still processing")
}
```
*End of P2 BMad Implementation*