Implement BMad Must-Have Before Launch fixes for all 6 products
P1: API key redaction, SSE billing leak, token math edge cases, CI runner config
P2: mTLS revocation lockout, Terraform state lock recovery, RLS pool leak, entropy scrubber, pgmq visibility
P3: HMAC replay prevention, cross-tenant negative tests, correlation window edge cases, SQS claim-check, free tier
P4: Discovery partial failure recovery, ownership conflict integration test, VCR freshness CI, Meilisearch rebuild, Cmd+K latency
P5: Concurrent baseline conflicts, remediation RBAC, Clock interface for governance, 10K property-based runs, Redis panic fallback
P6: Cryptographic agent update signatures, streaming audit logs with WAL, shell AST parsing (mvdan/sh), intervention deadlock TTL, canary suite CI gate
This commit is contained in:
@@ -2094,3 +2094,208 @@ Per review recommendation, cap E2E at 10 critical paths. Remaining 40 tests push
|
||||
| E2E/Smoke | 10% (~50) | 7% (~35) | Capped at 10 true E2E + 25 Playwright UI tests |
|
||||
|
||||
*End of P2 Review Remediation Addendum*
|
||||
|
||||
---
|
||||
|
||||
## 12. BMad Review Implementation (Must-Have Before Launch)
|
||||
|
||||
### 12.1 mTLS Revocation — Instant Lockout
|
||||
|
||||
```go
|
||||
// tests/integration/mtls_revocation_test.go
|
||||
|
||||
func TestRevokedCert_ExistingConnectionDropped(t *testing.T) {
|
||||
// 1. Agent connects with valid cert
|
||||
agent := connectAgentWithCert(t, validCert)
|
||||
assert.True(t, agent.IsConnected())
|
||||
|
||||
// 2. Revoke the cert via CRL update
|
||||
revokeCert(t, validCert.SerialNumber)
|
||||
|
||||
// 3. Force CRL refresh on SaaS (< 30 seconds)
|
||||
triggerCRLRefresh(t)
|
||||
|
||||
// 4. Existing connection must be terminated
|
||||
time.Sleep(5 * time.Second)
|
||||
assert.False(t, agent.IsConnected(),
|
||||
"Revoked agent still has active mTLS connection — cached session not cleared")
|
||||
}
|
||||
|
||||
func TestRevokedCert_NewConnectionRejected(t *testing.T) {
|
||||
revokeCert(t, validCert.SerialNumber)
|
||||
triggerCRLRefresh(t)
|
||||
|
||||
_, err := connectAgentWithCert(t, validCert)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "certificate revoked")
|
||||
}
|
||||
|
||||
func TestPayloadReplay_RejectedByNonce(t *testing.T) {
|
||||
// Capture a legitimate drift report
|
||||
report := captureValidDriftReport(t)
|
||||
|
||||
// Replay it
|
||||
resp := postDriftReport(t, report)
|
||||
assert.Equal(t, 409, resp.StatusCode) // Conflict — nonce already used
|
||||
}
|
||||
```
|
||||
|
||||
### 12.2 Terraform State Lock Recovery on Panic
|
||||
|
||||
```go
|
||||
// tests/integration/remediation_panic_test.go
|
||||
|
||||
func TestPanicMode_ReleasesTerraformStateLock(t *testing.T) {
|
||||
// 1. Start a remediation (terraform apply)
|
||||
execID := startRemediation(t, "stack-1", "drift-1")
|
||||
waitForState(t, execID, "applying")
|
||||
|
||||
// 2. Verify state lock is held
|
||||
lockInfo := getTerraformStateLock(t, "stack-1")
|
||||
assert.NotNil(t, lockInfo, "State lock should be held during apply")
|
||||
|
||||
// 3. Trigger panic mode
|
||||
triggerPanicMode(t)
|
||||
|
||||
// 4. Wait for agent to abort
|
||||
waitForState(t, execID, "aborted")
|
||||
|
||||
// 5. State lock MUST be released
|
||||
lockInfo = getTerraformStateLock(t, "stack-1")
|
||||
assert.Nil(t, lockInfo,
|
||||
"Terraform state lock not released after panic — infrastructure is now in zombie state")
|
||||
}
|
||||
|
||||
func TestPanicMode_AgentRunsTerraformForceUnlock(t *testing.T) {
|
||||
// If normal unlock fails, agent must run `terraform force-unlock`
|
||||
startRemediation(t, "stack-1", "drift-1")
|
||||
waitForState(t, execID, "applying")
|
||||
|
||||
// Simulate lock that can't be released normally
|
||||
corruptStateLock(t, "stack-1")
|
||||
triggerPanicMode(t)
|
||||
|
||||
// Agent should attempt force-unlock
|
||||
logs := getAgentLogs(t)
|
||||
assert.Contains(t, logs, "terraform force-unlock")
|
||||
}
|
||||
```
|
||||
|
||||
### 12.3 RLS Connection Pool Leak Test
|
||||
|
||||
```go
|
||||
// tests/integration/rls_pool_leak_test.go
|
||||
|
||||
func TestPgBouncer_ClearsTenantContext_BetweenRequests(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// 1. Request as Tenant A — sets SET LOCAL app.tenant_id = 'tenant-a'
|
||||
respA := apiRequest(t, tenantAToken, "GET", "/v1/stacks")
|
||||
assert.Equal(t, 200, respA.StatusCode)
|
||||
stacksA := parseStacks(respA)
|
||||
|
||||
// 2. Immediately request as Tenant B on same PgBouncer connection
|
||||
respB := apiRequest(t, tenantBToken, "GET", "/v1/stacks")
|
||||
assert.Equal(t, 200, respB.StatusCode)
|
||||
stacksB := parseStacks(respB)
|
||||
|
||||
// 3. Tenant B must NOT see Tenant A's stacks
|
||||
for _, stack := range stacksB {
|
||||
assert.NotEqual(t, "tenant-a", stack.TenantID,
|
||||
"CRITICAL: Tenant B received Tenant A's data — PgBouncer leaked context")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRLS_100ConcurrentTenants_NoLeakage(t *testing.T) {
|
||||
// Stress test: 100 concurrent requests from different tenants
|
||||
var wg sync.WaitGroup
|
||||
violations := make(chan string, 100)
|
||||
|
||||
for i := 0; i < 100; i++ {
|
||||
wg.Add(1)
|
||||
go func(tenantID string) {
|
||||
defer wg.Done()
|
||||
token := createTenantToken(t, tenantID)
|
||||
resp := apiRequest(t, token, "GET", "/v1/stacks")
|
||||
stacks := parseStacks(resp)
|
||||
for _, s := range stacks {
|
||||
if s.TenantID != tenantID {
|
||||
violations <- fmt.Sprintf("Tenant %s saw data from %s", tenantID, s.TenantID)
|
||||
}
|
||||
}
|
||||
}(fmt.Sprintf("tenant-%d", i))
|
||||
}
|
||||
wg.Wait()
|
||||
close(violations)
|
||||
|
||||
for v := range violations {
|
||||
t.Fatal("CROSS-TENANT LEAK:", v)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 12.4 Secret Scrubber Entropy Scanning
|
||||
|
||||
```go
|
||||
// pkg/agent/scrubber/entropy_test.go
|
||||
|
||||
func TestEntropyScan_DetectsBase64EncodedAWSKey(t *testing.T) {
|
||||
// AWS key base64-encoded inside a JSON block
|
||||
input := `{"config": "` + base64.StdEncoding.EncodeToString([]byte("AKIAIOSFODNN7EXAMPLE")) + `"}`
|
||||
result := scrubber.Scrub(input)
|
||||
assert.NotContains(t, result, "AKIAIOSFODNN7EXAMPLE")
|
||||
}
|
||||
|
||||
func TestEntropyScan_DetectsMultiLineRSAKey(t *testing.T) {
|
||||
input := `-----BEGIN RSA PRIVATE KEY-----
|
||||
MIIEpAIBAAKCAQEA0Z3VS5JJcds3xfn/ygWyF8PbnGy5AhJPnUfGqlTlGa...
|
||||
-----END RSA PRIVATE KEY-----`
|
||||
result := scrubber.Scrub(input)
|
||||
assert.Contains(t, result, "[REDACTED RSA KEY]")
|
||||
}
|
||||
|
||||
func TestEntropyScan_DetectsHighEntropyCustomToken(t *testing.T) {
|
||||
// 40-char hex string that looks like a custom API token
|
||||
token := "a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0"
|
||||
input := fmt.Sprintf(`{"api_token": "%s"}`, token)
|
||||
result := scrubber.Scrub(input)
|
||||
// Shannon entropy > 3.5 bits/char should trigger redaction
|
||||
assert.NotContains(t, result, token)
|
||||
}
|
||||
|
||||
func TestEntropyScan_DoesNotRedactNormalText(t *testing.T) {
|
||||
input := `{"message": "Hello world, this is a normal log message"}`
|
||||
result := scrubber.Scrub(input)
|
||||
assert.Equal(t, input, result) // No false positives
|
||||
}
|
||||
```
|
||||
|
||||
### 12.5 pgmq Visibility Timeout for Long Scans
|
||||
|
||||
```go
|
||||
// tests/integration/pgmq_visibility_test.go
|
||||
|
||||
func TestPgmq_LongScan_DoesNotTriggerDuplicateProcessing(t *testing.T) {
|
||||
// Simulate a scan that takes 5 minutes
|
||||
queue := setupPgmqQueue(t)
|
||||
|
||||
// Enqueue a drift report
|
||||
queue.Send(t, makeDriftReport("stack-1"))
|
||||
|
||||
// Consumer 1 picks it up with 2-minute visibility timeout
|
||||
msg := queue.Read(t, 120) // 120s visibility
|
||||
assert.NotNil(t, msg)
|
||||
|
||||
// Simulate long processing (extend visibility)
|
||||
for i := 0; i < 3; i++ {
|
||||
time.Sleep(90 * time.Second)
|
||||
queue.ExtendVisibility(t, msg.ID, 120) // Extend by another 2 min
|
||||
}
|
||||
|
||||
// Consumer 2 should NOT get the same message
|
||||
msg2 := queue.Read(t, 120)
|
||||
assert.Nil(t, msg2, "pgmq handed job to second worker while first was still processing")
|
||||
}
|
||||
```
|
||||
|
||||
*End of P2 BMad Implementation*
|
||||
|
||||
Reference in New Issue
Block a user