Implement BMad Must-Have Before Launch fixes for all 6 products

P1: API key redaction, SSE billing leak, token math edge cases, CI runner config P2: mTLS revocation lockout, terraform state lock recovery, RLS pool leak, entropy scrubber, pgmq visibility P3: HMAC replay prevention, cross-tenant negative tests, correlation window edge cases, SQS claim-check, free tier P4: Discovery partial failure recovery, ownership conflict integration test, VCR freshness CI, Meilisearch rebuild, Cmd+K latency P5: Concurrent baseline conflicts, remediation RBAC, Clock interface for governance, 10K property-based runs, Redis panic fallback P6: Cryptographic agent update signatures, streaming audit logs with WAL, shell AST parsing (mvdan/sh), intervention deadlock TTL, canary suite CI gate
2026-03-01 02:14:04 +00:00
parent b24cfa7c0d
commit d038cd9c5c
6 changed files with 1305 additions and 0 deletions
--- a/products/04-lightweight-idp/test-architecture/test-architecture.md
+++ b/products/04-lightweight-idp/test-architecture/test-architecture.md
@@ -1265,3 +1265,225 @@ def test_meilisearch_index_rebuild_does_not_drop_search():
    # Verify zero-downtime index swapping during mapping updates
    pass
 ```
+
+---
+
+## 12. BMad Review Implementation (Must-Have Before Launch)
+
+### 12.1 Discovery Scan Timeout / Partial Failure Recovery
+
+```python
+# tests/integration/test_discovery_resilience.py
+
+def test_partial_aws_scan_does_not_delete_existing_services():
+    """If AWS scanner times out after discovering 500 of 1000 resources,
+    existing catalog entries must NOT be marked stale or deleted."""
+    
+    # Seed catalog with 1000 services
+    seed_catalog(count=1000)
+    
+    # Simulate scanner timeout after 500 resources
+    with mock_aws_timeout_after(500):
+        result = run_aws_discovery_scan()
+    
+    assert result.status == "partial_failure"
+    assert result.discovered == 500
+    
+    # All 1000 services must still exist in catalog
+    services = catalog_api.list_services()
+    assert len(services) == 1000  # NOT 500
+    
+    # Partial results should be staged, not committed
+    staged = catalog_api.list_staged_updates()
+    assert len(staged) == 500
+
+def test_partial_github_scan_does_not_corrupt_ownership():
+    """If GitHub scanner hits rate limit mid-scan, existing ownership
+    mappings must be preserved."""
+    
+    seed_catalog_with_ownership(count=100)
+    
+    with mock_github_rate_limit_after(50):
+        result = run_github_discovery_scan()
+    
+    assert result.status == "partial_failure"
+    
+    # All 100 ownership mappings intact
+    services = catalog_api.list_services()
+    owned = [s for s in services if s.owner is not None]
+    assert len(owned) == 100  # NOT 50
+
+def test_scan_failure_triggers_alert_not_silent_failure():
+    result = run_aws_discovery_scan_with_invalid_credentials()
+    assert result.status == "failed"
+    
+    # Must alert the admin
+    alerts = get_admin_alerts()
+    assert any("discovery scan failed" in a.message for a in alerts)
+```
+
+### 12.2 Ownership Conflict Resolution Integration Test
+
+```python
+# tests/integration/test_ownership_conflict.py
+
+def test_explicit_config_overrides_implicit_tag():
+    """Explicit (CODEOWNERS/config) > Implicit (AWS tags) > Heuristic (commits)"""
+    
+    # AWS tag says owner is "team-infra"
+    aws_scanner.discover_service("auth-api", owner_tag="team-infra")
+    
+    # GitHub CODEOWNERS says owner is "team-platform"
+    github_scanner.discover_service("auth-api", codeowners="team-platform")
+    
+    # Resolve conflict
+    service = catalog_api.get_service("auth-api")
+    assert service.owner == "team-platform"  # Explicit wins
+    assert service.owner_source == "codeowners"
+
+def test_concurrent_discovery_sources_do_not_race():
+    """Two scanners discovering the same service simultaneously
+    must not create duplicate entries."""
+    
+    import asyncio
+    
+    async def run_both():
+        await asyncio.gather(
+            aws_scanner.discover_service_async("billing-api"),
+            github_scanner.discover_service_async("billing-api"),
+        )
+    
+    asyncio.run(run_both())
+    
+    services = catalog_api.search("billing-api")
+    assert len(services) == 1  # No duplicates
+
+def test_heuristic_ownership_does_not_override_explicit():
+    # Explicit owner set via config
+    catalog_api.set_owner("auth-api", "team-platform", source="config")
+    
+    # Heuristic scanner infers different owner from commit history
+    github_scanner.infer_ownership("auth-api", top_committer="dev@other-team.com")
+    
+    service = catalog_api.get_service("auth-api")
+    assert service.owner == "team-platform"  # Explicit preserved
+```
+
+### 12.3 VCR Cassette Freshness Validation
+
+```yaml
+# .github/workflows/vcr-refresh.yml
+# Weekly job to re-record VCR cassettes against real AWS
+name: VCR Cassette Freshness
+on:
+  schedule:
+    - cron: '0 6 * * 1'  # Every Monday 6 AM UTC
+
+jobs:
+  refresh:
+    runs-on: self-hosted
+    steps:
+      - uses: actions/checkout@v4
+      - name: Re-record cassettes
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.VCR_AWS_KEY }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.VCR_AWS_SECRET }}
+        run: |
+          VCR_RECORD=all pytest tests/integration/scanners/ -v
+      - name: Diff cassettes
+        run: |
+          git diff --stat tests/cassettes/
+          CHANGED=$(git diff --name-only tests/cassettes/ | wc -l)
+          if [ "$CHANGED" -gt 0 ]; then
+            echo "⚠️ $CHANGED cassettes changed — AWS API responses have drifted"
+            echo "Review and commit updated cassettes"
+          fi
+      - name: Create PR if cassettes changed
+        uses: peter-evans/create-pull-request@v6
+        with:
+          title: "chore: refresh VCR cassettes (AWS API drift)"
+          branch: vcr-refresh
+```
+
+### 12.4 Meilisearch Zero-Downtime Index Rebuild
+
+```python
+# tests/integration/test_meilisearch_rebuild.py
+
+def test_search_returns_results_during_index_rebuild():
+    """Cmd+K search must work during index rebuild (zero downtime)."""
+    
+    # Seed index with 100 services
+    meili.index("services").add_documents(make_services(100))
+    meili.index("services").wait_for_pending_update()
+    
+    # Start rebuild (creates services_v2, swaps when ready)
+    rebuild_task = start_index_rebuild()
+    
+    # Search must still work during rebuild
+    results = meili.index("services").search("auth")
+    assert len(results["hits"]) > 0
+    
+    # Wait for rebuild to complete
+    rebuild_task.wait()
+    
+    # Search still works after swap
+    results = meili.index("services").search("auth")
+    assert len(results["hits"]) > 0
+
+def test_index_rebuild_failure_does_not_corrupt_active_index():
+    meili.index("services").add_documents(make_services(50))
+    
+    # Simulate rebuild failure (e.g., OOM during indexing)
+    with mock_meili_oom_during_rebuild():
+        result = start_index_rebuild()
+    
+    assert result.status == "failed"
+    
+    # Active index must be untouched
+    results = meili.index("services").search("billing")
+    assert len(results["hits"]) > 0  # Still works
+```
+
+### 12.5 Cmd+K Search Latency from Redis Cache
+
+```python
+# tests/performance/test_search_latency.py
+
+def test_cmd_k_search_under_10ms_from_redis():
+    """Prefix cache hit must return in <10ms."""
+    
+    # Warm the cache
+    redis_client.set("search:prefix:auth", json.dumps([
+        {"name": "auth-service", "owner": "team-platform"},
+        {"name": "auth-proxy", "owner": "team-infra"},
+    ]))
+    
+    import time
+    start = time.perf_counter_ns()
+    results = search_api.prefix_search("auth")
+    elapsed_ms = (time.perf_counter_ns() - start) / 1_000_000
+    
+    assert elapsed_ms < 10, f"Cmd+K search took {elapsed_ms:.1f}ms — exceeds 10ms SLA"
+    assert len(results) == 2
+```
+
+### 12.6 Free Tier Enforcement (50 Services)
+
+```python
+def test_free_tier_allows_50_services():
+    tenant = create_tenant(tier="free")
+    for i in range(50):
+        resp = catalog_api.create_service(tenant, f"service-{i}")
+        assert resp.status_code == 201
+
+def test_free_tier_rejects_51st_service():
+    tenant = create_tenant(tier="free")
+    seed_services(tenant, count=50)
+    
+    resp = catalog_api.create_service(tenant, "service-51")
+    assert resp.status_code == 403
+    assert "upgrade" in resp.json()["error"].lower()
+```
+
+*End of P4 BMad Implementation*