Implement BMad Must-Have Before Launch fixes for all 6 products

P1: API key redaction, SSE billing leak, token math edge cases, CI runner config
P2: mTLS revocation lockout, terraform state lock recovery, RLS pool leak, entropy scrubber, pgmq visibility
P3: HMAC replay prevention, cross-tenant negative tests, correlation window edge cases, SQS claim-check, free tier
P4: Discovery partial failure recovery, ownership conflict integration test, VCR freshness CI, Meilisearch rebuild, Cmd+K latency
P5: Concurrent baseline conflicts, remediation RBAC, Clock interface for governance, 10K property-based runs, Redis panic fallback
P6: Cryptographic agent update signatures, streaming audit logs with WAL, shell AST parsing (mvdan/sh), intervention deadlock TTL, canary suite CI gate
This commit is contained in:
2026-03-01 02:14:04 +00:00
parent b24cfa7c0d
commit d038cd9c5c
6 changed files with 1305 additions and 0 deletions

View File

@@ -1265,3 +1265,225 @@ def test_meilisearch_index_rebuild_does_not_drop_search():
# Verify zero-downtime index swapping during mapping updates
pass
```
---
## 12. BMad Review Implementation (Must-Have Before Launch)
### 12.1 Discovery Scan Timeout / Partial Failure Recovery
```python
# tests/integration/test_discovery_resilience.py
def test_partial_aws_scan_does_not_delete_existing_services():
"""If AWS scanner times out after discovering 500 of 1000 resources,
existing catalog entries must NOT be marked stale or deleted."""
# Seed catalog with 1000 services
seed_catalog(count=1000)
# Simulate scanner timeout after 500 resources
with mock_aws_timeout_after(500):
result = run_aws_discovery_scan()
assert result.status == "partial_failure"
assert result.discovered == 500
# All 1000 services must still exist in catalog
services = catalog_api.list_services()
assert len(services) == 1000 # NOT 500
# Partial results should be staged, not committed
staged = catalog_api.list_staged_updates()
assert len(staged) == 500
def test_partial_github_scan_does_not_corrupt_ownership():
"""If GitHub scanner hits rate limit mid-scan, existing ownership
mappings must be preserved."""
seed_catalog_with_ownership(count=100)
with mock_github_rate_limit_after(50):
result = run_github_discovery_scan()
assert result.status == "partial_failure"
# All 100 ownership mappings intact
services = catalog_api.list_services()
owned = [s for s in services if s.owner is not None]
assert len(owned) == 100 # NOT 50
def test_scan_failure_triggers_alert_not_silent_failure():
result = run_aws_discovery_scan_with_invalid_credentials()
assert result.status == "failed"
# Must alert the admin
alerts = get_admin_alerts()
assert any("discovery scan failed" in a.message for a in alerts)
```
### 12.2 Ownership Conflict Resolution Integration Test
```python
# tests/integration/test_ownership_conflict.py
def test_explicit_config_overrides_implicit_tag():
"""Explicit (CODEOWNERS/config) > Implicit (AWS tags) > Heuristic (commits)"""
# AWS tag says owner is "team-infra"
aws_scanner.discover_service("auth-api", owner_tag="team-infra")
# GitHub CODEOWNERS says owner is "team-platform"
github_scanner.discover_service("auth-api", codeowners="team-platform")
# Resolve conflict
service = catalog_api.get_service("auth-api")
assert service.owner == "team-platform" # Explicit wins
assert service.owner_source == "codeowners"
def test_concurrent_discovery_sources_do_not_race():
"""Two scanners discovering the same service simultaneously
must not create duplicate entries."""
import asyncio
async def run_both():
await asyncio.gather(
aws_scanner.discover_service_async("billing-api"),
github_scanner.discover_service_async("billing-api"),
)
asyncio.run(run_both())
services = catalog_api.search("billing-api")
assert len(services) == 1 # No duplicates
def test_heuristic_ownership_does_not_override_explicit():
# Explicit owner set via config
catalog_api.set_owner("auth-api", "team-platform", source="config")
# Heuristic scanner infers different owner from commit history
github_scanner.infer_ownership("auth-api", top_committer="dev@other-team.com")
service = catalog_api.get_service("auth-api")
assert service.owner == "team-platform" # Explicit preserved
```
### 12.3 VCR Cassette Freshness Validation
```yaml
# .github/workflows/vcr-refresh.yml
# Weekly job to re-record VCR cassettes against real AWS
name: VCR Cassette Freshness
on:
schedule:
- cron: '0 6 * * 1' # Every Monday 6 AM UTC
jobs:
refresh:
runs-on: self-hosted
steps:
- uses: actions/checkout@v4
- name: Re-record cassettes
env:
AWS_ACCESS_KEY_ID: ${{ secrets.VCR_AWS_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.VCR_AWS_SECRET }}
run: |
VCR_RECORD=all pytest tests/integration/scanners/ -v
- name: Diff cassettes
run: |
git diff --stat tests/cassettes/
CHANGED=$(git diff --name-only tests/cassettes/ | wc -l)
if [ "$CHANGED" -gt 0 ]; then
echo "⚠️ $CHANGED cassettes changed — AWS API responses have drifted"
echo "Review and commit updated cassettes"
fi
- name: Create PR if cassettes changed
uses: peter-evans/create-pull-request@v6
with:
title: "chore: refresh VCR cassettes (AWS API drift)"
branch: vcr-refresh
```
### 12.4 Meilisearch Zero-Downtime Index Rebuild
```python
# tests/integration/test_meilisearch_rebuild.py
def test_search_returns_results_during_index_rebuild():
"""Cmd+K search must work during index rebuild (zero downtime)."""
# Seed index with 100 services
meili.index("services").add_documents(make_services(100))
meili.index("services").wait_for_pending_update()
# Start rebuild (creates services_v2, swaps when ready)
rebuild_task = start_index_rebuild()
# Search must still work during rebuild
results = meili.index("services").search("auth")
assert len(results["hits"]) > 0
# Wait for rebuild to complete
rebuild_task.wait()
# Search still works after swap
results = meili.index("services").search("auth")
assert len(results["hits"]) > 0
def test_index_rebuild_failure_does_not_corrupt_active_index():
meili.index("services").add_documents(make_services(50))
# Simulate rebuild failure (e.g., OOM during indexing)
with mock_meili_oom_during_rebuild():
result = start_index_rebuild()
assert result.status == "failed"
# Active index must be untouched
results = meili.index("services").search("billing")
assert len(results["hits"]) > 0 # Still works
```
### 12.5 Cmd+K Search Latency from Redis Cache
```python
# tests/performance/test_search_latency.py
def test_cmd_k_search_under_10ms_from_redis():
"""Prefix cache hit must return in <10ms."""
# Warm the cache
redis_client.set("search:prefix:auth", json.dumps([
{"name": "auth-service", "owner": "team-platform"},
{"name": "auth-proxy", "owner": "team-infra"},
]))
import time
start = time.perf_counter_ns()
results = search_api.prefix_search("auth")
elapsed_ms = (time.perf_counter_ns() - start) / 1_000_000
assert elapsed_ms < 10, f"Cmd+K search took {elapsed_ms:.1f}ms — exceeds 10ms SLA"
assert len(results) == 2
```
### 12.6 Free Tier Enforcement (50 Services)
```python
def test_free_tier_allows_50_services():
tenant = create_tenant(tier="free")
for i in range(50):
resp = catalog_api.create_service(tenant, f"service-{i}")
assert resp.status_code == 201
def test_free_tier_rejects_51st_service():
tenant = create_tenant(tier="free")
seed_services(tenant, count=50)
resp = catalog_api.create_service(tenant, "service-51")
assert resp.status_code == 403
assert "upgrade" in resp.json()["error"].lower()
```
*End of P4 BMad Implementation*